sheepdog-0.8.3/.gitignore

#
# Normal rules
#
.*
*.o
*.o.*
*.a
*.s
*.ko
*.so
*.mod.c
*.i
*.lst
*.symtypes
*.d
*.orig
*.rej
cscope.*
*.gcda
*.gcno
*.info

#
# for GLOBAL
#
GTAGS
GRTAGS
GPATH
GSYMS

#
# programs
#
dog/dog
sheep/sheep
sheepfs/sheepfs
shepherd/shepherd
tools/zk_control
tests/unit/dog/test_common
tests/unit/sheep/test_vdi
tests/unit/sheep/test_cluster_driver

# directories
.deps
autom4te.cache
coverage

aclocal.m4
Makefile
Makefile.in
INSTALL
config.log
config.status
config.guess
config.sub
configure
depcomp
install-sh
missing
push
stamp-h1
libtool
ltmain.sh
config.h
config.h.in
script/generic
tests/check.log
tests/check.time
tests/atconfig
tests/*.out.bad
*.patch
man/sheep.8
man/dog.8
man/sheepfs.8
*.deb

sheepdog-0.8.3/CHANGELOG.md

## 0.8.0

NEW FEATURE:
- hyper volume: use a B-tree structure to replace the index array in sd_inode,
  so the maximum size of a vdi can extend from 4TB to 16PB.
- Erasure Code: a new redundancy scheme that uses an error-correction
  algorithm to achieve high availability of data with much less storage
  overhead than complete replication.
- HTTP simple storage: a new interface to store and retrieve any amount of
  data with a simple web services interface.

DOG COMMAND INTERFACE:
- new subcommand "vdi cache purge" for cleaning stale object cache
  - "vdi cache purge" cleans stale cache of all images
  - "vdi cache purge <image>" cleans stale cache of the specified image
- new subcommand "node stat" for showing I/O status of the node
- new subcommand "node log level" for changing the log level at runtime
  - "node log level set" sets the log level of a running sheep process
  - "node log level get" gets the log level from a running sheep process
  - "node log level list" lists available log levels
- new option "-o" of "vdi track", for tracking objects with their oids
- new option "-y" of "vdi create", for creating a hyper-volume vdi
- new option "-s" of "cluster info", for showing backend store information
- new option "-t" of "cluster format", to choose not to serve write requests
  when the number of nodes is insufficient
- modified option "-c" of "vdi create": we can specify "x:y" for erasure code
- new subcommand "node stat" for node request statistics
  - "node stat -w" sets watch mode for this command

SHEEP COMMAND INTERFACE:
- improvements of help messages
- changed the size format of the -j (journaling) and -w (object cache)
  options. The new format is: n[TtGgMmKkb], e.g. "-j size=1024M".
- rotate the log when the sheep process catches a SIGHUP signal
- removed the "-o" option for choosing stdout as an output of the log
- removed the "-f" option for executing sheep as a foreground process
  - "-o" and "-f" are the same thing as "-l dst=stdout"
- unified "-l" option
  - "-l format=..." for log format
  - "-l level=..." for log level
  - "-l dst=..." for log destination
- new option "-r" to enable the http service
- modified option "-c" of "cluster format": we can specify "x:y" for erasure code
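For example, an illustrative sketch of the new "x:y" syntax (the vdi name and
sizes below are not from this release's documentation, and "x:y" is assumed
to mean x data blocks plus y parity blocks per object):

   $ dog cluster format -c 4:2          # cluster-wide erasure-coded default
   $ dog vdi create -c 4:2 test 100G    # per-vdi erasure-coded redundancy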
sheepdog-0.8.3/COPYING

		    GNU GENERAL PUBLIC LICENSE
		       Version 2, June 1991

 Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

			    Preamble

  The licenses for most software are designed to take away your freedom to
share and change it.  By contrast, the GNU General Public License is intended
to guarantee your freedom to share and change free software--to make sure the
software is free for all its users.  This General Public License applies to
most of the Free Software Foundation's software and to any other program
whose authors commit to using it.  (Some other Free Software Foundation
software is covered by the GNU Lesser General Public License instead.)  You
can apply it to your programs, too.

  When we speak of free software, we are referring to freedom, not price.
Our General Public Licenses are designed to make sure that you have the
freedom to distribute copies of free software (and charge for this service if
you wish), that you receive source code or can get it if you want it, that
you can change the software or use pieces of it in new free programs; and
that you know you can do these things.

  To protect your rights, we need to make restrictions that forbid anyone to
deny you these rights or to ask you to surrender the rights.  These
restrictions translate to certain responsibilities for you if you distribute
copies of the software, or if you modify it.

  For example, if you distribute copies of such a program, whether gratis or
for a fee, you must give the recipients all the rights that you have.  You
must make sure that they, too, receive or can get the source code.  And you
must show them these terms so they know their rights.

  We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.

  Also, for each author's protection and ours, we want to make certain that
everyone understands that there is no warranty for this free software.  If
the software is modified by someone else and passed on, we want its
recipients to know that what they have is not the original, so that any
problems introduced by others will not reflect on the original authors'
reputations.

  Finally, any free program is threatened constantly by software patents.  We
wish to avoid the danger that redistributors of a free program will
individually obtain patent licenses, in effect making the program
proprietary.  To prevent this, we have made it clear that any patent must be
licensed for everyone's free use or not licensed at all.

  The precise terms and conditions for copying, distribution and modification
follow.

		    GNU GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. This License applies to any program or other work which contains a
notice placed by the copyright holder saying it may be distributed under the
terms of this General Public License.
The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. 
In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. 
You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. 
  <signature of Ty Coon>, 1 April 1989
  Ty Coon, President of Vice

This General Public License does not permit incorporating your program into
proprietary programs.  If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library.  If this is what you want to do, use the GNU Lesser General Public
License instead of this License.

sheepdog-0.8.3/INSTALL

===========================
Sheepdog Installation Guide
===========================

Run-time dependencies
---------------------
* Three or more x86-64 machines.
* The corosync and corosync lib packages, or the zookeeper equivalent
* QEMU 0.13 or later
* liburcu

Compile-time dependencies
-------------------------
* GNU Autotools
* corosync devel package
* liburcu devel package
* git
* optional: fuse-devel (for sheepfs)
* optional: libzookeeper-mt-dev (for zookeeper support)

Installing from source
----------------------

1. Compile or install the Corosync packages:

   Nearly every modern Linux distribution has x86_64 corosync binaries
   pre-built and available via their repositories. We recommend you use
   these packages if they are available on your distribution.

   For Debian package based systems:
   $ sudo aptitude install corosync libcorosync-dev

   For RPM package based systems:
   $ sudo yum install corosynclib-devel

   For EL6 (RHEL, CentOS, SL, etc.), the provided version of corosync is
   too old, and you must install corosync from source.

2. Download, build and install QEMU with Sheepdog support:

   QEMU 0.13 or later provides built-in support for sheepdog devices. Some
   distributions provide pre-built versions of this newer version of QEMU.
   If your distribution has an older version of QEMU or you prefer to
   compile from source, retrieve the latest QEMU and compile:

   $ git clone git://git.qemu.org/qemu.git
   $ cd qemu
   $ ./configure
   $ sudo make install

3. Download, build and install the Sheepdog server and command line tools:

   $ git clone git://github.com/sheepdog/sheepdog.git
   $ cd sheepdog
   $ ./autogen.sh
   $ ./configure
   $ sudo make install

   If you want built-in sheepfs and zookeeper support, try:

   $ ./configure --enable-zookeeper --enable-sheepfs

   Please note that sheepdog supports a "make rpm" target which will
   generate an rpm package that can be installed on the local machine. To
   use this installation method, run the following in the sheepdog source
   directory:

   $ make rpm
   $ sudo rpm -ivh x86_64/sheepdog-0.*

Please read the README file and the sheep(8), dog(8) or sheepfs(8) man pages
for further usage instructions.

===============================================================================
Copyright (C) 2009-2011, Nippon Telegraph and Telephone Corporation.

sheepdog-0.8.3/MAINTAINERS

Sheepdog Maintainers
====================

The intention of this file is not to establish who owns what portions of the
code base, but to provide a set of names that developers can consult when
they have a question about a particular subset, and also to provide a set of
names to be CC'd when submitting a patch to obtain appropriate review.

In general, if you have a question about inclusion of a patch, you should
consult the sheepdog development list and not any specific individual
privately.
Descriptions of section entries:

	M: Mail patches to: FullName <address@domain>
	L: Mailing list that is relevant to this area
	B: Branches with wildcard patterns
	F: Files and directories with wildcard patterns

Sheepdog Overall
------------------------------
M: MORITA Kazutaka
M: Liu Yuan
B: sheepdog/*
L: sheepdog@lists.wpkg.org

Sheepdog Stable Branches
------------------------------
M: Hitoshi Mitake
B: sheepdog/stable-*
L: sheepdog@lists.wpkg.org

QEMU Driver
------------------------------
M: MORITA Kazutaka
M: Liu Yuan
F: qemu/block/sheepdog.c
L: qemu-devel@nongnu.org

iSCSI Target Driver
------------------------------
M: Hitoshi Mitake
F: tgt/usr/bs_sheepdog.c
L: stgt@vger.kernel.org

Libvirt Driver
------------------------------
M: Sebastian Wiedenroth
F: libvirt/storage/storage_backend_sheepdog.c
L: libvir-list@redhat.com

Openstack Drivers
------------------------------
M: MORITA Kazutaka
M: Liu Yuan
F: openstack/glance/store/sheepdog.py
F: openstack/cinder/volume/drivers/sheepdog.py
L: openstack-dev@lists.openstack.org

sheepdog-0.8.3/Makefile.am

SPEC = $(PACKAGE_NAME).spec
TARFILE = $(PACKAGE_NAME)-$(VERSION).tar.gz

EXTRA_DIST = autogen.sh

AUTOMAKE_OPTIONS = foreign

MAINTAINERCLEANFILES = Makefile.in aclocal.m4 configure depcomp \
	config.guess config.sub missing install-sh \
	autoheader automake autoconf config.status \
	config.log

sheepdogsysconfdir = ${SHEEPDOGCONFDIR}
sheepdogsysconf_DATA =

SUBDIRS = lib dog sheep include script shepherd tools

if BUILD_SHEEPFS
SUBDIRS += sheepfs
endif

SUBDIRS += man

if BUILD_UNITTEST
SUBDIRS += tests/unit
endif

install-exec-local:
	$(INSTALL) -d $(DESTDIR)/${localstatedir}/lib/sheepdog

uninstall-local:
	rmdir $(DESTDIR)/${localstatedir}/lib/sheepdog || :;

dist-clean-local:
	rm -f autoconf automake autoheader

clean-generic:
	rm -rf $(SPEC) $(TARFILE) cscope*
	find -name '*.orig' -or -name '*.rej' | xargs rm -f
	find -name '*.gcno' -or -name '*.gcda' -or -name '*.info' | xargs rm -f

cscope:
	@echo create cscope.out
	@find -name '*.[chS]' > cscope.files
	@cscope -bq

$(SPEC): $(SPEC).in
	rm -f $@-t $@
	LC_ALL=C date="$(shell date "+%a %b %d %Y")" && \
	sed \
		-e "s#@version@#$(VERSION)#g" \
		-e "s#@date@#$$date#g" \
		$< > $@-t
	chmod a-w $@-t
	mv $@-t $@

RPMBUILDOPTS = --define "_sourcedir $(abs_builddir)" \
	--define "_specdir $(abs_builddir)" \
	--define "_builddir $(abs_builddir)" \
	--define "_srcrpmdir $(abs_builddir)" \
	--define "_rpmdir $(abs_builddir)"

RPMBUILD_CONFIG_OPTS =
RPMBUILD_REQ_OPTS =
RPMBUILD_BUILD_REQ_OPTS =

if BUILD_ZOOKEEPER
RPMBUILD_CONFIG_OPTS += --enable-zookeeper
RPMBUILD_REQ_OPTS += zookeeper
RPMBUILD_BUILD_REQ_OPTS += zookeeper-lib-devel
endif

if BUILD_SHEEPFS
RPMBUILD_CONFIG_OPTS += --enable-sheepfs
RPMBUILD_REQ_OPTS += fuse
RPMBUILD_BUILD_REQ_OPTS += fuse-devel
endif

RPMBUILDOPTS += --define "_configopts $(RPMBUILD_CONFIG_OPTS)"
RPMBUILDOPTS += --define "_requires $(RPMBUILD_REQ_OPTS)"
RPMBUILDOPTS += --define "_buildrequires $(RPMBUILD_BUILD_REQ_OPTS)"

$(TARFILE):
	$(MAKE) dist

srpm: clean
	$(MAKE) $(SPEC) $(TARFILE)
	rpmbuild $(RPMBUILDOPTS) --nodeps -bs $(SPEC)

rpm: clean
	$(MAKE) $(SPEC) $(TARFILE)
	rpmbuild $(RPMBUILDOPTS) -ba $(SPEC)

deb:
	fakeroot ./debian/rules clean
	@if [ -d .git ]; then \
		git log > debian/CHANGELOG; \
	else \
		echo "This package doesn't contain a CHANGELOG because it is built from a raw source archive." \
			> debian/CHANGELOG; \
	fi
	rm -f debian/changelog
	dch -v $(shell echo $(PACKAGE_VERSION) | sed s/_/+/ | sed s/_/./g)-1 \
		--package sheepdog --create 'Local build'
	fakeroot ./debian/rules binary

CGCC=cgcc
CGCC_CFLAGS=-Wbitwise -Wno-return-void $(ARCH) -fno-common

sparse: ARCH=$(shell sh script/checkarch.sh)
sparse:
	$(MAKE) CC=$(CGCC) CFLAGS="$(CFLAGS) $(CGCC_CFLAGS)"

CHECK_STYLE=../script/checkpatch.pl -f --no-summary --terse

check-style:
	@for dir in lib dog sheep include sheepfs; do \
		make -C $$dir check-style CHECK_STYLE="$(CHECK_STYLE)"; \
	done

check-unused:
	@find -name '*.o' -exec nm -o {} \; | grep -v '^./lib' | grep ' U ' | \
		awk '{print $$3;}' | sort -u > /tmp/sd_used
	@find -name '*.o' -exec nm -o {} \; | grep -v '^./lib' | grep ' T ' | \
		awk '{print $$3;}' > /tmp/sd_defined
	@while read func;do if ! grep -Fq $$func /tmp/sd_used;then \
		echo $$func; fi; done < /tmp/sd_defined

if BUILD_COVERAGE
coverage: clean check
	@rm -rf coverage
	@for dir in dog sheep tests/unit/dog tests/unit/sheep ; do\
		$(MAKE) -C $$dir coverage; \
	done
	@lcov -a dog/dog.info -a sheep/sheep.info \
		-a tests/unit/dog/dog.info -a tests/unit/sheep/sheep.info \
		-o sheep.info && \
	lcov -r sheep.info /usr/include/\* -o sheep.info && \
	lcov -r sheep.info tests/unit/\* -o sheep.info && \
	genhtml sheep.info -o coverage
endif

sheepdog-0.8.3/README

Sheepdog: Distributed Storage System for KVM
============================================

Overview
--------
Sheepdog is a distributed storage system for QEMU. It provides highly
available block-level storage volumes to virtual machines. Sheepdog supports
advanced volume management features such as snapshot, cloning, and thin
provisioning.

Sheepdog is Open Source software, released under the terms of the GNU
General Public License version 2.

For the latest information about Sheepdog, please visit our website at:

   http://sheepdog.github.io/sheepdog/

And (recommended for newcomers) our wiki at:

   https://github.com/sheepdog/sheepdog/wiki/

Requirements
------------
* Three or more x86-64 machines
* Corosync cluster engine

Install
-------
Please read the INSTALL file distributed with this package for detailed
instructions on installing or compiling from source.

Usage
-----

* Cluster Management Backends

   Sheepdog uses a cluster management backend to manage membership and to
   broadcast messages to the cluster nodes.

   For now, sheepdog can use the local driver (for development on a single
   box), corosync (the default), zookeeper, and Accord.

* Local Driver

   This driver just makes use of the UNIX IPC mechanism to manage membership
   on a single box, where we start multiple 'sheep' processes to simulate a
   cluster. It is very easy and fast to set up, and is especially useful for
   testing functionality without involving any other software.

   To set up a 3-node cluster using the local driver as a bash one-liner,
   with debug mode:

   $ mkdir /path/to/store
   $ for i in 0 1 2; do sheep -c local -d /path/to/store/$i -z $i -p 700$i;sleep 1;done

* Configure corosync.

   Nearly every modern Linux distribution has x86_64 corosync binaries
   pre-built and available via their repositories. We recommend you use
   these packages if they are available on your distribution.

   For Debian package based systems:
   $ sudo aptitude install corosync libcorosync-dev

   For RPM package based systems:
   $ sudo yum install corosynclib-devel

   See our wiki and the corosync(8) and corosync.conf(5) man pages for
   further details.

* Setup Sheepdog

   1. Launch sheepdog on each machine of the cluster.
   $ sheep /store_dir

   Notes:
	/store_dir is a directory to store objects. The directory must be on
	a filesystem with xattr support. In the case of ext3, you need to
	add 'user_xattr' to the mount options.

	$ sudo mount -o remount,user_xattr /store_device

   2. Make fs

   $ dog cluster format --copies=3

   --copies specifies the default number of data copies. In this case, the
   replicated data is stored on three machines.

   3. Check cluster state

   The following list shows that Sheepdog is running on 32 nodes.

   $ dog node list
      Idx	Node id (FNV-1a) - Host:Port
   ------------------------------------------------
        0	0308164db75cff7e - 10.68.13.15:7000	*
        1	03104d8b4315c8e4 - 10.68.13.1:7000
        2	0ab18c565bc14aea - 10.68.13.3:7000
        3	0c0d27f0ac395f5d - 10.68.13.16:7000
        4	127ee4802991f308 - 10.68.13.13:7000
        5	135ff2beab2a9809 - 10.68.14.5:7000
        6	17bd6240eab65870 - 10.68.14.4:7000
        7	1cf35757cbf47d7b - 10.68.13.10:7000
        8	1df9580b8960a992 - 10.68.13.11:7000
        9	29307d3fa5a04f78 - 10.68.14.12:7000
       10	29dcb3474e31d4f3 - 10.68.14.15:7000
       11	29e089c98dd2a144 - 10.68.14.16:7000
       12	2a118b7e2738f479 - 10.68.13.4:7000
       13	3d6aea26ba79d75f - 10.68.13.6:7000
       14	42f9444ead801767 - 10.68.14.11:7000
       15	562c6f38283d09fe - 10.68.14.2:7000
       16	5dd5e540cca1556a - 10.68.14.6:7000
       17	6c12a5d10f10e291 - 10.68.14.13:7000
       18	6dae1d955ca72d96 - 10.68.13.7:7000
       19	711db0f5fa40b412 - 10.68.14.14:7000
       20	7c6b95212ee7c085 - 10.68.14.9:7000
       21	7d010c31bf11df73 - 10.68.13.2:7000
       22	82c43e908b1f3f01 - 10.68.13.12:7000
       23	931d2de0aaf61cf5 - 10.68.13.8:7000
       24	961d9d391e6021e7 - 10.68.13.14:7000
       25	9a3ef6fa1081026c - 10.68.13.9:7000
       26	b0b3d300fed8bc26 - 10.68.14.10:7000
       27	b0f08fb98c8f5edc - 10.68.14.8:7000
       28	b9cc316dc5aba880 - 10.68.13.5:7000
       29	d9eda1ec29c2eeeb - 10.68.14.7:7000
       30	e53cebb2617c86fd - 10.68.14.1:7000
       31	ea46913c4999ccdf - 10.68.14.3:7000

* Create a virtual machine image

   1. Create a 256 GB virtual machine image for Alice.

   $ qemu-img create sheepdog:Alice 256G

   2. You can also convert existing KVM images to Sheepdog ones.

   $ qemu-img convert ~/amd64.raw sheepdog:Bob

   3. List Sheepdog images with the following command.

   $ dog vdi list
      name        id    size    used  shared    creation time   object id
   --------------------------------------------------------------------
      Bob          0  2.0 GB  1.6 GB  0.0 MB 2010-03-23 16:16      80000
      Alice        0  256 GB  0.0 MB  0.0 MB 2010-03-23 16:16      40000

* Boot the virtual machine

   1. Boot the virtual machine.

   $ qemu-system-x86_64 -hda sheepdog:Alice

   2. The following command checks which images are in use.

   $ dog vm list
   Name            |Vdi size |Allocated| Shared  | Status
   ----------------+---------+---------+---------+------------
   Bob             |   2.0 GB|   1.6 GB|   0.0 MB| running on xx.xx.xx.xx
   Alice           |   256 GB|   0.0 MB|   0.0 MB| not running

* Snapshot

   1. Take a snapshot.

   $ qemu-img snapshot -c name sheepdog:Alice

   The -c flag is currently meaningless.

   2. After taking a snapshot, a new virtual machine image is added as a
   non-current image.

   $ dog vdi list
      name        id    size    used  shared    creation time   object id
   --------------------------------------------------------------------
      Bob          0  2.0 GB  1.6 GB  0.0 MB 2010-03-23 16:16      80000
      Alice        0  256 GB  0.0 MB  0.0 MB 2010-03-23 16:21      c0000
    s Alice        1  256 GB  0.0 MB  0.0 MB 2010-03-23 16:16      40000

   3. You can boot from the snapshot image by specifying the tag id.

   $ qemu-system-x86_64 -hda sheepdog:Alice:1

* Cloning from the snapshot

   1. Create a Charlie image as a clone of Alice's image.

   $ qemu-img create -b sheepdog:Alice:1 sheepdog:Charlie

   2. Charlie's image is added to the virtual machine list.
   $ dog vdi list
      name        id    size    used  shared    creation time   object id
   --------------------------------------------------------------------
      Bob          0  2.0 GB  1.6 GB  0.0 MB 2010-03-23 16:16      80000
      Alice        0  256 GB  0.0 MB  0.0 MB 2010-03-23 16:21      c0000
    s Alice        1  256 GB  0.0 MB  0.0 MB 2010-03-23 16:16      40000
      Charlie      0  256 GB  0.0 MB  0.0 MB 2010-03-23 16:23     100000

Test Environment
----------------
    - Debian squeeze amd64
    - Debian lenny amd64

===============================================================================
Copyright (C) 2009-2011, Nippon Telegraph and Telephone Corporation.

sheepdog-0.8.3/autogen.sh

#!/bin/sh
# Run this to generate all the initial makefiles, etc.
echo Building configuration system...
autoreconf -i -f && echo Now run ./configure and make

sheepdog-0.8.3/configure.ac

#
# Copyright 2010 Red Hat, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING.  If not, write to
# the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
#

# bootstrap / init
m4_define([sheepdog_version], [0.8.3])
m4_define([git_version],
	  m4_esyscmd([git describe --tags --dirty 2> /dev/null | sed 's/^v//' \
		      | tr '-' '_' | tr -d '\n']))

AC_INIT([sheepdog],
	m4_default(git_version, sheepdog_version),
	[sheepdog@lists.wpkg.org])

AM_INIT_AUTOMAKE([-Wno-portability])

AC_CONFIG_SRCDIR([dog/dog.c])
AC_CONFIG_HEADER([include/config.h])

AC_CANONICAL_HOST

AC_LANG([C])

m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES(yes)])

if make --help 2>&1 | grep -q no-print-directory; then
	AM_MAKEFLAGS="$AM_MAKEFLAGS --no-print-directory";
fi

if make --help 2>&1 | grep -q quiet; then
	AM_MAKEFLAGS="$AM_MAKEFLAGS --quiet"
fi

if libtool --help 2>&1 | grep -q quiet; then
	AM_LIBTOOLFLAGS="--quiet";
fi

m4_ifndef([PKG_PROG_PKG_CONFIG], AC_MSG_ERROR([pkg-config not found]))

dnl Fix default variables - "prefix" variable if not specified
if test "$prefix" = "NONE"; then
	prefix="/usr"

	dnl Fix "localstatedir" variable if not specified
	if test "$localstatedir" = "\${prefix}/var"; then
		localstatedir="/var"
	fi
	dnl Fix "sysconfdir" variable if not specified
	if test "$sysconfdir" = "\${prefix}/etc"; then
		sysconfdir="/etc"
	fi
	dnl Fix "libdir" variable if not specified
	if test "$libdir" = "\${exec_prefix}/lib"; then
		if test -e /usr/lib64; then
			libdir="/usr/lib64"
		else
			libdir="/usr/lib"
		fi
	fi
fi

# check stolen from gnulib/m4/gnu-make.m4
if ! ${MAKE-make} --version /cannot/make/this >/dev/null 2>&1; then
	AC_MSG_ERROR([you don't seem to have GNU make; it is required])
fi

AC_PROG_CC
AM_PROG_AS
AC_PROG_INSTALL
AC_PROG_LN_S
AC_PROG_MAKE_SET
AC_PROG_RANLIB
AC_CHECK_PROGS([GROFF], [groff])
AM_MISSING_PROG(AUTOM4TE, autom4te, $missing_dir)

# Checks for libraries.
AC_CHECK_LIB([socket], [socket])

# Checks for header files.
AC_FUNC_ALLOCA AC_HEADER_DIRENT AC_HEADER_STDC AC_HEADER_SYS_WAIT AC_CHECK_HEADERS([arpa/inet.h fcntl.h limits.h netdb.h netinet/in.h stdint.h \ stdlib.h string.h sys/ioctl.h sys/param.h sys/socket.h \ sys/time.h syslog.h unistd.h sys/types.h getopt.h malloc.h \ sys/sockio.h utmpx.h]) AC_CHECK_HEADERS([urcu.h urcu/uatomic.h],, AC_MSG_ERROR(liburcu 0.6.0 or later is required)) # Checks for typedefs, structures, and compiler characteristics. AC_C_CONST AC_TYPE_UID_T AC_C_INLINE AC_TYPE_SIZE_T AC_HEADER_TIME AC_C_VOLATILE AC_CHECK_SIZEOF(short) AC_CHECK_SIZEOF(int) AC_CHECK_SIZEOF(long) AC_CHECK_SIZEOF(long long) SIZEOF_SHORT=$ac_cv_sizeof_short SIZEOF_INT=$ac_cv_sizeof_int SIZEOF_LONG=$ac_cv_sizeof_long SIZEOF_LONG_LONG=$ac_cv_sizeof_long_long AC_SUBST(SIZEOF_SHORT) AC_SUBST(SIZEOF_INT) AC_SUBST(SIZEOF_LONG) AC_SUBST(SIZEOF_LONG_LONG) # Checks for header files. AC_CHECK_HEADERS([sys/eventfd.h]) AC_CHECK_HEADERS([sys/signalfd.h]) AC_CHECK_HEADERS([sys/timerfd.h]) # Checks for library functions. AC_FUNC_CLOSEDIR_VOID AC_FUNC_ERROR_AT_LINE AC_REPLACE_FNMATCH AC_FUNC_FORK AC_PROG_GCC_TRADITIONAL AC_FUNC_MALLOC AC_FUNC_MEMCMP AC_FUNC_REALLOC AC_FUNC_SELECT_ARGTYPES AC_TYPE_SIGNAL AC_FUNC_VPRINTF AC_CHECK_FUNCS([alarm alphasort atexit bzero dup2 endgrent endpwent fcntl \ getcwd getpeerucred getpeereid gettimeofday inet_ntoa memmove \ memset mkdir scandir select socket strcasecmp strchr strdup \ strerror strrchr strspn strstr fallocate]) AC_CONFIG_FILES([Makefile dog/Makefile sheep/Makefile sheepfs/Makefile include/Makefile script/Makefile lib/Makefile man/Makefile shepherd/Makefile tests/unit/Makefile tests/unit/mock/Makefile tests/unit/dog/Makefile tests/unit/sheep/Makefile tools/Makefile]) ### Local business # =============================================== # Helpers # =============================================== ## helper for CC stuff cc_supports_flag() { local CFLAGS="$@" AC_MSG_CHECKING([whether $CC supports "$@"]) AC_COMPILE_IFELSE([AC_LANG_SOURCE([int main(){return 0;}])] , [RC=0; AC_MSG_RESULT([yes])], [RC=1; AC_MSG_RESULT([no])]) return $RC } ## cleanup AC_MSG_NOTICE(Sanitizing prefix: ${prefix}) case $prefix in NONE) prefix=/usr/local;; esac AC_MSG_NOTICE(Sanitizing exec_prefix: ${exec_prefix}) case $exec_prefix in dnl For consistency with Sheepdog, map NONE->$prefix NONE) exec_prefix=$prefix;; prefix) exec_prefix=$prefix;; esac ## local defines PACKAGE_FEATURES="" LINT_FLAGS="-weak -unrecog +posixlib +ignoresigns -fcnuse \ -badflag -D__gnuc_va_list=va_list -D__attribute\(x\)=" AM_CONDITIONAL(BUILD_SHA1_HW, [[[[ $host = *x86_64* ]]]]) AC_ARG_ENABLE([fatal-warnings], [ --enable-fatal-warnings : enable fatal warnings. ], [ default="no" ]) AC_ARG_ENABLE([debug], [ --enable-debug : enable debug build. ], [ default="no" ]) AC_ARG_ENABLE([unittest], [ --enable-unittest : enable unittest. ], [ default="no" ]) AC_ARG_ENABLE([coverage], [ --enable-coverage : coverage analysis of the codebase. 
], [ default="no" ]) AM_CONDITIONAL(BUILD_COVERAGE, test x$enable_coverage = xyes) AC_ARG_ENABLE([corosync], [ --enable-corosync : build corosync cluster driver ],, [ enable_corosync="yes" ],) AM_CONDITIONAL(BUILD_COROSYNC, test x$enable_corosync = xyes) AC_ARG_ENABLE([zookeeper], [ --enable-zookeeper : build zookeeper cluster driver ],, [ enable_zookeeper="no" ],) AM_CONDITIONAL(BUILD_ZOOKEEPER, test x$enable_zookeeper = xyes) AC_ARG_ENABLE([shepherd], [ --enable-shepherd : build shepherd cluster driver ],, [ enable_shepherd="no" ],) AM_CONDITIONAL(BUILD_SHEPHERD, test x$enable_shepherd = xyes) AC_ARG_WITH([initddir], [ --with-initddir=DIR : path to init script directory. ], [ INITDDIR="$withval" ], [ INITDDIR="$sysconfdir/init.d" ]) AC_ARG_ENABLE([trace], [ --enable-trace : enable trace],, [ enable_trace="${enable_debug}" ],) AM_CONDITIONAL(BUILD_TRACE, test x$enable_trace = xyes) PKG_CHECK_MODULES([fuse],[fuse], HAVE_FUSE="yes", HAVE_FUSE="no") AC_ARG_ENABLE([sheepfs], [ --enable-sheepfs : enable sheepfs],, [ enable_sheepfs=$HAVE_FUSE ],) AM_CONDITIONAL(BUILD_SHEEPFS, test x$enable_sheepfs = xyes) AC_ARG_ENABLE([http], [ --enable-http : enable http request service (default no) ],, [ enable_http="no" ],) AM_CONDITIONAL(BUILD_HTTP, test x$enable_http = xyes) CP=cp OS_LDL="-ldl" case "$host_os" in *linux*) AC_DEFINE_UNQUOTED([SHEEPDOG_LINUX], [1], [Compiling for Linux platform]) OS_CFLAGS="" OS_CPPFLAGS="" OS_LDFLAGS="" OS_DYFLAGS="" DARWIN_OPTS="" ;; *) AC_MSG_ERROR([Unsupported OS? hmmmm]) ;; esac AC_SUBST(CP) # *FLAGS handling goes here ENV_CFLAGS="$CFLAGS" ENV_CPPFLAGS="$CPPFLAGS" ENV_LDFLAGS="$LDFLAGS" # debug build stuff if test "x${enable_debug}" = xyes; then AC_DEFINE_UNQUOTED([DEBUG], [1], [Compiling Debugging code]) OPT_CFLAGS="-O0" PACKAGE_FEATURES="$PACKAGE_FEATURES debug" else OPT_CFLAGS="-DNDEBUG" fi # gdb flags if test "x${GCC}" = xyes; then GDB_FLAGS="-ggdb3" else GDB_FLAGS="-g" fi if test "x${enable_corosync}" = xyes; then PKG_CHECK_MODULES([corosync],[corosync]) PKG_CHECK_MODULES([libcpg],[libcpg]) PKG_CHECK_MODULES([libcfg],[libcfg]) AC_DEFINE_UNQUOTED([HAVE_COROSYNC], 1, [have corosync]) PACKAGE_FEATURES="$PACKAGE_FEATURES corosync" fi if test "x${enable_zookeeper}" = xyes; then AC_CHECK_LIB([zookeeper_mt], [zookeeper_init],, AC_MSG_ERROR(libzookeeper not found)) AC_CHECK_HEADERS([zookeeper/zookeeper.h],, AC_MSG_ERROR(zookeeper.h header missing)) AC_DEFINE_UNQUOTED([HAVE_ZOOKEEPER], 1, [have zookeeper]) PACKAGE_FEATURES="$PACKAGE_FEATURES zookeeper" fi if test "x${enable_shepherd}" = xyes; then AC_DEFINE_UNQUOTED([HAVE_SHEPHERD], 1, [have shepherd]) PACKAGE_FEATURES="$PACKAGE_FEATURES shepherd" fi if test "x${enable_trace}" = xyes; then if test "x${enable_coverage}" = xyes; then AC_MSG_ERROR(tracer cannot be used with coverage options) fi if [[[ $host != *x86_64* ]]]; then AC_MSG_ERROR(tracer can be used on x86_64 architectures) fi AC_CHECK_LIB([bfd], [bfd_openr],, AC_MSG_ERROR(requires binutils-dev)) AC_CHECK_HEADERS([bfd.h],, AC_MSG_ERROR(requires binutils-dev)) AC_CHECK_LIB([rt], [clock_gettime],, AC_MSG_ERROR(librt not found)) AC_DEFINE_UNQUOTED([HAVE_TRACE], 1, [have trace]) PACKAGE_FEATURES="$PACKAGE_FEATURES trace" fi if test "x${enable_sheepfs}" = xyes; then AC_CHECK_HEADERS([fuse.h],, AC_MSG_ERROR(fuse.h header missing), [#define _FILE_OFFSET_BITS 64]) if test "x${enable_http}" = xyes; then AC_CHECK_LIB([curl], [curl_easy_init],, AC_MSG_ERROR(libcurl not found)) AC_CHECK_HEADERS([curl/curl.h],, AC_MSG_ERROR(curl.h header missing)) fi 
AC_DEFINE_UNQUOTED([HAVE_SHEEPFS], 1, [have sheepfs]) PACKAGE_FEATURES="$PACKAGE_FEATURES sheepfs" PKG_CHECK_EXISTS(fuse >= 2.8.0, [AC_DEFINE_UNQUOTED([FUSE_SUPPORT_BIGWRITES], 1, [Support -obig_writes for fuse])]); fi if test "x${enable_http}" = xyes; then AC_CHECK_HEADERS([fcgiapp.h],, AC_MSG_ERROR(fcgiapp.h header not found)) AC_CHECK_LIB([fcgi], [FCGX_Accept],, AC_MSG_ERROR(libfcgi not found)) AC_DEFINE_UNQUOTED(HAVE_HTTP, 1, [have http]) PACKAGE_FEATURES="$PACKAGE_FEATURES http" fi # extra warnings EXTRA_WARNINGS="" WARNLIST=" all shadow missing-prototypes missing-declarations strict-prototypes pointer-arith write-strings bad-function-cast missing-format-attribute format=2 format-security format-nonliteral no-long-long unsigned-char gnu89-inline no-strict-aliasing " case "${host}" in arm*) ;; *) WARNLIST="${WARNLIST} cast-align" ;; esac for j in $WARNLIST; do if cc_supports_flag -W$j; then EXTRA_WARNINGS="$EXTRA_WARNINGS -W$j"; fi done if test "x${enable_coverage}" = xyes && \ cc_supports_flag -ftest-coverage && \ cc_supports_flag -fprofile-arcs ; then AC_MSG_NOTICE([Enabling Coverage (enable -O0 by default)]) OPT_CFLAGS="-O0" COVERAGE_CFLAGS="-ftest-coverage -fprofile-arcs" COVERAGE_LDFLAGS="-ftest-coverage -fprofile-arcs" PACKAGE_FEATURES="$PACKAGE_FEATURES coverage" enable_unittest="yes" else COVERAGE_CFLAGS="" COVERAGE_LDFLAGS="" fi if test "x${enable_unittest}" = xyes; then PKG_CHECK_MODULES([CHECK], [check >= 0.9.4]) fi AM_CONDITIONAL(BUILD_UNITTEST, test x$enable_unittest = xyes) if test "x${enable_fatal_warnings}" = xyes && \ cc_supports_flag -Werror ; then AC_MSG_NOTICE([Enabling Fatal Warnings (-Werror)]) WERROR_CFLAGS="-Werror" PACKAGE_FEATURES="$PACKAGE_FEATURES fatal-warnings" else WERROR_CFLAGS="" fi if test "x${enable_trace}" = xyes && \ cc_supports_flag -pg ; then AC_MSG_NOTICE([Enabling trace (-pg)]) TRACE_CFLAGS="-pg" else TRACE_CFLAGS="" fi # final build of *FLAGS CFLAGS="$ENV_CFLAGS $OPT_CFLAGS $GDB_FLAGS $OS_CFLAGS \ $TRACE_CFLAGS $COVERAGE_CFLAGS $EXTRA_WARNINGS $WERROR_CFLAGS \ -D_GNU_SOURCE -D_LGPL_SOURCE -std=gnu99" CPPFLAGS="$ENV_CPPFLAGS $ANSI_CPPFLAGS $OS_CPPFLAGS" LDFLAGS="$ENV_LDFLAGS $COVERAGE_LDFLAGS $OS_LDFLAGS" # substitute what we need: AC_SUBST([OS_DYFLAGS]) AM_CONDITIONAL(BUILD_HTML_DOCS, test -n "${GROFF}") AC_SUBST([INITDDIR]) AC_SUBST([LINT_FLAGS]) AC_DEFINE_UNQUOTED([LOCALSTATEDIR], "$(eval echo ${localstatedir})", [localstate directory]) COROSYSCONFDIR=${sysconfdir}/sheepdog AC_SUBST([COROSYSCONFDIR]) AC_DEFINE_UNQUOTED([COROSYSCONFDIR], "$(eval echo ${COROSYSCONFDIR})", [sheepdog config directory]) AC_DEFINE_UNQUOTED([PACKAGE_FEATURES], "${PACKAGE_FEATURES}", [sheepdog built-in features]) AC_OUTPUT AC_MSG_RESULT([]) AC_MSG_RESULT([$PACKAGE configuration:]) AC_MSG_RESULT([ Version = ${VERSION}]) AC_MSG_RESULT([ Prefix = ${prefix}]) AC_MSG_RESULT([ Executables = ${sbindir}]) AC_MSG_RESULT([ Man pages = ${mandir}]) AC_MSG_RESULT([ Doc dir = ${docdir}]) AC_MSG_RESULT([ Libraries = ${libdir}]) AC_MSG_RESULT([ Header files = ${includedir}]) AC_MSG_RESULT([ Arch-independent files = ${datadir}]) AC_MSG_RESULT([ State information = ${localstatedir}]) AC_MSG_RESULT([ System configuration = ${sysconfdir}]) AC_MSG_RESULT([ System init.d directory = ${INITDDIR}]) AC_MSG_RESULT([ sheepdog config dir = ${COROSYSCONFDIR}]) AC_MSG_RESULT([ Features =${PACKAGE_FEATURES}]) AC_MSG_RESULT([]) AC_MSG_RESULT([$PACKAGE build info:]) AC_MSG_RESULT([ Library SONAME = ${SONAME}]) AC_MSG_RESULT(m4_shift(local_soname_list)) AC_MSG_RESULT([ Default optimization = 
${OPT_CFLAGS}]) AC_MSG_RESULT([ Default debug options = ${GDB_CFLAGS}]) AC_MSG_RESULT([ Extra compiler warnings = ${EXTRA_WARNING}]) AC_MSG_RESULT([ Env. defined CFLAG = ${ENV_CFLAGS}]) AC_MSG_RESULT([ Env. defined CPPFLAGS = ${ENV_CPPFLAGS}]) AC_MSG_RESULT([ Env. defined LDFLAGS = ${ENV_LDFLAGS}]) AC_MSG_RESULT([ OS defined CFLAGS = ${OS_CFLAGS}]) AC_MSG_RESULT([ OS defined CPPFLAGS = ${OS_CPPFLAGS}]) AC_MSG_RESULT([ OS defined LDFLAGS = ${OS_LDFLAGS}]) AC_MSG_RESULT([ OS defined LDL = ${OS_LDL}]) AC_MSG_RESULT([ OS defined DYFLAGS = ${OS_DYFLAGS}]) AC_MSG_RESULT([ ANSI defined CPPFLAGS = ${ANSI_CPPFLAGS}]) AC_MSG_RESULT([ Coverage CFLAGS = ${COVERAGE_CFLAGS}]) AC_MSG_RESULT([ Coverage LDFLAGS = ${COVERAGE_LDFLAGS}]) AC_MSG_RESULT([ Fatal War. CFLAGS = ${WERROR_CFLAGS}]) AC_MSG_RESULT([ Trace CFLAGS = ${TRACE_CFLAGS}]) AC_MSG_RESULT([ Final CFLAGS = ${CFLAGS}]) AC_MSG_RESULT([ Final CPPFLAGS = ${CPPFLAGS}]) AC_MSG_RESULT([ Final LDFLAGS = ${LDFLAGS}]) sheepdog-0.8.3/debian/000077500000000000000000000000001237656255000145555ustar00rootroot00000000000000sheepdog-0.8.3/debian/.gitignore000066400000000000000000000001561237656255000165470ustar00rootroot00000000000000*.substvars *.debhelper *.debhelper.log autoreconf.after autoreconf.before files sheepdog CHANGELOG changelog sheepdog-0.8.3/debian/compat000066400000000000000000000000021237656255000157530ustar00rootroot000000000000009 sheepdog-0.8.3/debian/control000066400000000000000000000017041237656255000161620ustar00rootroot00000000000000Source: sheepdog Section: admin Priority: optional Maintainer: PKG OpenStack Uploaders: YunQiang Su Build-Depends: debhelper (>= 9), dh-autoreconf, bash-completion, pkg-config, libcorosync-dev, liburcu-dev, libzookeeper-mt-dev [linux-any], libfuse-dev, po-debconf Standards-Version: 3.9.4 Homepage: http://sheepdog.github.io/sheepdog Vcs-Browser: http://anonscm.debian.org/?p=openstack/sheepdog.git Vcs-Git: git://anonscm.debian.org/openstack/sheepdog.git Package: sheepdog Architecture: any Pre-Depends: dpkg (>= 1.15.6~) Depends: ${shlibs:Depends}, ${misc:Depends} Recommends: corosync Description: distributed storage system for QEMU Sheepdog provides highly available block level storage volumes that can be attached to QEMU virtual machines. Sheepdog scales to several hundred nodes, and supports advanced volume management features such as snapshots, cloning, and thin provisioning. sheepdog-0.8.3/debian/copyright000066400000000000000000000030101237656255000165020ustar00rootroot00000000000000Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Contact: MORITA Kazutaka , Liu Yuan Upstream-Name: sheepdog Source: git://github.com/sheepdog/sheepdog.git Files: debian/* Copyright: 2010, Guido Günther 2012, YunQiang Su 2012, Thomas Goirand License: GPL-2 Files: * Copyright: 2009-2011 Nippon Telegraph and Telephone Corporation With upstream authors as folow: 2009-2011, MORITA Kazutaka 2009-2011, FUJITA Tomonori 2009-2011, MORIAI Satoshi License: GPL-2 License: GPL-2 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. . This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. . 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA . On Debian systems, the complete text of the GNU General Public License v2 (GPL) can be found in /usr/share/common-licenses/GPL-2. sheepdog-0.8.3/debian/debian-sheepdog-default000066400000000000000000000015221237656255000211400ustar00rootroot00000000000000# start sheepdog at boot [yes|no] START="yes" # Arguments to run the daemon with # Options: # -p, --port specify the TCP port on which to listen # -l, --loglevel specify the level of logging detail # -d, --debug include debug messages in the log # -D, --directio use direct IO when accessing the object store # -z, --zone specify the zone id # -c, --cluster specify the cluster driver DAEMON_ARGS="" # SHEEPDOG_PATH # Proper LSB systems will store sheepdog files in /var/lib/sheepdog. The init script uses this directory by default. # The directory must be on a filesystem with xattr support. In the case of ext3, user_xattr should be added to the # mount options. # # mount -o remount,user_xattr /var/lib/shepdog SHEEPDOG_PATH="/var/lib/sheepdog" sheepdog-0.8.3/debian/docs000066400000000000000000000000071237656255000154250ustar00rootroot00000000000000README sheepdog-0.8.3/debian/gbp.conf000066400000000000000000000002401237656255000161700ustar00rootroot00000000000000[DEFAULT] upstream-branch = master debian-branch = debian/unstable upstream-tag = v%(version)s compression = xz [git-buildpackage] export-dir = ../build-area/ sheepdog-0.8.3/debian/po/000077500000000000000000000000001237656255000151735ustar00rootroot00000000000000sheepdog-0.8.3/debian/po/POTFILES.in000066400000000000000000000000551237656255000167500ustar00rootroot00000000000000[type: gettext/rfc822deb] sheepdog.templates sheepdog-0.8.3/debian/rules000077500000000000000000000040361237656255000156400ustar00rootroot00000000000000#!/usr/bin/make -f # -*- makefile -*- DEBVERS ?= $(shell dpkg-parsechangelog | sed -n -e 's/^Version: //p') VERSION ?= $(shell echo '$(DEBVERS)' | sed -e 's/^[[:digit:]]*://' -e 's/[-].*//') DEBFLAVOR ?= $(shell dpkg-parsechangelog | grep -E ^Distribution: | cut -d" " -f2) DEBPKGNAME ?= $(shell dpkg-parsechangelog | grep -E ^Source: | cut -d" " -f2) UPSTREAM_GIT ?= git://github.com/sheepdog/sheepdog.git GIT_TAG ?= $(shell echo v'$(VERSION)' | sed -e 's/~/_/') %: dh $@ --with autoreconf override_dh_builddeb: dh_builddeb -- -Zxz -z9 override_dh_autoreconf: dh_autoreconf --mode=timesize override_dh_auto_build: dh_auto_build --parallel UNAME := $(shell uname) ifeq ($(UNAME),Linux) ZOOKEEPER=--enable-zookeeper endif override_dh_auto_configure: dh_auto_configure -- ${ZOOKEEPER} override_dh_install: dh_install rm -rf debian/sheepdog/etc/init.d/ dh_bash-completion get-vcs-source: git remote add upstream $(UPSTREAM_GIT) || true git fetch upstream if [ ! -f ../$(DEBPKGNAME)_$(VERSION).orig.tar.xz ] ; then \ git archive --prefix=$(DEBPKGNAME)-$(GIT_TAG)/ $(GIT_TAG) | xz >../$(DEBPKGNAME)_$(VERSION).orig.tar.xz ; \ fi if [ ! -e ../build-area ] ; then mkdir ../build-area ; fi if [ ! -e ../build-area ] ; then cp ../$(DEBPKGNAME)_$(VERSION).orig.tar.xz ../build-area ; fi if ! 
git checkout master ; then \ echo "No upstream branch: checking out" ; \ git checkout -b master upstream/master ; \ fi git checkout debian/$(DEBFLAVOR) display-po-stats: cd $(CURDIR)/debian/po; for i in *.po ;do \ echo -n $$i": ";\ msgfmt -o /dev/null --statistic $$i ; \ done call-for-po-trans: podebconf-report-po --call --withtranslators --languageteam gen-upstream-changelog: git checkout master git reset --hard $(GIT_TAG) git log >$(CURDIR)/../CHANGELOG git checkout debian/$(DEBFLAVOR) mv $(CURDIR)/../CHANGELOG $(CURDIR)/debian/CHANGELOG git add $(CURDIR)/debian/CHANGELOG git commit -a -m "Updated upstream changelog" override_dh_installchangelogs: dh_installchangelogs $(CURDIR)/debian/CHANGELOG sheepdog-0.8.3/debian/sheepdog.bash-completion000066400000000000000000000001011237656255000213510ustar00rootroot00000000000000script/bash_completion_dog dog script/bash_completion_dog collie sheepdog-0.8.3/debian/sheepdog.config000066400000000000000000000005631237656255000175460ustar00rootroot00000000000000#!/bin/sh set -e . /usr/share/debconf/confmodule if [ -r /etc/default/sheepdog ] ; then . /etc/default/sheepdog if [ x"yes" = x"$START" ] ; then db_set sheepdog/start true else db_set sheepdog/start false fi db_set sheepdog/daemon_args "$DAEMON_ARGS" fi db_input medium sheepdog/start || true db_input medium sheepdog/daemon_args || true db_go || true exit 0 sheepdog-0.8.3/debian/sheepdog.init000066400000000000000000000101041237656255000172340ustar00rootroot00000000000000#!/bin/sh ### BEGIN INIT INFO # Provides: sheepdog # Required-Start: hostname $network $remote_fs $syslog # Required-Stop: $remote_fs # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 # Short-Description: Sheepdog is a distributed storage system for KVM/QEMU. # Description: Sheepdog is a distributed storage system for KVM/QEMU. It provides # highly available block level storage volumes to virtual machines. # Sheepdog supports advanced volume management features such as snapshot, # cloning, and thin provisioning. The architecture of Sheepdog is fully # symmetric; there is no central node such as a meta-data server. ### END INIT INFO # Author: YunQiang Su # PATH should only include /usr/* if it runs after the mountnfs.sh script PATH=/sbin:/usr/sbin:/bin:/usr/bin DESC=sheepdog # Introduce a short description here NAME=sheepdog # Introduce the short server's name here DAEMON=/usr/sbin/sheep # Introduce the server's location here DAEMON_ARGS="" # Arguments to run the daemon with PIDFILE=/var/run/$NAME.pid SCRIPTNAME=/etc/init.d/$NAME # Exit if the package is not installed [ -x $DAEMON ] || exit 0 # Read configuration variable file if it is present [ -r /etc/default/$NAME ] && . /etc/default/$NAME #FIXME: user cannot give pidfile in /etc/default/sheepdog DAEMON_ARGS="$DAEMON_ARGS --pidfile $PIDFILE" if [ "$START" != "yes" ]; then exit 0 fi # Define LSB log_* functions. # Depend on lsb-base (>= 3.0-6) to ensure that this file is present. . /lib/lsb/init-functions # # Function that starts the daemon/service # do_start() { # Return # 0 if daemon has been started # 1 if daemon was already running # 2 if daemon could not be started start-stop-daemon --start --quiet --pidfile $PIDFILE --exec $DAEMON --test > /dev/null \ || return 1 start-stop-daemon --start --quiet --pidfile $PIDFILE --exec $DAEMON -- \ $DAEMON_ARGS $SHEEPDOG_PATH \ || return 2 # Add code here, if necessary, that waits for the process to be ready # to handle requests from services started subsequently which depend # on this one. As a last resort, sleep for some time. 
} # # Function that stops the daemon/service # do_stop() { # Return # 0 if daemon has been stopped # 1 if daemon was already stopped # 2 if daemon could not be stopped # other if a failure occurred start-stop-daemon --stop --quiet --pidfile $PIDFILE RETVAL="$?" return "$RETVAL" } # # Function that sends a SIGHUP to the daemon/service # do_reload() { # # If the daemon can reload its configuration without # restarting (for example, when it is sent a SIGHUP), # then implement that here. # start-stop-daemon --stop --signal 1 --quiet --pidfile $PIDFILE --name $NAME return 0 } case "$1" in start) [ "$VERBOSE" != no ] && log_daemon_msg "Starting $DESC " "$NAME" do_start case "$?" in 0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;; 2) [ "$VERBOSE" != no ] && log_end_msg 1 ;; esac ;; stop) [ "$VERBOSE" != no ] && log_daemon_msg "Stopping $DESC" "$NAME" do_stop case "$?" in 0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;; 2) [ "$VERBOSE" != no ] && log_end_msg 1 ;; esac ;; status) status_of_proc "$DAEMON" "$NAME" && exit 0 || exit $? ;; #reload|force-reload) # # If do_reload() is not implemented then leave this commented out # and leave 'force-reload' as an alias for 'restart'. # #log_daemon_msg "Reloading $DESC" "$NAME" #do_reload #log_end_msg $? #;; restart|force-reload) # # If the "reload" option is implemented then remove the # 'force-reload' alias # log_daemon_msg "Restarting $DESC" "$NAME" do_stop case "$?" in 0|1) do_start case "$?" in 0) log_end_msg 0 ;; 1) log_end_msg 1 ;; # Old process is still running *) log_end_msg 1 ;; # Failed to start esac ;; *) # Failed to stop log_end_msg 1 ;; esac ;; *) #echo "Usage: $SCRIPTNAME {start|stop|restart|reload|force-reload}" >&2 echo "Usage: $SCRIPTNAME {start|stop|status|restart|force-reload}" >&2 exit 3 ;; esac exit 0 sheepdog-0.8.3/debian/sheepdog.install000066400000000000000000000000631237656255000177420ustar00rootroot00000000000000debian/debian-sheepdog-default /usr/share/sheepdog sheepdog-0.8.3/debian/sheepdog.links000066400000000000000000000000371237656255000174150ustar00rootroot00000000000000/usr/sbin/dog /usr/sbin/collie sheepdog-0.8.3/debian/sheepdog.postinst000066400000000000000000000012111237656255000201530ustar00rootroot00000000000000#!/bin/sh set -e if [ "$1" = "configure" ] ; then . /usr/share/debconf/confmodule mkdir -p /var/lib/sheepdog/ mkdir -p /etc/default if [ ! -e /etc/default/sheepdog ] ; then cp /usr/share/sheepdog/debian-sheepdog-default /etc/default/sheepdog fi if [ -r /etc/default/sheepdog ] ; then db_get sheepdog/start if [ "${RET}" = "true" ] ; then SERVICE_START="yes" else SERVICE_START="no" fi sed -i -e "s/^[ \t]*START=.*/START=\"$SERVICE_START\"/g" /etc/default/sheepdog db_get sheepdog/daemon_args sed -i -e "s/^[ \t]*DAEMON_ARGS=.*/DAEMON_ARGS=\"$RET\"/g" /etc/default/sheepdog fi db_stop || true fi #DEBHELPER# exit 0 sheepdog-0.8.3/debian/sheepdog.postrm000066400000000000000000000002011237656255000176120ustar00rootroot00000000000000#!/bin/sh set -e if [ "${1}" = "purge" ] ; then rm -f /etc/default/sheepdog rm -rf /var/lib/sheepdog fi #DEBHELPER# exit 0 sheepdog-0.8.3/debian/sheepdog.templates000066400000000000000000000023651237656255000203010ustar00rootroot00000000000000# These templates have been reviewed by the debian-l10n-english # team # # If modifications/additions/rewording are needed, please ask # debian-l10n-english@lists.debian.org for advice. # # Even minor modifications require translation updates and such # changes should be coordinated with translators and reviewers. 
Template: sheepdog/start Type: boolean Default: false _Description: Automatically start the sheepdog service? Please choose whether the sheepdog service should start automatically when the system is booted. Template: sheepdog/daemon_args Type: string Default: _Description: Arguments for the sheepdog daemon: Please choose the command line arguments that should be passed to the sheepdog daemon. If no argument is given, the default behavior is to start on port 7000, using the corosync driver. . Available options include: -p, --port specify the TCP port to listen to -l, --loglevel specify the level of logging detail -d, --debug include debug messages in the log -D, --directio use direct I/O when accessing the object store -z, --zone specify the zone ID -c, --cluster specify the cluster driver More information can be found in the sheep(8) manual page. sheepdog-0.8.3/debian/source/000077500000000000000000000000001237656255000160555ustar00rootroot00000000000000sheepdog-0.8.3/debian/source/format000066400000000000000000000000141237656255000172630ustar00rootroot000000000000003.0 (quilt) sheepdog-0.8.3/debian/watch000066400000000000000000000001041237656255000156010ustar00rootroot00000000000000version=3 https://github.com/sheepdog/sheepdog/tags .*/v(.*).tar.gz sheepdog-0.8.3/doc/000077500000000000000000000000001237656255000141005ustar00rootroot00000000000000sheepdog-0.8.3/doc/about_this_guide.rst000066400000000000000000000014451237656255000201540ustar00rootroot00000000000000About This Guide ================ This guide is for users who intend to install and administer a sheepdog cluster. It's divided in three main sections: - Sheepdog Basics - Sheepdog Advanced Use - Suggestions And Special Cases You have to be familiar with GNU/Linux, the shell, networking and virtualization (in general), ssh and kvm (specifically). In this guide we use a "simple" scenario of a small cluster with 4 nodes. Here are some of the terms used in this document: :: cluster = group of servers host = a server vm / guest = a virtual machine running on a host node = a host running sheepdog daemon vdi = virtual disk used by sheepdog sheep = vary according to the context. It may be - the sheepdog daemon process name - the command to run the daemon sheepdog-0.8.3/doc/api-strbuf.txt000066400000000000000000000147561237656255000167320ustar00rootroot00000000000000strbuf API ========== strbuf's are meant to be used with all the usual C string and memory APIs. Given that the length of the buffer is known, it's often better to use the mem* functions than a str* one (memchr vs. strchr e.g.). Though, one has to be careful about the fact that str* functions often stop on NULs and that strbufs may have embedded NULs. An strbuf is NUL terminated for convenience, but no function in the strbuf API actually relies on the string being free of NULs. strbufs has some invariants that are very important to keep in mind: . The `buf` member is never NULL, so it can be used in any usual C string operations safely. strbuf's _have_ to be initialized either by `strbuf_init()` or by `= STRBUF_INIT` before the invariants, though. + Do *not* assume anything on what `buf` really is (e.g. if it is allocated memory or not), use `strbuf_detach()` to unwrap a memory buffer from its strbuf shell in a safe way. That is the sole supported way. This will give you a malloced buffer that you can later `free()`. + However, it is totally safe to modify anything in the string pointed by the `buf` member, between the indices `0` and `len-1` (inclusive). . 
The `buf` member is a byte array that has at least `len + 1` bytes
allocated. The extra byte is used to store a `'\0'`, allowing the
`buf` member to be a valid C-string. Every strbuf function ensures this
invariant is preserved.
+
NOTE: It is OK to "play" with the buffer directly if you work it this way:
+
----
strbuf_grow(sb, SOME_SIZE); <1>
strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
----
<1> Here, the memory array starting at `sb->buf`, and of length
`strbuf_avail(sb)`, is all yours, and you can be sure that
`strbuf_avail(sb)` is at least `SOME_SIZE`.
+
NOTE: `SOME_OTHER_SIZE` must be smaller than or equal to `strbuf_avail(sb)`.
+
Doing so is safe, though if it has to be done in many places, adding the
missing API to the strbuf module is the way to go.
+
WARNING: Do _not_ assume that the area that is yours is of size `alloc
- 1` even if it's true in the current implementation. `alloc` is somehow a
"private" member that should not be messed with. Use `strbuf_avail()`
instead.

Data structures
---------------

* `struct strbuf`

This is the string buffer structure. The `len` member can be used to
determine the current length of the string, and the `buf` member provides
access to the string itself.

Functions
---------

* Life cycle

`strbuf_init`:: Initialize the structure. The second parameter can be
zero or a bigger number to allocate memory, in case you want to prevent
further reallocs.

`strbuf_release`:: Release a string buffer and the memory it used. You
should not use the string buffer after using this function, unless you
initialize it again.

`strbuf_detach`:: Detach the string from the strbuf and return it; you
now own the storage the string occupies and it is your responsibility
from then on to release it with `free(3)` when you are done with it.

`strbuf_attach`:: Attach a string to a buffer. You should specify the
string to attach, the current length of the string and the amount of
allocated memory. The amount must be larger than the string length,
because the string you pass is supposed to be a NUL-terminated string.
This string _must_ be malloc()ed, and after attaching, the pointer
cannot be relied upon anymore, nor can it be free()d directly.

* Related to the size of the buffer

`strbuf_avail`:: Determine the amount of allocated but unused memory.

`strbuf_grow`:: Ensure that at least this amount of unused memory is
available after `len`. This is used when you know a typical size for
what you will add and want to avoid repetitive automatic resizing of
the underlying buffer. This is never a needed operation, but can be
critical for performance in some cases.

`strbuf_setlen`:: Set the length of the buffer to a given value. This
function does *not* allocate new memory, so you should not perform a
`strbuf_setlen()` to a length that is larger than `len +
strbuf_avail()`. `strbuf_setlen()` is just meant as a 'please fix
invariants from this strbuf I just messed with'.

`strbuf_reset`:: Empty the buffer by setting the size of it to zero.

* Related to the contents of the buffer

`strbuf_rtrim`:: Strip whitespace from the end of a string.

* Adding data to the buffer

NOTE: All of the functions in this section will grow the buffer as
necessary. If they fail for some reason other than memory shortage and the
buffer hadn't been allocated before (i.e. the `struct strbuf` was set to
`STRBUF_INIT`), then they will free() it.

`strbuf_addch`:: Add a single character to the buffer.

`strbuf_insert`:: Insert data at the given position of the buffer. The
remaining contents will be shifted, not overwritten.
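+
For illustration, a minimal sketch (assuming the conventional
`(struct strbuf *sb, size_t pos, const void *data, size_t len)`
signature, which this document does not spell out):
+
----
struct strbuf sb = STRBUF_INIT;

strbuf_addstr(&sb, "hello world");
strbuf_insert(&sb, 5, ",", 1);	/* sb.buf is now "hello, world" */
strbuf_release(&sb);
----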
`strbuf_remove`:: Remove a given amount of data from a given position of
the buffer.

`strbuf_splice`:: Remove the bytes between `pos..pos+len` and replace
them with the given data.

`strbuf_add`:: Add data of given length to the buffer.

`strbuf_addstr`:: Add a NUL-terminated string to the buffer.
+
NOTE: This function will *always* be implemented as an inline or a macro
that expands to:
+
----
strbuf_add(..., s, strlen(s));
----
+
Meaning that this is efficient to write things like:
+
----
strbuf_addstr(sb, "immediate string");
----

`strbuf_addbuf`:: Copy the contents of another buffer at the end of the
current one.

`strbuf_addf`:: Add a formatted string to the buffer.

`strbuf_fread`:: Read a given size of data from a FILE* pointer to the
buffer.
+
NOTE: The buffer is rewound if the read fails. If -1 is returned,
`errno` must be consulted, like you would do for `read(3)`.
`strbuf_read()`, `strbuf_read_file()` and `strbuf_getline()` have the
same behaviour as well.

`strbuf_read`:: Read the contents of a given file descriptor. The third
argument can be used to give a hint about the file size, to avoid
reallocs.

`strbuf_getline`:: Read a line from a FILE *, overwriting the existing
contents of the strbuf. The second argument specifies the line
terminator character, typically `'\n'`. Reading stops after the
terminator or at EOF. The terminator is removed from the buffer before
returning. Returns 0 unless there was nothing left before EOF, in which
case it returns `EOF`.

`strbuf_copyout`:: Copy the contents of the strbuf to the second
argument 'buf'. The number of bytes to be copied is at most the third
argument 'len'.

`strbuf_stripout`:: Strip out the contents of the strbuf to the second
argument 'buf'. The number of bytes to be copied is at most the third
argument 'len'.
sheepdog-0.8.3/doc/author_and_licensing.rst000066400000000000000000000010031237656255000210060ustar00rootroot00000000000000Author And Licensing
====================

|cc logo|

This manual has been created by *Valerio Pachera* for the Sheepdog
Project and is released under the `Creative Commons Attribution 3.0
Unported`_ license.

.. _`Creative Commons Attribution 3.0 Unported`: http://creativecommons.org/licenses/by/3.0/deed.it

.. |cc logo| image:: http://i.creativecommons.org/l/by/3.0/88x31.png
   :scale: 200 %
   :alt: Creative Commons Logo
   :target: http://creativecommons.org/licenses/by/3.0/deed.it
sheepdog-0.8.3/doc/farm-internal.txt000066400000000000000000000114521237656255000174030ustar00rootroot00000000000000
==================
Farm Store
==================

Liu Yuan
Taobao Inc.

1. OVERVIEW

Farm is an object store for Sheepdog on a per-node basis. It consists of
a backend store, which caches the snapshot objects, and a working
directory, which stores the objects Sheepdog currently operates on. That
being said, the I/O performance for VM guests would be practically the
same as with Simple Store. [*]

[*] Simple Store is an older storage backend which has been removed from
the tree.

Snapshots are triggered either by the system recovery code or by users,
and Farm is supposed to restore all the object states to the ones at the
time the user snapshot was taken. "Snapshot object" in this context
means both meta objects and data objects.

2. DESIGN

Simply put, Farm resembles git a lot (at both the code and the idea
level). There are three object types, named 'data', 'trunk' and
'snapshot'[*], which are similar to git's 'blob', 'tree' and 'commit'.

[*] shortened to 'snap' below.

A 'data' object is just a Sheepdog I/O object, named by the SHA-1 of its
content. Data objects with the same content are thus mapped to a single
sha1 file, achieving node-wide data sharing.
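(For illustration only -- a sketch of the content-addressing idea using
hypothetical helper names, not Farm's actual API:

	int store_data_object(const void *buf, size_t len)
	{
		unsigned char sha1[SHA1_LEN];
		char path[PATH_MAX];

		calc_sha1(buf, len, sha1);	/* name = SHA-1 of content */
		sha1_to_path(sha1, path);	/* e.g. <store>/da/39a3ee... */
		if (access(path, F_OK) == 0)
			return 0;		/* same content: share it */
		return write_sha1_file(path, buf, len);
	}

Two objects holding the same bytes hash to the same path, which is what
yields the node-wide sharing described above.)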
A 'trunk' object ties data objects together into a flat directory
structure at the time the snapshot is taken. The trunk object provides a
means to find old data objects in the store.

A 'snap' object describes the snapshot, either initiated by users or
triggered by the recovery code. The snap object refers to one of the
trunk objects. The two snap log files provide a means to name the
desired snap object.

All the objects described above appear in the context of snapshotting or
of retrieving old data from snapshotted objects; that is, those objects
are 'cached' into the Farm store by performing snapshot operations.

3. OBJECT LAYOUT

All the objects (snap, trunk, data) in Farm are based on the operations
of the sha1_file. sha1_file gives us compression and consistency
checking independent of the content or the type of the object. The
object successfully inflates to a stream of bytes that forms a header
followed by a payload.

The payload of a data object is the compressed content of Sheepdog's I/O
object. For a trunk object, the compressed content is:

struct trunk_entry {
	uint64_t oid;
	unsigned char sha1[SHA1_LEN];
};

For a snap object, the compressed content records the trunk object the
snapshot refers to.

As for snap operations, besides the snap object, Farm keeps two log
files with the below structure:

struct snap_log {
	uint32_t epoch;
	uint64_t time;
	unsigned char sha1[SHA1_LEN];
};

This provides an internal naming mechanism and helps us find snap
objects by epoch.

4. STALE OBJECT

When one object is stored into the backend store at snapshot time,
either a) its content did not change, so it points to the same old
sha1_file (no stale object), or b) its content was updated, so it points
to a new object with a new sha1.

We need to remove the stale object in case b), but only on the
assumption that it is an object generated by the recovery code. [*]
When we store a new snapshot object into the backend store, it is a safe
and good time for us to remove the old object with the same object ID.
For user snapshot objects, we don't need to remove them until the
snapshot is deleted.

[*] Here I assume we don't need to restore to the 'sys epoch' state.

5. FLOW FIGURE

 sys_snap, user_snap snapshot requests | | |put/get snap_sha1 | trigger v | +----------+ +------+ +--------+ v +----------+ | |<------>| snap |<++++++>| | <========> | | | | +------+ | | | Farm | | | | trunk | | Working | I/O +-------+ | |<---------------------->| | | Directory| <~~~~~~>|sheep | | Farm | +--------+ | | +-------+ | Backend | | | | Store | | | | |<-------------------------------------------->| | | | | | +----------+ +----------+

 <-----> put/get objects to/from Farm Store
 <+++++> put/get trunk_sha1 to/from snap object
 <=====> put/get oid/oid_sha1 pairs to/from trunk object
sheepdog-0.8.3/doc/goals.rst000066400000000000000000000045061237656255000157440ustar00rootroot00000000000000Goals
=====

Common Problems Related To Virtual Server Management:
*****************************************************

Host Dependence:

 A virtualized environment allows several operating systems to run on a
 single physical server. This results in better resource utilization,
 but it implies more services going down if the host has problems.
 Eventually we'll have to do some maintenance on our servers, or they
 may simply stop working. When we stop a host, it would be good to be
 able to run its guests on another server.
Downtime:

 We may copy the guests' files to a second server, or even move the
 physical hard disks onto it. This requires time, effort and probably
 physical access to the servers.

Single Point of Failure:

 Another common solution is to use shared storage. That simply means
 keeping our guests' disks in an NFS folder on a NAS-like server. This
 way, if the server running the virtual machines (call it the front-end
 server) needs maintenance, we may simply start the guests on the second
 server. But what do we do if the NAS breaks down?

Resource Waste:

 The disadvantage of this approach is that we need more hardware for the
 back-end (the shared storage), but we still have to worry about
 back-end failover. Furthermore, the hard disks on the front-end hosts
 are almost useless.

RAID Complexity:

 It's common practice to use RAID (1, 5, 6, 10) to avoid downtime due to
 disk failure. This implies buying hardware controllers or using
 software RAID.

Sheepdog Benefits
*****************

Host Independence:

 We can run any guest on any host of the cluster, as with a common
 shared storage.

Less Downtime:

 Because of the host independence, we do not need to fix a broken host
 before running its guests.

No Single Point of Failure:

 There is no single shared storage. Multiple host failures can be easily
 handled.

Less Resource Waste:

 Each node is, at the same time, a virtualization and storage server.

No RAID Necessary:

 Sheepdog is not limited to a single device per host, but can use as
 many devices as we wish. You don't need to configure RAID software or
 buy a RAID controller. Sheepdog will manage, on a single node, as many
 disks as you wish in a RAID0-like way.
sheepdog-0.8.3/doc/index.rst000066400000000000000000000023411237656255000157410ustar00rootroot00000000000000
SHEEPDOG ADMINISTRATOR GUIDE
============================

Sheepdog is a distributed storage system for QEMU. It provides highly
available block level storage volumes that can be attached to QEMU
virtual machines. Sheepdog scales to several hundred nodes, and supports
advanced volume management features such as snapshot, cloning, and thin
provisioning.

Introduction
============

.. toctree::
   :maxdepth: 2

   about_this_guide.rst
   author_and_licensing.rst
   project_history.rst
   project_status.rst
   main_futures.rst
   goals.rst

Sheepdog Basics
===============

.. toctree::
   :maxdepth: 2

   concepts.rst
   installation.rst
   configuration.rst
   start_the_cluster.rst
   dog_intro.rst
   create_new_disk.rst
   monitor_cluster.rst
   fail_over.rst
   verify_vdi.rst
   stop_and_restart_cluster.rst
   backup.rst

Sheepdog Advanced
=================

.. toctree::
   :maxdepth: 2

   more_concepts.rst
   multidevice.rst
   cache.rst
   journal.rst
   more_network_cards.rst
   snapshot.rst
   vdi_read_and_write.rst
   more_about_backup.rst
   misc.rst

Suggestions And Special Cases
=============================

.. toctree::
   :maxdepth: 2

   optimization.rst
sheepdog-0.8.3/doc/main_futures.rst000066400000000000000000000005211237656255000173310ustar00rootroot00000000000000Main Features
=============

- Simple implementation (2 services only) and intuitive command line.
- Import of existing disks (qcow2, raw).
- Export of a vdi to file (all types supported by qemu).
- Vdi snapshot.
- Vdi cloning.
- Vdi resize.
- Vdi preallocation.
- Cluster snapshot.
- Dedicated NIC for data sync.
- Support of live migration.
sheepdog-0.8.3/doc/project_history.rst000066400000000000000000000001441237656255000200600ustar00rootroot00000000000000Project History
===============

The Sheepdog project was founded by MORITA Kazutaka in 2010.
sheepdog-0.8.3/doc/project_status.rst000066400000000000000000000013401237656255000177010ustar00rootroot00000000000000Project Status
==============

Sheepdog is actively developed and more supporters are joining the
project. It has reached a good maturity level. You may use the stable
branch, or the development branch to test the latest improvements.

You may have a look at the `Developers Mailing List Archive`_ to get an
idea of the project's vitality.

We recommend that you join the `Users Mailing List`_ to ask for help
and/or offer some help. If you are an interested developer, please join
the `Developers Mailing List`_.

.. _`Developers Mailing List Archive`: http://lists.wpkg.org/pipermail/sheepdog
.. _`Users Mailing List`: http://lists.wpkg.org/mailman/listinfo/sheepdog
.. _`Developers Mailing List`: http://lists.wpkg.org/mailman/listinfo/sheepdog
sheepdog-0.8.3/dog/000077500000000000000000000000001237656255000141045ustar00rootroot00000000000000sheepdog-0.8.3/dog/Makefile.am000066400000000000000000000031211237656255000161350ustar00rootroot00000000000000#
# Copyright 2010 Red Hat, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to
# the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
#
MAINTAINERCLEANFILES = Makefile.in

AM_CFLAGS =
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include

sbin_PROGRAMS = dog

dog_SOURCES = farm/object_tree.c farm/sha1_file.c farm/snap.c \
	farm/trunk.c farm/farm.c farm/slice.c \
	dog.c common.c treeview.c vdi.c node.c cluster.c

if BUILD_TRACE
dog_SOURCES += trace.c
override CFLAGS := $(subst -pg,,$(CFLAGS))
endif

dog_LDADD = ../lib/libsheepdog.a -lpthread
dog_DEPENDENCIES = ../lib/libsheepdog.a

noinst_HEADERS = treeview.h dog.h farm/farm.h

EXTRA_DIST =

install-exec-hook:
	if [ -z "${DESTDIR}" ];then $(LN_S) -f ${sbindir}/dog ${sbindir}/collie;fi

uninstall-hook:
	rm -f ${sbindir}/collie

all-local:
	@echo Built dog

clean-local:
	rm -f dog *.o gmon.out *.da *.bb *.bbg

# support for GNU Flymake
check-syntax:
	$(COMPILE) -fsyntax-only $(CHK_SOURCES)

check-style:
	@$(CHECK_STYLE) $(dog_SOURCES) $(noinst_HEADERS)

coverage:
	@lcov -d . -c -o dog.info
sheepdog-0.8.3/dog/cluster.c000066400000000000000000000355621237656255000157420ustar00rootroot00000000000000/*
 * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ #include #include #include #include #include "dog.h" #include "farm/farm.h" static struct sd_option cluster_options[] = { {'b', "store", true, "specify backend store"}, {'c', "copies", true, "specify the default data redundancy (number of copies)"}, {'f', "force", false, "do not prompt for confirmation"}, {'t', "strict", false, "do not serve write request if number of nodes is not sufficient"}, {'s', "backend", false, "show backend store information"}, { 0, NULL, false, NULL }, }; static struct cluster_cmd_data { uint8_t copies; uint8_t copy_policy; bool force; bool show_store; bool strict; char name[STORE_LEN]; } cluster_cmd_data; #define DEFAULT_STORE "plain" static int list_store(void) { int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char buf[512] = { 0 }; sd_init_req(&hdr, SD_OP_GET_STORE_LIST); hdr.data_length = 512; ret = dog_exec_req(&sd_nid, &hdr, buf); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("Restore failed: %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } printf("Available stores:\n"); printf("---------------------------------------\n"); printf("%s\n", buf); return EXIT_SYSFAIL; } static bool no_vdi(const unsigned long *vdis) { return find_next_bit(vdis, SD_NR_VDIS, 0) == SD_NR_VDIS; } #define FORMAT_PRINT \ " __\n" \ " ()'`;\n" \ " /\\|`\n" \ " / | Caution! The cluster is not empty.\n" \ "(/_)_|_ Are you sure you want to continue? [yes/no]: " static int cluster_format(int argc, char **argv) { int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct timeval tv; char store_name[STORE_LEN]; static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); if (cluster_cmd_data.copies > sd_nodes_nr) { char info[1024]; snprintf(info, sizeof(info), "Number of copies (%d) is larger " "than number of nodes (%d).\n" "Are you sure you want to continue? 
[yes/no]: ", cluster_cmd_data.copies, sd_nodes_nr); confirm(info); } sd_init_req(&hdr, SD_OP_READ_VDIS); hdr.data_length = sizeof(vdi_inuse); ret = dog_exec_req(&sd_nid, &hdr, vdi_inuse); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); return EXIT_FAILURE; } if (!no_vdi(vdi_inuse)) confirm(FORMAT_PRINT); gettimeofday(&tv, NULL); sd_init_req(&hdr, SD_OP_MAKE_FS); hdr.cluster.copies = cluster_cmd_data.copies; hdr.cluster.copy_policy = cluster_cmd_data.copy_policy; hdr.cluster.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000; if (strlen(cluster_cmd_data.name)) pstrcpy(store_name, STORE_LEN, cluster_cmd_data.name); else pstrcpy(store_name, STORE_LEN, DEFAULT_STORE); hdr.data_length = strlen(store_name) + 1; hdr.flags |= SD_FLAG_CMD_WRITE; if (cluster_cmd_data.strict) hdr.cluster.flags |= SD_CLUSTER_FLAG_STRICT; printf("using backend %s store\n", store_name); ret = dog_exec_req(&sd_nid, &hdr, store_name); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("Format failed: %s", sd_strerror(rsp->result)); if (rsp->result == SD_RES_NO_STORE) return list_store(); else return EXIT_SYSFAIL; } return EXIT_SUCCESS; } static int cluster_info(int argc, char **argv) { int i, ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct epoch_log *logs; int nr_logs, log_length; time_t ti, ct; struct tm tm; char time_str[128]; log_length = sd_epoch * sizeof(struct epoch_log); logs = xmalloc(log_length); sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; ret = dog_exec_req(&sd_nid, &hdr, logs); if (ret < 0) goto error; /* show cluster status */ if (!raw_output) printf("Cluster status: "); if (rsp->result == SD_RES_SUCCESS) printf("running, auto-recovery %s\n", logs->disable_recovery ? "disabled" : "enabled"); else printf("%s\n", sd_strerror(rsp->result)); /* show cluster backend store */ if (cluster_cmd_data.show_store) { if (!raw_output) printf("Cluster store: "); if (rsp->result == SD_RES_SUCCESS) { char copy[10]; int data, parity; if (!logs->copy_policy) snprintf(copy, sizeof(copy), "%d", logs->nr_copies); else { ec_policy_to_dp(logs->copy_policy, &data, &parity); snprintf(copy, sizeof(copy), "%d:%d", data, parity); } printf("%s with %s redundancy policy\n", logs->drv_name, copy); } else printf("%s\n", sd_strerror(rsp->result)); } if (!raw_output && rsp->data_length > 0) { ct = logs[0].ctime >> 32; printf("\nCluster created at %s\n", ctime(&ct)); printf("Epoch Time Version\n"); } nr_logs = rsp->data_length / sizeof(struct epoch_log); for (i = 0; i < nr_logs; i++) { int j; const struct sd_node *entry; ti = logs[i].time; if (raw_output) { snprintf(time_str, sizeof(time_str), "%" PRIu64, (uint64_t) ti); } else { localtime_r(&ti, &tm); strftime(time_str, sizeof(time_str), "%Y-%m-%d %H:%M:%S", &tm); } printf(raw_output ? "%s %d" : "%s %6d", time_str, logs[i].epoch); printf(" ["); for (j = 0; j < logs[i].nr_nodes; j++) { entry = logs[i].nodes + j; printf("%s%s", (j == 0) ? 
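/* no separator before the first node in the list */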
"" : ", ", addr_to_str(entry->nid.addr, entry->nid.port)); } printf("]\n"); } free(logs); return EXIT_SUCCESS; error: free(logs); return EXIT_SYSFAIL; } static int cluster_shutdown(int argc, char **argv) { int ret; struct sd_req hdr; sd_init_req(&hdr, SD_OP_SHUTDOWN); ret = send_light_req(&sd_nid, &hdr); if (ret) { sd_err("failed to execute request"); return EXIT_FAILURE; } return EXIT_SUCCESS; } static void print_list(void *buf, unsigned len) { struct snap_log *log_buf = (struct snap_log *)buf; unsigned nr = len / sizeof(struct snap_log); printf("Index\t\tTag\t\tSnapshot Time\n"); for (unsigned i = 0; i < nr; i++, log_buf++) { time_t *t = (time_t *)&log_buf->time; printf("%d\t\t", log_buf->idx); printf("%s\t\t", log_buf->tag); printf("%s", ctime(t)); } } static int list_snapshot(int argc, char **argv) { const char *path = argv[optind++]; void *buf = NULL; int log_nr; int ret = EXIT_SYSFAIL; if (farm_init(path) != SD_RES_SUCCESS) goto out; buf = snap_log_read(&log_nr); if (!buf) goto out; print_list(buf, log_nr * sizeof(struct snap_log)); ret = EXIT_SUCCESS; out: if (ret) sd_err("Fail to list snapshot."); free(buf); return ret; } static void fill_cb(void *data, enum btree_node_type type, void *arg) { struct sd_extent *ext; struct sd_inode *inode = (struct sd_inode *)arg; uint64_t oid; if (type == BTREE_EXT) { ext = (struct sd_extent *)data; if (ext->vdi_id) { oid = vid_to_data_oid(ext->vdi_id, ext->idx); object_tree_insert(oid, inode->nr_copies, inode->copy_policy); } } } static void fill_object_tree(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { uint64_t vdi_oid = vid_to_vdi_oid(vid), vmstate_oid; uint32_t vdi_id; uint32_t nr_objs, nr_vmstate_object; /* ignore active vdi */ if (!vdi_is_snapshot(i)) return; /* fill vdi object id */ object_tree_insert(vdi_oid, i->nr_copies, i->copy_policy); /* fill data object id */ if (i->store_policy == 0) { nr_objs = count_data_objs(i); for (uint32_t idx = 0; idx < nr_objs; idx++) { vdi_id = INODE_GET_VID(i, idx); if (!vdi_id) continue; uint64_t oid = vid_to_data_oid(vdi_id, idx); object_tree_insert(oid, i->nr_copies, i->copy_policy); } } else traverse_btree(dog_bnode_reader, i, fill_cb, &i); /* fill vmstate object id */ nr_vmstate_object = DIV_ROUND_UP(i->vm_state_size, SD_DATA_OBJ_SIZE); for (uint32_t idx = 0; idx < nr_vmstate_object; idx++) { vmstate_oid = vid_to_vmstate_oid(vid, idx); object_tree_insert(vmstate_oid, i->nr_copies, i->copy_policy); } } static int save_snapshot(int argc, char **argv) { const char *tag = argv[optind++]; char *path, *p; int ret = EXIT_SYSFAIL, uninitialized_var(unused); unused = strtol(tag, &p, 10); if (tag != p) { sd_err("Tag should not start with number."); return EXIT_USAGE; } if (!argv[optind]) { sd_err("Please specify the path to save snapshot."); return EXIT_USAGE; } path = argv[optind]; if (farm_init(path) != SD_RES_SUCCESS) goto out; if (farm_contain_snapshot(0, tag)) { sd_err("Snapshot tag has already been used for another" " snapshot, please, use another one."); goto out; } if (parse_vdi(fill_object_tree, SD_INODE_SIZE, NULL) != SD_RES_SUCCESS) goto out; if (farm_save_snapshot(tag) != SD_RES_SUCCESS) goto out; ret = EXIT_SUCCESS; out: if (ret) sd_err("Fail to save snapshot to path: %s.", path); object_tree_free(); return ret; } static int load_snapshot(int argc, char **argv) { char *tag = argv[optind++]; char *path, *p; uint32_t idx; int ret = EXIT_SYSFAIL; idx = strtol(tag, &p, 10); if (tag == p) idx = 0; if (!argv[optind]) { 
sd_err("Please specify the path to save snapshot."); return EXIT_USAGE; } path = argv[optind]; if (farm_init(path) != SD_RES_SUCCESS) goto out; if (!farm_contain_snapshot(idx, tag)) { sd_err("Snapshot index or tag does not exist."); goto out; } if (cluster_format(0, NULL) != SD_RES_SUCCESS) goto out; if (farm_load_snapshot(idx, tag) != SD_RES_SUCCESS) goto out; ret = EXIT_SUCCESS; out: if (ret) sd_err("Fail to load snapshot"); return ret; } #define RECOVER_PRINT \ "Caution! Please try starting all the cluster nodes normally before\n" \ "running this command.\n\n" \ "The cluster may need to be force recovered if:\n" \ " - the master node fails to start because of epoch mismatch; or\n" \ " - some nodes fail to start after a cluster shutdown.\n\n" \ "Are you sure you want to continue? [yes/no]: " static int cluster_force_recover(int argc, char **argv) { int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char str[123] = {'\0'}; struct sd_node nodes[SD_MAX_NODES]; if (!cluster_cmd_data.force) { int i, l; printf(RECOVER_PRINT); ret = scanf("%s", str); if (ret < 0) return EXIT_SYSFAIL; l = strlen(str); for (i = 0; i < l; i++) str[i] = tolower(str[i]); if (strncmp(str, "yes", 3) != 0) return EXIT_SUCCESS; } sd_init_req(&hdr, SD_OP_FORCE_RECOVER); hdr.data_length = sizeof(nodes); ret = dog_exec_req(&sd_nid, &hdr, nodes); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("failed to execute request, %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } return EXIT_SUCCESS; } static int cluster_disable_recover(int argc, char **argv) { int ret; struct sd_req hdr; sd_init_req(&hdr, SD_OP_DISABLE_RECOVER); ret = send_light_req(&sd_nid, &hdr); if (ret) return EXIT_FAILURE; printf("Cluster recovery: disable\n"); return EXIT_SUCCESS; } static int cluster_enable_recover(int argc, char **argv) { int ret; struct sd_req hdr; sd_init_req(&hdr, SD_OP_ENABLE_RECOVER); ret = send_light_req(&sd_nid, &hdr); if (ret) return EXIT_FAILURE; printf("Cluster recovery: enable\n"); return EXIT_SUCCESS; } /* Subcommand list of recover */ static struct subcommand cluster_recover_cmd[] = { {"force", NULL, NULL, "force recover cluster immediately", NULL, 0, cluster_force_recover}, {"enable", NULL, NULL, "enable automatic recovery and " "run once recover if necessary", NULL, 0, cluster_enable_recover}, {"disable", NULL, NULL, "disable automatic recovery", NULL, 0, cluster_disable_recover}, {NULL}, }; static int cluster_recover(int argc, char **argv) { return do_generic_subcommand(cluster_recover_cmd, argc, argv); } /* Subcommand list of snapshot */ static struct subcommand cluster_snapshot_cmd[] = { {"save", NULL, "h", "save snapshot to localpath", NULL, CMD_NEED_ARG|CMD_NEED_NODELIST, save_snapshot, NULL}, {"list", NULL, "h", "list snapshot of localpath", NULL, CMD_NEED_ARG, list_snapshot, NULL}, {"load", NULL, "h", "load snapshot from localpath", NULL, CMD_NEED_ARG, load_snapshot, NULL}, {NULL}, }; static int cluster_snapshot(int argc, char **argv) { return do_generic_subcommand(cluster_snapshot_cmd, argc, argv); } static int cluster_reweight(int argc, char **argv) { int ret; struct sd_req hdr; sd_init_req(&hdr, SD_OP_REWEIGHT); ret = send_light_req(&sd_nid, &hdr); if (ret) return EXIT_FAILURE; return EXIT_SUCCESS; } static void cluster_check_cb(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *inode, void *data) { if (vdi_is_snapshot(inode)) printf("fix snapshot %s (id: %d, tag: \"%s\")\n", name, snapid, tag); else 
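/* not a snapshot: this is the current, writable vdi */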
printf("fix vdi %s\n", name); do_vdi_check(inode); } static int cluster_check(int argc, char **argv) { if (parse_vdi(cluster_check_cb, SD_INODE_SIZE, NULL) < 0) return EXIT_SYSFAIL; return EXIT_SUCCESS; } static struct subcommand cluster_cmd[] = { {"info", NULL, "aprhs", "show cluster information", NULL, CMD_NEED_NODELIST, cluster_info, cluster_options}, {"format", NULL, "bctaph", "create a Sheepdog store", NULL, CMD_NEED_NODELIST, cluster_format, cluster_options}, {"shutdown", NULL, "aph", "stop Sheepdog", NULL, 0, cluster_shutdown, cluster_options}, {"snapshot", " ", "aph", "snapshot/restore the cluster", cluster_snapshot_cmd, CMD_NEED_ARG, cluster_snapshot, cluster_options}, {"recover", NULL, "afph", "See 'dog cluster recover' for more information", cluster_recover_cmd, CMD_NEED_ARG, cluster_recover, cluster_options}, {"reweight", NULL, "aph", "reweight the cluster", NULL, 0, cluster_reweight, cluster_options}, {"check", NULL, "aph", "check and repair cluster", NULL, CMD_NEED_NODELIST, cluster_check, cluster_options}, {NULL,}, }; static int cluster_parser(int ch, const char *opt) { switch (ch) { case 'b': pstrcpy(cluster_cmd_data.name, sizeof(cluster_cmd_data.name), opt); break; case 'c': cluster_cmd_data.copies = parse_copy(opt, &cluster_cmd_data.copy_policy); if (!cluster_cmd_data.copies) { sd_err("Invalid parameter %s\n" "To create replicated vdi, set -c x\n" " x(1 to %d) - number of replicated copies\n" "To create erasure coded vdi, set -c x:y\n" " x(2,4,8,16) - number of data strips\n" " y(1 to 15) - number of parity strips", opt, SD_MAX_COPIES); exit(EXIT_FAILURE); } break; case 'f': cluster_cmd_data.force = true; break; case 's': cluster_cmd_data.show_store = true; break; case 't': cluster_cmd_data.strict = true; break; } return 0; } struct command cluster_command = { "cluster", cluster_cmd, cluster_parser }; sheepdog-0.8.3/dog/common.c000066400000000000000000000244431237656255000155470ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "dog.h" #include "sha1.h" #include "sockfd_cache.h" #include "fec.h" char *strnumber_raw(uint64_t _size, bool raw) { const char *units[] = {"MB", "GB", "TB", "PB", "EB", "ZB", "YB"}; static __thread struct size_str { char str[UINT64_DECIMAL_SIZE]; } s[1024]; /* Is this big enough ? 
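	 * s[] is a ring of 1024 thread-local slots (j wraps around below),
	 * so up to 1024 strings returned by strnumber() stay valid at a
	 * time.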
*/ static int j; int i = 0; double size; char *ret; if (raw) { snprintf(s[j].str, UINT64_DECIMAL_SIZE, "%" PRIu64, _size); goto out; } size = (double)_size; size /= 1024 * 1024; while (i < ARRAY_SIZE(units) - 1 && size >= 1024) { i++; size /= 1024; } if (size >= 10) snprintf(s[j].str, UINT64_DECIMAL_SIZE, "%.0lf %s", size, units[i]); else snprintf(s[j].str, UINT64_DECIMAL_SIZE, "%.1lf %s", size, units[i]); out: ret = s[j++].str; if (j == 1024) j = 0; return ret; } char *strnumber(uint64_t size) { return strnumber_raw(size, raw_output); } int dog_read_object(uint64_t oid, void *data, unsigned int datalen, uint64_t offset, bool direct) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; sd_init_req(&hdr, SD_OP_READ_OBJ); hdr.data_length = datalen; hdr.obj.oid = oid; hdr.obj.offset = offset; if (direct) hdr.flags |= SD_FLAG_CMD_DIRECT; ret = dog_exec_req(&sd_nid, &hdr, data); if (ret < 0) { sd_err("Failed to read object %" PRIx64, oid); return SD_RES_EIO; } if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to read object %" PRIx64 " %s", oid, sd_strerror(rsp->result)); return rsp->result; } return SD_RES_SUCCESS; } int dog_write_object(uint64_t oid, uint64_t cow_oid, void *data, unsigned int datalen, uint64_t offset, uint32_t flags, uint8_t copies, uint8_t copy_policy, bool create, bool direct) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; if (create) sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); else sd_init_req(&hdr, SD_OP_WRITE_OBJ); hdr.data_length = datalen; hdr.flags = flags | SD_FLAG_CMD_WRITE; if (cow_oid) hdr.flags |= SD_FLAG_CMD_COW; if (direct) hdr.flags |= SD_FLAG_CMD_DIRECT; hdr.obj.copies = copies; hdr.obj.copy_policy = copy_policy; hdr.obj.oid = oid; hdr.obj.cow_oid = cow_oid; hdr.obj.offset = offset; ret = dog_exec_req(&sd_nid, &hdr, data); if (ret < 0) { sd_err("Failed to write object %" PRIx64, oid); return SD_RES_EIO; } if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to write object %" PRIx64 ": %s", oid, sd_strerror(rsp->result)); return rsp->result; } return SD_RES_SUCCESS; } #define FOR_EACH_VDI(nr, vdis) FOR_EACH_BIT(nr, vdis, SD_NR_VDIS) int parse_vdi(vdi_parser_func_t func, size_t size, void *data) { int ret; unsigned long nr; static struct sd_inode i; struct sd_req req; struct sd_rsp *rsp = (struct sd_rsp *)&req; static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); uint32_t rlen; sd_init_req(&req, SD_OP_READ_VDIS); req.data_length = sizeof(vdi_inuse); ret = dog_exec_req(&sd_nid, &req, vdi_inuse); if (ret < 0) goto out; if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); goto out; } FOR_EACH_VDI(nr, vdi_inuse) { uint64_t oid; uint32_t snapid; oid = vid_to_vdi_oid(nr); /* for B-tree inode, we also need sd_extent_header */ ret = dog_read_object(oid, &i, SD_INODE_HEADER_SIZE + sizeof(struct sd_extent_header), 0, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read inode header"); continue; } if (i.name[0] == '\0') /* this VDI has been deleted */ continue; if (size > SD_INODE_HEADER_SIZE) { rlen = sd_inode_get_meta_size(&i, size); ret = dog_read_object(oid, ((char *)&i) + SD_INODE_HEADER_SIZE, rlen, SD_INODE_HEADER_SIZE, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read inode"); continue; } } snapid = vdi_is_snapshot(&i) ? 
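/* snapshots report their snap_id; the current vdi reports 0 */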
i.snap_id : 0; func(i.vdi_id, i.name, i.tag, snapid, 0, &i, data); } out: return ret; } int dog_exec_req(const struct node_id *nid, struct sd_req *hdr, void *buf) { struct sockfd *sfd; int ret; sfd = sockfd_cache_get(nid); if (!sfd) return -1; /* * Retry forever for dog because * 1. We can't get the newest epoch * 2. Some operations might take unexpected long time */ ret = exec_req(sfd->fd, hdr, buf, NULL, 0, UINT32_MAX); sockfd_cache_put(nid, sfd); return ret ? -1 : 0; } /* Light request only contains header, without body content. */ int send_light_req(const struct node_id *nid, struct sd_req *hdr) { int ret = dog_exec_req(nid, hdr, NULL); struct sd_rsp *rsp = (struct sd_rsp *)hdr; if (ret == -1) return -1; if (rsp->result != SD_RES_SUCCESS) { sd_err("Response's result: %s", sd_strerror(rsp->result)); return -1; } return 0; } int subcmd_depth = -1; struct subcommand *subcmd_stack[MAX_SUBCMD_DEPTH]; int do_generic_subcommand(struct subcommand *sub, int argc, char **argv) { int i, ret; if (subcmd_depth + 1 == MAX_SUBCMD_DEPTH) { sd_err("Too deep netsted subcommands, " "please expand MAX_SUBCMD_DEPTH"); exit(EXIT_USAGE); } subcmd_stack[++subcmd_depth] = sub; for (i = 0; sub[i].name; i++) { unsigned long flags; if (strcmp(sub[i].name, argv[optind])) continue; flags = sub[i].flags; if (flags & CMD_NEED_NODELIST) { ret = update_node_list(SD_MAX_NODES); if (ret < 0) { sd_err("Failed to get node list"); exit(EXIT_SYSFAIL); } } if (flags & CMD_NEED_ARG && argc < 5 + subcmd_depth) subcommand_usage(argv[1], argv[2], EXIT_USAGE); optind++; ret = sub[i].fn(argc, argv); if (ret == EXIT_USAGE) subcommand_usage(argv[1], argv[2], EXIT_USAGE); return ret; } subcommand_usage(argv[1], argv[2], EXIT_FAILURE); subcmd_depth--; return EXIT_FAILURE; } void confirm(const char *message) { char input[8] = ""; char *ret; printf("%s", message); ret = fgets(input, sizeof(input), stdin); if (ret == NULL || strncasecmp(input, "yes", 3) != 0) exit(EXIT_SUCCESS); } void work_queue_wait(struct work_queue *q) { while (!work_queue_empty(q)) event_loop(-1); } #define DEFAULT_SCREEN_WIDTH 80 static int get_screen_width(void) { struct winsize wsz; if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &wsz) < 0) return DEFAULT_SCREEN_WIDTH; return wsz.ws_col; } /* * Show prograss bar as follows. 
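 * (the example below assumes done = 180 MB out of a 400 MB total)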
* * 45.0 % [===============> ] 180 MB / 400 MB */ void show_progress(uint64_t done, uint64_t total, bool raw) { int screen_width = get_screen_width(); int bar_length = screen_width - 30; char *buf; if (!is_stdout_console()) return; if (screen_width <= 0) return; printf("\r"); /* move to the beginning of the line */ buf = xmalloc(screen_width + 1); snprintf(buf, screen_width, "%5.1lf %% [", (double)done / total * 100); for (int i = 0; i < bar_length; i++) { if (total * (i + 1) / bar_length <= done) strcat(buf, "="); else if (total * i / bar_length <= done && done < total * (i + 1) / bar_length) strcat(buf, ">"); else strcat(buf, " "); } snprintf(buf + strlen(buf), screen_width - strlen(buf), "] %s / %s", strnumber_raw(done, raw), strnumber_raw(total, raw)); /* fill the rest of buffer with blank characters */ memset(buf + strlen(buf), ' ', screen_width - strlen(buf)); buf[screen_width] = '\0'; printf("%s", buf); if (done == total) printf("\n"); fflush(stdout); free(buf); } size_t get_store_objsize(uint8_t copy_policy, uint64_t oid) { if (is_vdi_obj(oid)) return SD_INODE_SIZE; if (is_vdi_btree_obj(oid)) return SD_INODE_DATA_INDEX_SIZE; if (copy_policy != 0) { int d; ec_policy_to_dp(copy_policy, &d, NULL); return SD_DATA_OBJ_SIZE / d; } return get_objsize(oid); } bool is_erasure_oid(uint64_t oid, uint8_t policy) { if (is_vdi_obj(oid)) return false; if (is_vdi_btree_obj(oid)) return false; if (policy == 0) return false; return true; } static const char * const loglevel_table[] = { "emerg", "alert", "crit", "err", "warning", "notice", "info", "debug", }; /* index is log level */ int do_loglevel_set(const struct node_id *nid, const char *loglevel_str) { int32_t loglevel = loglevel_str2num(loglevel_str); int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; if (loglevel == -1) return EXIT_USAGE; sd_init_req(&hdr, SD_OP_SET_LOGLEVEL); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = sizeof(loglevel); ret = dog_exec_req(nid, &hdr, &loglevel); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) return EXIT_FAILURE; return EXIT_SUCCESS; } int do_loglevel_get(const struct node_id *nid, int32_t *ret_loglevel) { int32_t loglevel = -1; int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; sd_init_req(&hdr, SD_OP_GET_LOGLEVEL); hdr.data_length = sizeof(loglevel); ret = dog_exec_req(nid, &hdr, &loglevel); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) return EXIT_FAILURE; *ret_loglevel = loglevel; return EXIT_SUCCESS; } const char *loglevel_to_str(int loglevel) { for (int i = 0; i < ARRAY_SIZE(loglevel_table); i++) { if (i == loglevel) return loglevel_table[i]; } return "unknown loglevel"; } void dump_loglevels(bool err) { for (int i = 0; i < ARRAY_SIZE(loglevel_table); i++) { if (err) sd_err("%s\t(%d)", loglevel_table[i], i); else sd_info("%s\t(%d)", loglevel_table[i], i); } } /* Return 0 to indicate ill str */ uint8_t parse_copy(const char *str, uint8_t *copy_policy) { char *n1, *n2; uint8_t copy, parity; char p[10]; strcpy(p, str); n1 = strtok(p, ":"); n2 = strtok(NULL, ":"); if ((!n1 || !is_numeric(n1)) || (n2 && !is_numeric(n2))) return 0; copy = strtol(n1, NULL, 10); if (copy > SD_MAX_COPIES) return 0; if (!n2) { *copy_policy = 0; return copy; } if (copy != 2 && copy != 4 && copy != 8 && copy != 16) return 0; parity = strtol(n2, NULL, 10); if (parity >= SD_EC_MAX_STRIP || parity == 0) return 0; /* * 4 bits for parity and 4 bits for data. 
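 * e.g. "-c 4:2" gives copy_policy = (4 / 2) << 4 | 2 = 0x22 and a
 * return value of 4 + 2 = 6.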
* We have to compress upper data bits because it can't represent 16 */ *copy_policy = ((copy / 2) << 4) + parity; copy = copy + parity; return copy; } sheepdog-0.8.3/dog/dog.c000066400000000000000000000265541237656255000150350ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include "sheepdog_proto.h" #include "sheep.h" #include "dog.h" #include "util.h" #include "sockfd_cache.h" #define EPOLL_SIZE 4096 static const char program_name[] = "dog"; struct node_id sd_nid = { /* default sdhost is "127.0.0.1" */ .addr = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 0, 0, 1 }, .port = SD_LISTEN_PORT, }; bool highlight = true; bool raw_output; bool verbose; static const struct sd_option dog_options[] = { /* common options for all dog commands */ {'a', "address", true, "specify the daemon address (default: localhost)"}, {'p', "port", true, "specify the daemon port"}, {'r', "raw", false, "raw output mode: omit headers, separate fields with\n" " single spaces and print all sizes in decimal bytes"}, {'v', "verbose", false, "print more information than default"}, {'h', "help", false, "display this help and exit"}, { 0, NULL, false, NULL }, }; static void usage(const struct command *commands, int status); uint32_t sd_epoch; int sd_nodes_nr; struct rb_root sd_vroot = RB_ROOT; struct rb_root sd_nroot = RB_ROOT; int sd_zones_nr; /* a number of zones never exceeds a number of nodes */ static uint32_t sd_zones[SD_MAX_NODES]; int update_node_list(int max_nodes) { int ret; unsigned int size; struct sd_node *buf = NULL; struct sd_node *ent; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; size = sizeof(*ent) * max_nodes; buf = xzalloc(size); sd_init_req(&hdr, SD_OP_GET_NODE_LIST); hdr.data_length = size; ret = dog_exec_req(&sd_nid, &hdr, buf); if (ret < 0) goto out; if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to update node list: %s", sd_strerror(rsp->result)); ret = -1; goto out; } size = rsp->data_length; sd_nodes_nr = size / sizeof(*ent); if (sd_nodes_nr == 0) { sd_err("There are no active sheep daemons"); exit(EXIT_FAILURE); } /* FIXME */ if (sd_nodes_nr > max_nodes) { ret = -1; goto out; } for (int i = 0; i < sd_nodes_nr; i++) { struct sd_node *n = xmalloc(sizeof(*n)); int j; *n = buf[i]; rb_insert(&sd_nroot, n, rb, node_cmp); for (j = 0; j < sd_zones_nr; j++) { if (sd_zones[j] == n->zone) break; } if (j == sd_zones_nr) sd_zones[sd_zones_nr++] = n->zone; } nodes_to_vnodes(&sd_nroot, &sd_vroot); sd_epoch = hdr.epoch; out: if (buf) free(buf); return ret; } static int (*command_parser)(int, const char *); static int (*command_fn)(int, char **); static const char *command_opts; static const char *command_arg; static const char *command_desc; static const struct sd_option *command_options; static const struct sd_option *find_opt(int ch) { const struct sd_option *opt; /* search for common options */ sd_for_each_option(opt, dog_options) { if (opt->ch == ch) return opt; } /* search for self options */ if (command_options) { sd_for_each_option(opt, command_options) { if (opt->ch == ch) return opt; } } sd_err("Internal error"); exit(EXIT_SYSFAIL); } static void init_commands(const struct command 
**commands) { static struct command *cmds; struct command command_list[] = { vdi_command, node_command, cluster_command, trace_command, {NULL,} }; if (!cmds) { cmds = (struct command *)xmalloc(sizeof(command_list)); memcpy(cmds, command_list, sizeof(command_list)); } *commands = cmds; return; } static const struct subcommand *find_subcmd(const char *cmd, const char *subcmd) { int i, j; const struct command *commands; const struct subcommand *sub; init_commands(&commands); for (i = 0; commands[i].name; i++) { if (!strcmp(commands[i].name, cmd)) { sub = commands[i].sub; for (j = 0; sub[j].name; j++) { if (!strcmp(sub[j].name, subcmd)) return &sub[j]; } } } return NULL; } static unsigned long setup_commands(const struct command *commands, const char *cmd, const char *subcmd) { int i; bool found = false; const struct subcommand *s; unsigned long flags = 0; for (i = 0; commands[i].name; i++) { if (!strcmp(commands[i].name, cmd)) { found = true; if (commands[i].parser) command_parser = commands[i].parser; break; } } if (!found) { if (cmd && strcmp(cmd, "help") && strcmp(cmd, "--help") && strcmp(cmd, "-h")) { sd_err("Invalid command '%s'", cmd); usage(commands, EXIT_USAGE); } usage(commands, 0); } for (s = commands[i].sub; subcmd && s->name; s++) { if (!strcmp(s->name, subcmd)) { command_fn = s->fn; command_opts = s->opts; command_arg = s->arg; command_desc = s->desc; command_options = s->options; flags = s->flags; break; } } if (!command_fn) { if (subcmd && strcmp(subcmd, "help") && strcmp(subcmd, "--help") && strcmp(subcmd, "-h")) sd_err("Invalid command '%s %s'", cmd, subcmd); sd_err("Available %s commands:", cmd); for (s = commands[i].sub; s->name; s++) sd_err(" %s %s", cmd, s->name); exit(EXIT_USAGE); } return flags; } static void usage(const struct command *commands, int status) { int i; const struct subcommand *s; char name[64]; if (status) sd_err("Try '%s --help' for more information.", program_name); else { printf("Sheepdog administrator utility (version %s)\n", PACKAGE_VERSION); printf("Usage: %s [options]\n", program_name); printf("\nAvailable commands:\n"); for (i = 0; commands[i].name; i++) { for (s = commands[i].sub; s->name; s++) { snprintf(name, sizeof(name), "%s %s", commands[i].name, s->name); printf(" %-24s%s\n", name, s->desc); } } printf("\n"); printf("For more information, run " "'%s --help'.\n", program_name); } exit(status); } void subcommand_usage(char *cmd, char *subcmd, int status) { int i, n, len = strlen(command_opts); const struct sd_option *sd_opt; const struct subcommand *sub, *subsub; char name[64]; printf("Usage: %s %s %s", program_name, cmd, subcmd); if (0 <= subcmd_depth) { for (i = 0; i < subcmd_depth + 1; i++) printf(" %s", subcmd_stack[i]->name); subsub = subcmd_stack[i - 1]->sub; } else { sub = find_subcmd(cmd, subcmd); subsub = sub->sub; } if (subsub) { n = 0; while (subsub[n].name) n++; if (n == 1) printf(" %s", subsub[0].name); else if (n > 1) { printf(" {%s", subsub[0].name); for (i = 1; i < n; i++) printf("|%s", subsub[i].name); printf("}"); } } for (i = 0; i < len; i++) { sd_opt = find_opt(command_opts[i]); if (sd_opt->has_arg) printf(" [-%c %s]", sd_opt->ch, sd_opt->name); else printf(" [-%c]", sd_opt->ch); } if (command_arg) printf(" %s", command_arg); printf("\n"); if (subsub) { printf("Available subcommands:\n"); for (i = 0; subsub[i].name; i++) printf(" %-24s%s\n", subsub[i].name, subsub[i].desc); } printf("Options:\n"); for (i = 0; i < len; i++) { sd_opt = find_opt(command_opts[i]); snprintf(name, sizeof(name), "-%c, --%s", sd_opt->ch, 
sd_opt->name); printf(" %-24s%s\n", name, sd_opt->desc); } exit(status); } static const struct sd_option *build_sd_options(const char *opts) { static struct sd_option sd_opts[256], *p; int i, len = strlen(opts); p = sd_opts; for (i = 0; i < len; i++) *p++ = *find_opt(opts[i]); memset(p, 0, sizeof(struct sd_option)); return sd_opts; } static void crash_handler(int signo) { sd_err("dog exits unexpectedly (%s).", strsignal(signo)); sd_backtrace(); /* * OOM raises SIGABRT in xmalloc but the administrator expects * that dog exits with EXIT_SYSFAIL. We have to give up * dumping a core file in this case. */ if (signo == SIGABRT) exit(EXIT_SYSFAIL); reraise_crash_signal(signo, EXIT_SYSFAIL); } static size_t get_nr_nodes(void) { return sd_nodes_nr; } static void log_dog_operation(int argc, char **argv) { int length = 0, printed = 0; char *msg; const char *custom_log_path; if (!getenv("SHEEPDOG_DOG_LOG")) /* don't log operation of dog */ return; for (int i = 0; i < argc; i++) length += 1 + strlen(argv[i]); /* 1 is for space */ length++; /* 1 is for '\0' */ msg = xcalloc(length, sizeof(char)); for (int i = 0; i < argc; i++) printed += snprintf(msg + printed, length - printed, " %s", argv[i]); custom_log_path = getenv("SHEEPDOG_DOG_LOG_PATH"); if (custom_log_path) { struct timeval tv; struct tm tm; char time_str[256]; int fd; fd = open(custom_log_path, O_WRONLY | O_APPEND | O_CREAT, S_IRUSR | S_IWUSR); if (fd < 0) { fprintf(stderr, "error at opening log file of dog" "(%s): %m\n", custom_log_path); goto out; } gettimeofday(&tv, NULL); localtime_r(&tv.tv_sec, &tm); strftime(time_str, sizeof(time_str), "%Y %b %2d %H:%M:%S ", &tm); dprintf(fd, "%s: %s\n", time_str, msg); close(fd); } else { /* if the path is not specified, we use standard syslog */ openlog("sheepdog admin operation", LOG_PID, LOG_USER); syslog(LOG_INFO, "%s\n", msg); closelog(); } out: free(msg); } int main(int argc, char **argv) { int ch, longindex, ret; unsigned long flags; struct option *long_options; const struct command *commands; const char *short_options; char *p; const struct sd_option *sd_opts; uint8_t sdhost[16]; int sdport; log_dog_operation(argc, argv); install_crash_handler(crash_handler); init_commands(&commands); if (argc < 2) usage(commands, 0); flags = setup_commands(commands, argv[1], argv[2]); optind = 3; sd_opts = build_sd_options(command_opts); long_options = build_long_options(sd_opts); short_options = build_short_options(sd_opts); while ((ch = getopt_long(argc, argv, short_options, long_options, &longindex)) >= 0) { switch (ch) { case 'a': if (!str_to_addr(optarg, sdhost)) { sd_err("Invalid ip address %s", optarg); return EXIT_FAILURE; } memcpy(sd_nid.addr, sdhost, sizeof(sdhost)); break; case 'p': sdport = strtol(optarg, &p, 10); if (optarg == p || sdport < 1 || sdport > UINT16_MAX) { sd_err("Invalid port number '%s'", optarg); exit(EXIT_USAGE); } sd_nid.port = sdport; break; case 'r': raw_output = true; break; case 'v': verbose = true; break; case 'h': subcommand_usage(argv[1], argv[2], EXIT_SUCCESS); break; case '?': usage(commands, EXIT_USAGE); break; default: if (command_parser) command_parser(ch, optarg); else usage(commands, EXIT_USAGE); break; } } if (!is_stdout_console() || raw_output) highlight = false; if (flags & CMD_NEED_NODELIST) { ret = update_node_list(SD_MAX_NODES); if (ret < 0) { sd_err("Failed to get node list"); exit(EXIT_SYSFAIL); } } if (flags & CMD_NEED_ARG && argc == optind) subcommand_usage(argv[1], argv[2], EXIT_USAGE); if (init_event(EPOLL_SIZE) < 0) exit(EXIT_SYSFAIL); if 
(init_work_queue(get_nr_nodes) != 0) { sd_err("Failed to init work queue"); exit(EXIT_SYSFAIL); } if (sockfd_init()) { sd_err("sockfd_init() failed"); exit(EXIT_SYSFAIL); } ret = command_fn(argc, argv); if (ret == EXIT_USAGE) subcommand_usage(argv[1], argv[2], EXIT_USAGE); return ret; } sheepdog-0.8.3/dog/dog.h000066400000000000000000000075021237656255000150320ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __DOG_H__ #define __DOG_H__ #include #include #include #include #include #include #include "sheepdog_proto.h" #include "sheep.h" #include "exits.h" #include "option.h" #include "work.h" #include "event.h" #include "config.h" #define CMD_NEED_NODELIST (1 << 0) #define CMD_NEED_ARG (1 << 1) #define UINT64_DECIMAL_SIZE 21 struct command { const char *name; const struct subcommand *sub; int (*parser)(int, const char *); }; struct subcommand { const char *name; const char *arg; const char *opts; const char *desc; const struct subcommand *sub; unsigned long flags; int (*fn)(int, char **); const struct sd_option *options; }; void subcommand_usage(char *cmd, char *subcmd, int status); #define MAX_SUBCMD_DEPTH 8 extern int subcmd_depth; extern struct subcommand *subcmd_stack[MAX_SUBCMD_DEPTH]; extern struct node_id sd_nid; extern bool highlight; extern bool raw_output; extern bool verbose; extern uint32_t sd_epoch; extern struct rb_root sd_vroot; extern struct rb_root sd_nroot; extern int sd_nodes_nr; extern int sd_zones_nr; bool is_current(const struct sd_inode *i); char *strnumber(uint64_t _size); char *strnumber_raw(uint64_t _size, bool raw); typedef void (*vdi_parser_func_t)(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data); int parse_vdi(vdi_parser_func_t func, size_t size, void *data); int dog_read_object(uint64_t oid, void *data, unsigned int datalen, uint64_t offset, bool direct); int dog_write_object(uint64_t oid, uint64_t cow_oid, void *data, unsigned int datalen, uint64_t offset, uint32_t flags, uint8_t copies, uint8_t, bool create, bool direct); int dog_exec_req(const struct node_id *, struct sd_req *hdr, void *data); int send_light_req(const struct node_id *, struct sd_req *hdr); int do_generic_subcommand(struct subcommand *sub, int argc, char **argv); int update_node_list(int max_nodes); void confirm(const char *message); void work_queue_wait(struct work_queue *q); int do_vdi_create(const char *vdiname, int64_t vdi_size, uint32_t base_vid, uint32_t *vdi_id, bool snapshot, uint8_t nr_copies, uint8_t copy_policy, uint8_t store_policy); int do_vdi_check(const struct sd_inode *inode); void show_progress(uint64_t done, uint64_t total, bool raw); size_t get_store_objsize(uint8_t copy_policy, uint64_t oid); bool is_erasure_oid(uint64_t oid, uint8_t policy); uint8_t parse_copy(const char *str, uint8_t *copy_policy); int dog_bnode_writer(uint64_t oid, void *mem, unsigned int len, uint64_t offset, uint32_t flags, int copies, int copy_policy, bool create, bool direct); int dog_bnode_reader(uint64_t oid, void **mem, unsigned int len, uint64_t offset); #define INODE_GET_VID(inode, idx) (sd_inode_get_vid(dog_bnode_reader, \ inode, idx)) #define INODE_SET_VID(inode, idx, 
vdi_id) (sd_inode_set_vid(dog_bnode_writer, \ dog_bnode_reader, inode, idx, vdi_id)) extern struct command vdi_command; extern struct command node_command; extern struct command cluster_command; #ifdef HAVE_TRACE extern struct command trace_command; #else #define trace_command {} #endif /* HAVE_TRACE */ int do_loglevel_set(const struct node_id *nid, const char *loglevel_str); int do_loglevel_get(const struct node_id *nid, int32_t *ret_loglevel); const char *loglevel_to_str(int loglevel); void dump_loglevels(bool err); #endif sheepdog-0.8.3/dog/farm/000077500000000000000000000000001237656255000150315ustar00rootroot00000000000000sheepdog-0.8.3/dog/farm/farm.c000066400000000000000000000235161237656255000161310ustar00rootroot00000000000000/* * Copyright (C) 2011 Taobao Inc. * Copyright (C) 2013 Zelin.io * * Liu Yuan * Kai Zhang * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "farm.h" #include "rbtree.h" static char farm_object_dir[PATH_MAX]; static char farm_dir[PATH_MAX]; static struct sd_rw_lock active_vdi_lock = SD_RW_LOCK_INITIALIZER; static struct sd_rw_lock registered_vdi_lock = SD_RW_LOCK_INITIALIZER; struct registered_vdi_entry { struct rb_node rb; uint32_t vid; }; struct active_vdi_entry { struct rb_node rb; char name[SD_MAX_VDI_LEN]; uint64_t vdi_size; uint32_t vdi_id; uint32_t snap_id; uint8_t nr_copies; uint8_t copy_policy; uint8_t store_policy; }; /* We use active_vdi_tree to create active vdi on top of the snapshot chain */ static struct rb_root active_vdi_tree = RB_ROOT; /* We have to register vdi information first before loading objects */ static struct rb_root registered_vdi_tree = RB_ROOT; struct snapshot_work { struct trunk_entry entry; struct strbuf *trunk_buf; struct work work; }; static struct work_queue *wq; static uatomic_bool work_error; static int vdi_cmp(const struct active_vdi_entry *e1, const struct active_vdi_entry *e2) { return strcmp(e1->name, e2->name); } static void update_active_vdi_entry(struct active_vdi_entry *vdi, struct sd_inode *new) { pstrcpy(vdi->name, sizeof(vdi->name), new->name); vdi->vdi_size = new->vdi_size; vdi->vdi_id = new->vdi_id; vdi->snap_id = new->snap_id; vdi->nr_copies = new->nr_copies; vdi->copy_policy = new->copy_policy; vdi->store_policy = new->store_policy; } static void add_active_vdi(struct sd_inode *new) { struct active_vdi_entry *vdi, *ret; vdi = xmalloc(sizeof(struct active_vdi_entry)); update_active_vdi_entry(vdi, new); sd_write_lock(&active_vdi_lock); ret = rb_insert(&active_vdi_tree, vdi, rb, vdi_cmp); if (ret && ret->snap_id < new->snap_id) { update_active_vdi_entry(ret, new); free(vdi); } sd_rw_unlock(&active_vdi_lock); } static int registered_vdi_cmp(struct registered_vdi_entry *a, struct registered_vdi_entry *b) { return intcmp(a->vid, b->vid); } static bool register_vdi(uint32_t vid) { struct registered_vdi_entry *new = xmalloc(sizeof(*new)), *ret; new->vid = vid; sd_read_lock(&registered_vdi_lock); ret = rb_search(&registered_vdi_tree, new, rb, registered_vdi_cmp); sd_rw_unlock(&registered_vdi_lock); if (ret) { free(new); return false; } sd_write_lock(&registered_vdi_lock); rb_insert(&registered_vdi_tree, new, rb, registered_vdi_cmp); sd_rw_unlock(&registered_vdi_lock); return true; } static int create_active_vdis(void) { struct active_vdi_entry *vdi; uint32_t new_vid;
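/* Replay every entry collected in active_vdi_tree while the snapshot objects were being loaded: do_vdi_create() is called once per entry, so each working VDI is recreated on top of the snapshot chain that farm_load_snapshot() restored. */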
rb_for_each_entry(vdi, &active_vdi_tree, rb) { if (do_vdi_create(vdi->name, vdi->vdi_size, vdi->vdi_id, &new_vid, false, vdi->nr_copies, vdi->copy_policy, vdi->store_policy) < 0) return -1; } return 0; } char *get_object_directory(void) { return farm_object_dir; } static int create_directory(const char *p) { int ret = -1; struct strbuf buf = STRBUF_INIT; strbuf_addstr(&buf, p); if (xmkdir(buf.buf, 0755) < 0) { if (errno == EEXIST) sd_err("Path is not a directory: %s", p); goto out; } if (!strlen(farm_dir)) strbuf_copyout(&buf, farm_dir, sizeof(farm_dir)); strbuf_addstr(&buf, "/objects"); if (xmkdir(buf.buf, 0755) < 0) goto out; for (int i = 0; i < 256; i++) { strbuf_addf(&buf, "/%02x", i); if (xmkdir(buf.buf, 0755) < 0) goto out; strbuf_remove(&buf, buf.len - 3, 3); } if (!strlen(farm_object_dir)) strbuf_copyout(&buf, farm_object_dir, sizeof(farm_object_dir)); ret = 0; out: if (ret) sd_err("Fail to create directory: %m"); strbuf_release(&buf); return ret; } static int get_trunk_sha1(uint32_t idx, const char *tag, unsigned char *outsha1) { int nr_logs = -1, ret = -1; struct snap_log *log_buf, *log_free = NULL; struct snap_file *snap_buf = NULL; log_free = log_buf = snap_log_read(&nr_logs); if (nr_logs < 0) goto out; for (int i = 0; i < nr_logs; i++, log_buf++) { if (log_buf->idx != idx && strcmp(log_buf->tag, tag)) continue; snap_buf = snap_file_read(log_buf->sha1); if (!snap_buf) goto out; memcpy(outsha1, snap_buf->trunk_sha1, SHA1_DIGEST_SIZE); ret = 0; goto out; } out: free(log_free); free(snap_buf); return ret; } static int notify_vdi_add(uint32_t vdi_id, uint8_t nr_copies, uint8_t copy_policy) { int ret = -1; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char *buf = NULL; sd_init_req(&hdr, SD_OP_NOTIFY_VDI_ADD); hdr.vdi_state.new_vid = vdi_id; hdr.vdi_state.copies = nr_copies; hdr.vdi_state.copy_policy = copy_policy; hdr.vdi_state.set_bitmap = true; ret = dog_exec_req(&sd_nid, &hdr, buf); if (ret < 0) sd_err("Fail to notify vdi add event(%"PRIx32", %d)", vdi_id, nr_copies); if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); ret = -1; } free(buf); return ret; } int farm_init(const char *path) { int ret = -1; if (create_directory(path) < 0) goto out; if (snap_init(farm_dir) < 0) goto out; return 0; out: if (ret) sd_err("Fail to init farm."); return ret; } bool farm_contain_snapshot(uint32_t idx, const char *tag) { unsigned char trunk_sha1[SHA1_DIGEST_SIZE]; return (get_trunk_sha1(idx, tag, trunk_sha1) == 0); } static void do_save_object(struct work *work) { void *buf; size_t size; struct snapshot_work *sw; if (uatomic_is_true(&work_error)) return; sw = container_of(work, struct snapshot_work, work); size = get_objsize(sw->entry.oid); buf = xmalloc(size); if (dog_read_object(sw->entry.oid, buf, size, 0, true) < 0) goto error; if (slice_write(buf, size, sw->entry.sha1) < 0) goto error; free(buf); return; error: free(buf); sd_err("Fail to save object, oid %"PRIx64, sw->entry.oid); uatomic_set_true(&work_error); } static void farm_show_progress(uint64_t done, uint64_t total) { return show_progress(done, total, true); } static void save_object_done(struct work *work) { struct snapshot_work *sw = container_of(work, struct snapshot_work, work); static unsigned long saved; if (uatomic_is_true(&work_error)) goto out; strbuf_add(sw->trunk_buf, &sw->entry, sizeof(struct trunk_entry)); farm_show_progress(uatomic_add_return(&saved, 1), object_tree_size()); out: free(sw); } static int queue_save_snapshot_work(uint64_t oid, uint32_t nr_copies, uint8_t 
copy_policy, void *data) { struct snapshot_work *sw = xzalloc(sizeof(struct snapshot_work)); struct strbuf *trunk_buf = data; sw->entry.oid = oid; sw->entry.nr_copies = nr_copies; sw->entry.copy_policy = copy_policy; sw->trunk_buf = trunk_buf; sw->work.fn = do_save_object; sw->work.done = save_object_done; queue_work(wq, &sw->work); return 0; } int farm_save_snapshot(const char *tag) { unsigned char snap_sha1[SHA1_DIGEST_SIZE]; unsigned char trunk_sha1[SHA1_DIGEST_SIZE]; struct strbuf trunk_buf; void *snap_log = NULL; int log_nr, idx, ret = -1; uint64_t nr_objects = object_tree_size(); snap_log = snap_log_read(&log_nr); if (!snap_log) goto out; idx = log_nr + 1; strbuf_init(&trunk_buf, sizeof(struct trunk_entry) * nr_objects); wq = create_work_queue("save snapshot", WQ_ORDERED); if (for_each_object_in_tree(queue_save_snapshot_work, &trunk_buf) < 0) goto out; work_queue_wait(wq); if (uatomic_is_true(&work_error)) goto out; if (trunk_file_write(nr_objects, (struct trunk_entry *)trunk_buf.buf, trunk_sha1) < 0) goto out; if (snap_file_write(idx, trunk_sha1, snap_sha1) < 0) goto out; if (snap_log_write(idx, tag, snap_sha1) < 0) goto out; ret = 0; out: strbuf_release(&trunk_buf); free(snap_log); return ret; } static void do_load_object(struct work *work) { void *buffer = NULL; size_t size; struct snapshot_work *sw; static unsigned long loaded; uint32_t vid; if (uatomic_is_true(&work_error)) return; sw = container_of(work, struct snapshot_work, work); buffer = slice_read(sw->entry.sha1, &size); if (!buffer) goto error; vid = oid_to_vid(sw->entry.oid); if (register_vdi(vid)) { if (notify_vdi_add(vid, sw->entry.nr_copies, sw->entry.copy_policy) < 0) goto error; } if (dog_write_object(sw->entry.oid, 0, buffer, size, 0, 0, sw->entry.nr_copies, sw->entry.copy_policy, true, true) != 0) goto error; if (is_vdi_obj(sw->entry.oid)) add_active_vdi(buffer); farm_show_progress(uatomic_add_return(&loaded, 1), trunk_get_count()); free(buffer); return; error: free(buffer); sd_err("Fail to load object, oid %"PRIx64, sw->entry.oid); uatomic_set_true(&work_error); } static void load_object_done(struct work *work) { struct snapshot_work *sw = container_of(work, struct snapshot_work, work); free(sw); } static int queue_load_snapshot_work(struct trunk_entry *entry, void *data) { struct snapshot_work *sw = xzalloc(sizeof(struct snapshot_work)); memcpy(&sw->entry, entry, sizeof(struct trunk_entry)); sw->work.fn = do_load_object; sw->work.done = load_object_done; queue_work(wq, &sw->work); return 0; } int farm_load_snapshot(uint32_t idx, const char *tag) { int ret = -1; unsigned char trunk_sha1[SHA1_DIGEST_SIZE]; if (get_trunk_sha1(idx, tag, trunk_sha1) < 0) goto out; wq = create_work_queue("load snapshot", WQ_DYNAMIC); if (for_each_entry_in_trunk(trunk_sha1, queue_load_snapshot_work, NULL) < 0) goto out; work_queue_wait(wq); if (uatomic_is_true(&work_error)) goto out; if (create_active_vdis() < 0) goto out; ret = 0; out: rb_destroy(&active_vdi_tree, struct active_vdi_entry, rb); rb_destroy(®istered_vdi_tree, struct registered_vdi_entry, rb); return ret; } sheepdog-0.8.3/dog/farm/farm.h000066400000000000000000000042211237656255000161260ustar00rootroot00000000000000#ifndef FARM_H #define FARM_H #include #include #include #include #include #include #include #include #include #include #include #include "dog.h" #include "sheep.h" #include "strbuf.h" #include "sha1.h" struct trunk_entry { uint64_t oid; uint8_t nr_copies; uint8_t copy_policy; uint8_t reserved[2]; unsigned char sha1[SHA1_DIGEST_SIZE]; }; struct trunk_file { 
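/* in-memory view of one trunk object: an array of 'nr_entries' trunk_entry records decoded from a sha1 file by trunk_file_read() in trunk.c */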
uint64_t nr_entries; struct trunk_entry *entries; }; struct snap_file { int idx; unsigned char trunk_sha1[SHA1_DIGEST_SIZE]; }; struct snap_log { uint32_t idx; char tag[SD_MAX_SNAPSHOT_TAG_LEN]; uint64_t time; unsigned char sha1[SHA1_DIGEST_SIZE]; }; /* farm.c */ int farm_init(const char *path); bool farm_contain_snapshot(uint32_t idx, const char *tag); int farm_save_snapshot(const char *tag); int farm_load_snapshot(uint32_t idx, const char *tag); char *get_object_directory(void); /* trunk.c */ int trunk_init(void); int trunk_file_write(uint64_t nr_entries, struct trunk_entry *entries, unsigned char *trunk_sha1); int for_each_entry_in_trunk(unsigned char *trunk_sha1, int (*func)(struct trunk_entry *entry, void *data), void *data); uint64_t trunk_get_count(void); /* snap.c */ int snap_init(const char *path); struct snap_file *snap_file_read(unsigned char *sha1); int snap_file_write(uint32_t idx, unsigned char *trunk_sha1, unsigned char *outsha1); void *snap_log_read(int *out_nr); int snap_log_write(uint32_t idx, const char *tag, unsigned char *sha1); /* sha1_file.c */ int sha1_file_write(void *buf, size_t len, unsigned char *sha1); void *sha1_file_read(const unsigned char *sha1, size_t *size); /* object_tree.c */ int object_tree_size(void); void object_tree_insert(uint64_t oid, uint32_t nr_copies, uint8_t); void object_tree_free(void); void object_tree_print(void); int for_each_object_in_tree(int (*func)(uint64_t oid, uint32_t nr_copies, uint8_t, void *data), void *data); /* slice.c */ int slice_write(void *buf, size_t len, unsigned char *outsha1); void *slice_read(const unsigned char *sha1, size_t *outsize); #endif sheepdog-0.8.3/dog/farm/object_tree.c000066400000000000000000000042461237656255000174700ustar00rootroot00000000000000/* * Copyright (C) 2013 Zelin.io * * Kai Zhang * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "farm.h" #include "rbtree.h" struct object_tree_entry { uint64_t oid; uint8_t nr_copies; uint8_t copy_policy; struct rb_node node; }; struct object_tree { int nr_objs; struct rb_root root; }; static struct object_tree tree = { .nr_objs = 0, .root = RB_ROOT, }; static struct object_tree_entry *cached_entry; static int object_tree_cmp(const struct object_tree_entry *a, const struct object_tree_entry *b) { return intcmp(a->oid, b->oid); } static struct object_tree_entry *do_insert(struct rb_root *root, struct object_tree_entry *new) { return rb_insert(root, new, node, object_tree_cmp); } void object_tree_insert(uint64_t oid, uint32_t nr_copies, uint8_t copy_policy) { struct rb_root *root = &tree.root; struct object_tree_entry *p = NULL; if (!cached_entry) cached_entry = xzalloc(sizeof(*cached_entry)); cached_entry->oid = oid; cached_entry->nr_copies = nr_copies; cached_entry->copy_policy = copy_policy; rb_init_node(&cached_entry->node); p = do_insert(root, cached_entry); if (!p) { tree.nr_objs++; cached_entry = NULL; } } void object_tree_print(void) { struct object_tree_entry *entry; printf("nr_objs: %d\n", tree.nr_objs); rb_for_each_entry(entry, &tree.root, node) printf("Obj id: %"PRIx64"\n", entry->oid); } void object_tree_free(void) { rb_destroy(&tree.root, struct object_tree_entry, node); free(cached_entry); } int object_tree_size(void) { return tree.nr_objs; } int for_each_object_in_tree(int (*func)(uint64_t oid, uint32_t nr_copies, uint8_t copy_policy, void *data), void *data) { struct object_tree_entry *entry; int ret = -1; rb_for_each_entry(entry, &tree.root, node) { if (func(entry->oid, entry->nr_copies, entry->copy_policy, data) < 0) goto out; } ret = 0; out: return ret; } sheepdog-0.8.3/dog/farm/sha1_file.c000066400000000000000000000063431237656255000170360ustar00rootroot00000000000000/* * Copyright (C) 2011 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* * sha1_file provide us some useful features: * * - Regardless of object type, all objects are all in deflated with zlib, * and have a header that not only specifies their tag, but also size * information about the data in the object. * * - the general consistency of an object can always be tested independently * of the contents or the type of the object: all objects can be validated * by verifying that their hashes match the content of the file. 
*/ #include #include "farm.h" #include "util.h" static void fill_sha1_path(char *pathbuf, const unsigned char *sha1) { int i; for (i = 0; i < SHA1_DIGEST_SIZE; i++) { static const char hex[] = "0123456789abcdef"; unsigned int val = sha1[i]; char *pos = pathbuf + i*2 + (i > 0); *pos++ = hex[val >> 4]; *pos = hex[val & 0xf]; } } static char *sha1_to_path(const unsigned char *sha1) { static __thread char buf[PATH_MAX]; const char *objdir; int len; objdir = get_object_directory(); len = strlen(objdir); /* '/' + sha1(2) + '/' + sha1(38) + '\0' */ memcpy(buf, objdir, len); buf[len] = '/'; buf[len+3] = '/'; buf[len+42] = '\0'; fill_sha1_path(buf + len + 1, sha1); return buf; } static int sha1_buffer_write(const unsigned char *sha1, void *buf, unsigned int size) { char *filename = sha1_to_path(sha1); int fd, ret = 0, len; fd = open(filename, O_WRONLY | O_CREAT | O_EXCL, 0666); if (fd < 0) { if (errno != EEXIST) { sd_err("failed to open file %s with error: %m", filename); ret = -1; } goto err_open; } len = xwrite(fd, buf, size); if (len != size) { sd_err("%m"); close(fd); return -1; } close(fd); err_open: return ret; } int sha1_file_write(void *buf, size_t len, unsigned char *outsha1) { unsigned char sha1[SHA1_DIGEST_SIZE]; get_buffer_sha1(buf, len, sha1); if (sha1_buffer_write(sha1, buf, len) < 0) return -1; if (outsha1) memcpy(outsha1, sha1, SHA1_DIGEST_SIZE); return 0; } static int verify_sha1_file(const unsigned char *sha1, void *buf, unsigned long len) { unsigned char tmp[SHA1_DIGEST_SIZE]; get_buffer_sha1(buf, len, tmp); if (memcmp((char *)tmp, (char *)sha1, SHA1_DIGEST_SIZE) != 0) { sd_err("failed, %s != %s", sha1_to_hex(sha1), sha1_to_hex(tmp)); return -1; } return 0; } void *sha1_file_read(const unsigned char *sha1, size_t *size) { char *filename = sha1_to_path(sha1); int fd = open(filename, O_RDONLY); struct stat st; void *buf = NULL; if (fd < 0) { perror(filename); return NULL; } if (fstat(fd, &st) < 0) { sd_err("%m"); goto out; } buf = xmalloc(st.st_size); if (!buf) goto out; if (xread(fd, buf, st.st_size) != st.st_size) { free(buf); buf = NULL; goto out; } if (verify_sha1_file(sha1, buf, st.st_size) < 0) { free(buf); buf = NULL; goto out; } *size = st.st_size; out: close(fd); return buf; } sheepdog-0.8.3/dog/farm/slice.c000066400000000000000000000047031237656255000163000ustar00rootroot00000000000000/* * copyright (c) 2013 taobao inc. * * liu yuan * * this program is free software; you can redistribute it and/or * modify it under the terms of the gnu general public license version * 2 as published by the free software foundation. * * you should have received a copy of the gnu general public license * along with this program. if not, see . */ /* * Slice is a fixed chunk of one object to be stored in farm. We slice * the object into smaller chunks to get better deduplication. */ #include #include #include #include "farm.h" #include "strbuf.h" #include "util.h" #include "sheepdog_proto.h" struct slice { unsigned char sha1[SHA1_DIGEST_SIZE]; }; struct slice_file { uint32_t nr_slices; struct slice *slices; }; /* 128k, best empirical value from some tests, but no rationale */ #define SLICE_SIZE (1024*128) int slice_write(void *buf, size_t len, unsigned char *outsha1) { int count = DIV_ROUND_UP(len, SLICE_SIZE); size_t slen = count * SHA1_DIGEST_SIZE; char *sbuf = xmalloc(slen); char *p = buf; for (int i = 0; i < count; i++, p += SLICE_SIZE) { unsigned char sha1[SHA1_DIGEST_SIZE]; size_t wlen = (ssize_t)len - SLICE_SIZE > 0 ? 
SLICE_SIZE : len; len -= SLICE_SIZE; if (sha1_file_write(p, wlen, sha1) < 0) goto err; memcpy(sbuf + i * SHA1_DIGEST_SIZE, sha1, SHA1_DIGEST_SIZE); } if (sha1_file_write(sbuf, slen, outsha1) < 0) goto err; free(sbuf); return 0; err: free(sbuf); return -1; } static struct slice_file *slice_file_read(const unsigned char *sha1) { size_t size; struct slice_file *slice_file = NULL; void *buf = sha1_file_read(sha1, &size); if (!buf) return NULL; slice_file = xmalloc(sizeof(struct slice_file)); slice_file->nr_slices = size / SHA1_DIGEST_SIZE; slice_file->slices = buf; return slice_file; } void *slice_read(const unsigned char *sha1, size_t *outsize) { struct slice_file *file = slice_file_read(sha1); struct strbuf buf = STRBUF_INIT; void *object; if (!file) goto err; *outsize = 0; for (uint32_t i = 0; i < file->nr_slices; i++) { size_t size; void *sbuf; sbuf = sha1_file_read(file->slices[i].sha1, &size); if (!sbuf) goto err; strbuf_add(&buf, sbuf, size); free(sbuf); *outsize += size; } object = xmalloc(*outsize); strbuf_copyout(&buf, object, *outsize); free(file); strbuf_release(&buf); return object; err: free(file); strbuf_release(&buf); return NULL; } sheepdog-0.8.3/dog/farm/snap.c000066400000000000000000000050071237656255000161400ustar00rootroot00000000000000/* * Copyright (C) 2011 Taobao Inc. * Copyright (C) 2013 Zelin.io * * Liu Yuan * Kai Zhang * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* Snap object is the meta data that describes the cluster snapshot. */ #include #include #include #include "farm.h" static char snap_log_path[PATH_MAX]; int snap_init(const char *farm_dir) { int fd, ret = -1; struct strbuf buf = STRBUF_INIT; strbuf_addstr(&buf, farm_dir); strbuf_addf(&buf, "/%s", "user_snap"); if (!strlen(snap_log_path)) strbuf_copyout(&buf, snap_log_path, sizeof(snap_log_path)); fd = open(snap_log_path, O_CREAT | O_EXCL, 0666); if (fd < 0) { if (errno != EEXIST) { sd_err("%m"); goto out; } } ret = 0; close(fd); out: strbuf_release(&buf); return ret; } int snap_log_write(uint32_t idx, const char *tag, unsigned char *sha1) { int fd, ret = -1; struct strbuf buf = STRBUF_INIT; struct snap_log log = { .idx = idx, .time = time(NULL) }; pstrcpy(log.tag, SD_MAX_SNAPSHOT_TAG_LEN, tag); memcpy(log.sha1, sha1, SHA1_DIGEST_SIZE); fd = open(snap_log_path, O_WRONLY | O_APPEND); if (fd < 0) { sd_err("%m"); goto out; } strbuf_reset(&buf); strbuf_add(&buf, &log, sizeof(log)); ret = xwrite(fd, buf.buf, buf.len); if (ret != buf.len) goto out_close; ret = 0; out_close: close(fd); out: strbuf_release(&buf); return ret; } void *snap_log_read(int *out_nr) { struct stat st; void *buffer = NULL; int len, fd; fd = open(snap_log_path, O_RDONLY); if (fd < 0) { sd_err("%m"); goto out; } if (fstat(fd, &st) < 0) { sd_err("%m"); goto out_close; } len = st.st_size; buffer = xmalloc(len); len = xread(fd, buffer, len); if (len != st.st_size) { free(buffer); buffer = NULL; goto out_close; } *out_nr = len / sizeof(struct snap_log); out_close: close(fd); out: return buffer; } struct snap_file *snap_file_read(unsigned char *sha1) { size_t size; return sha1_file_read(sha1, &size); } int snap_file_write(uint32_t idx, unsigned char *trunk_sha1, unsigned char *outsha1) { struct snap_file snap; snap.idx = idx; memcpy(snap.trunk_sha1, trunk_sha1, SHA1_DIGEST_SIZE); return 
sha1_file_write(&snap, sizeof(struct snap_file), outsha1); } sheepdog-0.8.3/dog/farm/trunk.c000066400000000000000000000037261237656255000163500ustar00rootroot00000000000000/* * Copyright (C) 2011 Taobao Inc. * Copyright (C) 2013 Zelin.io * * Liu Yuan * Kai Zhang * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* * Trunk object is meta data that describes the structure of the data objects * at the time of snapshot being taken. It ties data objects together into a * flat directory structure. */ #include #include #include #include "farm.h" #include "strbuf.h" #include "list.h" #include "util.h" #include "sheepdog_proto.h" static uint64_t total_count; int trunk_file_write(uint64_t nr_entries, struct trunk_entry *entries, unsigned char *trunk_sha1) { size_t size = sizeof(struct trunk_entry) * nr_entries; return sha1_file_write(entries, size, trunk_sha1); } static struct trunk_file *trunk_file_read(unsigned char *sha1) { size_t size; struct trunk_file *trunk = NULL; void *buf = sha1_file_read(sha1, &size); if (!buf) return NULL; trunk = xmalloc(sizeof(struct trunk_file)); trunk->nr_entries = size / sizeof(struct trunk_entry); trunk->entries = buf; return trunk; } int for_each_entry_in_trunk(unsigned char *trunk_sha1, int (*func)(struct trunk_entry *entry, void *data), void *data) { struct trunk_file *trunk; struct trunk_entry *entry; int ret = -1; trunk = trunk_file_read(trunk_sha1); if (!trunk) { sd_err("failed to read trunk"); return ret; } total_count = trunk->nr_entries; entry = trunk->entries; for (uint64_t i = 0; i < trunk->nr_entries; i++, entry++) { if (func(entry, data) < 0) goto out; } ret = 0; out: free(trunk->entries); free(trunk); return ret; } uint64_t trunk_get_count(void) { return total_count; } sheepdog-0.8.3/dog/node.c000066400000000000000000000366551237656255000152140ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "dog.h" static struct node_cmd_data { bool all_nodes; bool recovery_progress; bool watch; bool local; } node_cmd_data; static void cal_total_vdi_size(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { uint64_t *size = data; if (!vdi_is_snapshot(i)) *size += i->vdi_size; } static int node_list(int argc, char **argv) { struct sd_node *n; int i = 0; if (!raw_output) printf(" Id Host:Port V-Nodes Zone\n"); rb_for_each_entry(n, &sd_nroot, rb) { const char *host = addr_to_str(n->nid.addr, n->nid.port); printf(raw_output ? 
"%d %s %d %u\n" : "%4d %-20s\t%2d%11u\n", i++, host, n->nr_vnodes, n->zone); } return EXIT_SUCCESS; } static int node_info(int argc, char **argv) { int ret, success = 0, i = 0; uint64_t total_size = 0, total_avail = 0, total_vdi_size = 0; struct sd_node *n; if (!raw_output) printf("Id\tSize\tUsed\tAvail\tUse%%\n"); rb_for_each_entry(n, &sd_nroot, rb) { struct sd_req req; struct sd_rsp *rsp = (struct sd_rsp *)&req; sd_init_req(&req, SD_OP_STAT_SHEEP); ret = send_light_req(&n->nid, &req); if (!ret) { int ratio = (int)(((double)(rsp->node.store_size - rsp->node.store_free) / rsp->node.store_size) * 100); printf(raw_output ? "%d %s %s %s %d%%\n" : "%2d\t%s\t%s\t%s\t%3d%%\n", i++, strnumber(rsp->node.store_size), strnumber(rsp->node.store_size - rsp->node.store_free), strnumber(rsp->node.store_free), rsp->node.store_size == 0 ? 0 : ratio); success++; } total_size += rsp->node.store_size; total_avail += rsp->node.store_free; } if (success == 0) { sd_err("Cannot get information from any nodes"); return EXIT_SYSFAIL; } if (parse_vdi(cal_total_vdi_size, SD_INODE_HEADER_SIZE, &total_vdi_size) < 0) return EXIT_SYSFAIL; printf(raw_output ? "Total %s %s %s %d%% %s\n" : "Total\t%s\t%s\t%s\t%3d%%\n\n" "Total virtual image size\t%s\n", strnumber(total_size), strnumber(total_size - total_avail), strnumber(total_avail), (int)(((double)(total_size - total_avail) / total_size) * 100), strnumber(total_vdi_size)); return EXIT_SUCCESS; } static int get_recovery_state(struct recovery_state *state) { int ret; struct sd_req req; struct sd_rsp *rsp = (struct sd_rsp *)&req; sd_init_req(&req, SD_OP_STAT_RECOVERY); req.data_length = sizeof(*state); ret = dog_exec_req(&sd_nid, &req, state); if (ret < 0) { sd_err("Failed to execute request"); return -1; } if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); return -1; } return 0; } static int node_recovery_progress(void) { int result; unsigned int prev_nr_total; struct recovery_state rstate; /* * ToDos * * 1. Calculate size of actually copied objects. * For doing this, not so trivial changes for recovery process are * required. * * 2. Print remaining physical time. * Even if it is not so acculate, the information is helpful for * administrators. */ result = get_recovery_state(&rstate); if (result < 0) return EXIT_SYSFAIL; if (!rstate.in_recovery) return EXIT_SUCCESS; do { prev_nr_total = rstate.nr_total; result = get_recovery_state(&rstate); if (result < 0) break; if (!rstate.in_recovery) { show_progress(prev_nr_total, prev_nr_total, true); break; } switch (rstate.state) { case RW_PREPARE_LIST: printf("\rpreparing a checked object list..."); break; case RW_NOTIFY_COMPLETION: printf("\rnotifying a completion of recovery..."); break; case RW_RECOVER_OBJ: show_progress(rstate.nr_finished, rstate.nr_total, true); break; default: panic("unknown state of recovery: %d", rstate.state); break; } sleep(1); } while (true); return result < 0 ? 
EXIT_SYSFAIL : EXIT_SUCCESS; } static int node_recovery(int argc, char **argv) { struct sd_node *n; int ret, i = 0; if (node_cmd_data.recovery_progress) return node_recovery_progress(); if (!raw_output) { printf("Nodes In Recovery:\n"); printf(" Id Host:Port V-Nodes Zone" " Progress\n"); } rb_for_each_entry(n, &sd_nroot, rb) { struct sd_req req; struct sd_rsp *rsp = (struct sd_rsp *)&req; struct recovery_state state; memset(&state, 0, sizeof(state)); sd_init_req(&req, SD_OP_STAT_RECOVERY); req.data_length = sizeof(state); ret = dog_exec_req(&n->nid, &req, &state); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); return EXIT_FAILURE; } if (state.in_recovery) { const char *host = addr_to_str(n->nid.addr, n->nid.port); if (raw_output) printf("%d %s %d %d %"PRIu64" %"PRIu64"\n", i, host, n->nr_vnodes, n->zone, state.nr_finished, state.nr_total); else printf("%4d %-20s%5d%11d%11.1f%%\n", i, host, n->nr_vnodes, n->zone, 100 * (float)state.nr_finished / state.nr_total); } i++; } return EXIT_SUCCESS; } static struct sd_node *idx_to_node(struct rb_root *nroot, int idx) { struct sd_node *n = rb_entry(rb_first(nroot), struct sd_node, rb); while (idx--) n = rb_entry(rb_next(&n->rb), struct sd_node, rb); return n; } static int node_kill(int argc, char **argv) { int node_id, ret; struct sd_req req; const char *p; struct node_id *nid = NULL; if (node_cmd_data.local) nid = &sd_nid; /* issue kill request to local node */ if (optind < argc) { if (nid) { sd_err("don't use -l option and specify node id at the" " same time"); exit(EXIT_USAGE); } p = argv[optind++]; if (!is_numeric(p)) { sd_err("Invalid node id '%s', please specify a numeric" " value", p); exit(EXIT_USAGE); } node_id = strtol(p, NULL, 10); if (node_id < 0 || node_id >= sd_nodes_nr) { sd_err("Invalid node id '%d'", node_id); exit(EXIT_USAGE); } nid = &idx_to_node(&sd_nroot, node_id)->nid; } if (!nid) { sd_err("please specify -l option or node id"); exit(EXIT_USAGE); } sd_init_req(&req, SD_OP_KILL_NODE); ret = send_light_req(nid, &req); if (ret) { sd_err("Failed to execute request"); exit(EXIT_FAILURE); } return EXIT_SUCCESS; } static int node_stat(int argc, char **argv) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct sd_stat stat, last = { { 0 } }; int ret; bool watch = node_cmd_data.watch ? true : false, first = true; again: sd_init_req(&hdr, SD_OP_STAT); hdr.data_length = sizeof(stat); ret = dog_exec_req(&sd_nid, &hdr, &stat); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("failed to get stat information: %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } if (watch) { if (first) { last = stat; first = false; } printf("%s%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t" "%"PRIu64"\t%"PRIu64"\t%s\t%s\t%s\t%s\t%s\n", raw_output ? "" : "Request\tActive\tTotal\tWrite\tRead\tRemove\tFlush\t" "All WR\tAll RD\tWRBW\tRDBW\tRPS\nClient\t", stat.r.gway_active_nr, stat.r.gway_total_nr, stat.r.gway_total_write_nr, stat.r.gway_total_read_nr, stat.r.gway_total_remove_nr, stat.r.gway_total_flush_nr, strnumber(stat.r.gway_total_rx), strnumber(stat.r.gway_total_tx), strnumber(stat.r.gway_total_rx - last.r.gway_total_rx), strnumber(stat.r.gway_total_tx - last.r.gway_total_tx), strnumber_raw(stat.r.gway_total_nr - last.r.gway_total_nr, true)); printf("%s%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t" "%"PRIu64"\t%"PRIu64"\t%s\t%s\t%s\t%s\t%s\n", raw_output ? 
"" : "Peer\t", stat.r.peer_active_nr, stat.r.peer_total_nr, stat.r.peer_total_write_nr, stat.r.peer_total_read_nr, stat.r.peer_total_remove_nr, 0UL, strnumber(stat.r.peer_total_rx), strnumber(stat.r.peer_total_tx), strnumber(stat.r.peer_total_rx - last.r.peer_total_rx), strnumber(stat.r.peer_total_tx - last.r.peer_total_tx), strnumber_raw(stat.r.peer_total_nr - last.r.peer_total_nr, true)); last = stat; sleep(1); goto again; } else { printf("%s%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t" "%"PRIu64"\t%"PRIu64"\t%s\t%s\n", raw_output ? "" : "Request\tActive\tTotal\tWrite\tRead\tRemove\tFlush\t" "All WR\tAll RD\nClient\t", stat.r.gway_active_nr, stat.r.gway_total_nr, stat.r.gway_total_read_nr, stat.r.gway_total_write_nr, stat.r.gway_total_remove_nr, stat.r.gway_total_flush_nr, strnumber(stat.r.gway_total_rx), strnumber(stat.r.gway_total_tx)); printf("%s%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t" "%"PRIu64"\t%"PRIu64"\t%s\t%s\n", raw_output ? "" : "Peer\t", stat.r.peer_active_nr, stat.r.peer_total_nr, stat.r.peer_total_read_nr, stat.r.peer_total_write_nr, stat.r.peer_total_remove_nr, 0UL, strnumber(stat.r.peer_total_rx), strnumber(stat.r.peer_total_tx)); } return EXIT_SUCCESS; } static int node_md_info(struct node_id *nid) { struct sd_md_info info = {}; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret, i; sd_init_req(&hdr, SD_OP_MD_INFO); hdr.data_length = sizeof(info); ret = dog_exec_req(nid, &hdr, &info); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("failed to get multi-disk infomation: %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } for (i = 0; i < info.nr; i++) { uint64_t size = info.disk[i].free + info.disk[i].used; int ratio = (int)(((double)info.disk[i].used / size) * 100); if (raw_output) fprintf(stdout, "%s %d %s %s %s %d%% %s\n", addr_to_str(nid->addr, nid->port), info.disk[i].idx, strnumber(size), strnumber(info.disk[i].used), strnumber(info.disk[i].free), ratio, info.disk[i].path); else fprintf(stdout, "%2d\t%s\t%s\t%s\t%3d%%\t%s\n", info.disk[i].idx, strnumber(size), strnumber(info.disk[i].used), strnumber(info.disk[i].free), ratio, info.disk[i].path); } return EXIT_SUCCESS; } static int md_info(int argc, char **argv) { struct sd_node *n; int ret, i = 0; if (!raw_output) fprintf(stdout, "Id\tSize\tUsed\tAvail\tUse%%\tPath\n"); if (!node_cmd_data.all_nodes) return node_md_info(&sd_nid); rb_for_each_entry(n, &sd_nroot, rb) { if (!raw_output) fprintf(stdout, "Node %d:\n", i++); ret = node_md_info(&n->nid); if (ret != EXIT_SUCCESS) return EXIT_FAILURE; } return EXIT_SUCCESS; } static int do_plug_unplug(char *disks, bool plug) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; if (!disks) return EXIT_USAGE; if (!strlen(disks)) { sd_err("Empty path isn't allowed"); return EXIT_FAILURE; } if (plug) sd_init_req(&hdr, SD_OP_MD_PLUG); else sd_init_req(&hdr, SD_OP_MD_UNPLUG); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = strlen(disks) + 1; ret = dog_exec_req(&sd_nid, &hdr, disks); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to execute request, look for sheep.log" " for more information"); return EXIT_FAILURE; } return EXIT_SUCCESS; } static int md_plug(int argc, char **argv) { return do_plug_unplug(argv[optind], true); } static int md_unplug(int argc, char **argv) { return do_plug_unplug(argv[optind], false); } static struct subcommand node_md_cmd[] = { {"info", NULL, NULL, "show multi-disk information", NULL, CMD_NEED_NODELIST, md_info}, {"plug", NULL, NULL, 
"plug more disk(s) into node", NULL, CMD_NEED_ARG, md_plug}, {"unplug", NULL, NULL, "unplug disk(s) from node", NULL, CMD_NEED_ARG, md_unplug}, {NULL}, }; static int node_md(int argc, char **argv) { return do_generic_subcommand(node_md_cmd, argc, argv); } static int node_parser(int ch, const char *opt) { switch (ch) { case 'A': node_cmd_data.all_nodes = true; break; case 'P': node_cmd_data.recovery_progress = true; break; case 'w': node_cmd_data.watch = true; break; case 'l': node_cmd_data.local = true; break; } return 0; } static struct sd_option node_options[] = { {'A', "all", false, "show md information of all the nodes"}, {'P', "progress", false, "show progress of recovery in the node"}, {'w', "watch", false, "watch the stat every second"}, {'l', "local", false, "issue request to local node"}, { 0, NULL, false, NULL }, }; static int node_log_level_set(int argc, char **argv) { int ret = 0; char *loglevel_str = argv[optind]; ret = do_loglevel_set(&sd_nid, loglevel_str); switch (ret) { case EXIT_USAGE: sd_err("invalid loglevel: %s", loglevel_str); sd_err("available loglevels:"); dump_loglevels(true); ret = -1; break; case EXIT_FAILURE: case EXIT_SYSFAIL: sd_err("Failed to execute request"); ret = -1; break; case EXIT_SUCCESS: /* do nothing */ break; default: sd_err("unknown return code of do_loglevel_set(): %d", ret); ret = -1; break; } return ret; } static int node_log_level_get(int argc, char **argv) { int ret = 0, loglevel = -1; ret = do_loglevel_get(&sd_nid, &loglevel); switch (ret) { case EXIT_FAILURE: case EXIT_SYSFAIL: sd_err("Failed to execute request"); ret = -1; break; case EXIT_SUCCESS: sd_info("%s (%d)", loglevel_to_str(loglevel), loglevel); break; default: sd_err("unknown return code of do_loglevel_get(): %d", ret); ret = -1; break; } return ret; } static int node_log_level_list(int argc, char **argv) { dump_loglevels(false); return EXIT_SUCCESS; } static struct subcommand node_log_level_cmd[] = { {"set", "", NULL, "set new loglevel", NULL, CMD_NEED_ARG, node_log_level_set}, {"get", NULL, NULL, "get current loglevel", NULL, 0, node_log_level_get}, {"list", NULL, NULL, "list available loglevels", NULL, 0, node_log_level_list}, {NULL}, }; static int node_log_level(int argc, char **argv) { return do_generic_subcommand(node_log_level_cmd, argc, argv); } static struct subcommand node_log_cmd[] = { {"level", "", NULL, "manipulate loglevel", node_log_level_cmd, CMD_NEED_ARG, node_log_level}, {NULL}, }; static int node_log(int argc, char **argv) { return do_generic_subcommand(node_log_cmd, argc, argv); } static struct subcommand node_cmd[] = { {"kill", "", "aprhl", "kill node", NULL, CMD_NEED_NODELIST, node_kill, node_options}, {"list", NULL, "aprh", "list nodes", NULL, CMD_NEED_NODELIST, node_list}, {"info", NULL, "aprh", "show information about each node", NULL, CMD_NEED_NODELIST, node_info}, {"recovery", NULL, "aphPr", "show recovery information of nodes", NULL, CMD_NEED_NODELIST, node_recovery, node_options}, {"md", "[disks]", "aprAh", "See 'dog node md' for more information", node_md_cmd, CMD_NEED_ARG, node_md, node_options}, {"stat", NULL, "aprwh", "show stat information about the node", NULL, 0, node_stat, node_options}, {"log", NULL, "aph", "show or set log level of the node", node_log_cmd, CMD_NEED_ARG, node_log}, {NULL,}, }; struct command node_command = { "node", node_cmd, node_parser }; sheepdog-0.8.3/dog/trace.c000066400000000000000000000205501237656255000153500ustar00rootroot00000000000000/* * Copyright (C) 2011 Taobao Inc. 
* * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include "dog.h" #include "rbtree.h" #include "list.h" #include "internal_proto.h" static inline void print_thread_name(struct trace_graph_item *item) { printf("%-*s|", TRACE_THREAD_LEN, item->tname); } static inline void print_time(struct trace_graph_item *item) { if (item->type == TRACE_GRAPH_RETURN) { unsigned duration = item->return_time - item->entry_time; unsigned quot = duration / 1000, rem = duration % 1000; printf("%8u.%-3u|", quot, rem); } else if (item->type == TRACE_GRAPH_ENTRY) { printf(" |"); } } static inline void print_finale(struct trace_graph_item *item) { int i; for (i = 0; i < item->depth; i++) printf(" "); if (item->type == TRACE_GRAPH_ENTRY) printf("%s() {\n", item->fname); else printf("}\n"); } static void print_trace_item(struct trace_graph_item *item) { print_thread_name(item); print_time(item); print_finale(item); } static void cat_trace_file(void *buf, size_t size) { struct trace_graph_item *item = (struct trace_graph_item *)buf; size_t sz = size / sizeof(struct trace_graph_item), i; printf(" Thread Name | Time(us) | Function Graph\n"); printf("--------------------------------------------------\n"); for (i = 0; i < sz; i++) print_trace_item(item++); return; } static const char *tracefile = "/tmp/tracefile"; static int trace_read_buffer(void) { int ret, tfd; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; #define TRACE_BUF_LEN (1024 * 1024 * 20) char *buf = xmalloc(TRACE_BUF_LEN); tfd = open(tracefile, O_CREAT | O_RDWR | O_APPEND | O_TRUNC, 0644); if (tfd < 0) { sd_err("can't create tracefile"); return EXIT_SYSFAIL; } read_buffer: sd_init_req(&hdr, SD_OP_TRACE_READ_BUF); hdr.data_length = TRACE_BUF_LEN; ret = dog_exec_req(&sd_nid, &hdr, buf); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result == SD_RES_AGAIN) goto read_buffer; if (rsp->result != SD_RES_SUCCESS) { sd_err("Trace failed: %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } xwrite(tfd, buf, rsp->data_length); if (rsp->data_length == TRACE_BUF_LEN) goto read_buffer; free(buf); return EXIT_SUCCESS; } static int trace_enable(int argc, char **argv) { const char *tracer = argv[optind]; int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; sd_init_req(&hdr, SD_OP_TRACE_ENABLE); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = strlen(tracer) + 1; ret = dog_exec_req(&sd_nid, &hdr, (void *)tracer); if (ret < 0) return EXIT_SYSFAIL; switch (rsp->result) { case SD_RES_SUCCESS: break; case SD_RES_NO_SUPPORT: sd_err("no such tracer %s", tracer); return EXIT_FAILURE; case SD_RES_INVALID_PARMS: sd_err("tracer %s is already enabled", tracer); return EXIT_FAILURE; default: sd_err("unknown error (%s)", sd_strerror(rsp->result)); return EXIT_SYSFAIL; } return EXIT_SUCCESS; } static int trace_disable(int argc, char **argv) { const char *tracer = argv[optind]; int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; sd_init_req(&hdr, SD_OP_TRACE_DISABLE); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = strlen(tracer) + 1; ret = dog_exec_req(&sd_nid, &hdr, (void *)tracer); if (ret < 0) return EXIT_SYSFAIL; switch (rsp->result) { case SD_RES_SUCCESS: break; case SD_RES_NO_SUPPORT: sd_err("no such 
tracer %s", tracer); return EXIT_FAILURE; case SD_RES_INVALID_PARMS: sd_err("tracer %s is not enabled", tracer); return EXIT_FAILURE; default: sd_err("unknown error (%s)", sd_strerror(rsp->result)); return EXIT_SYSFAIL; } return trace_read_buffer(); } static int trace_status(int argc, char **argv) { char buf[4096]; /* must have enough space to store tracer list */ int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; sd_init_req(&hdr, SD_OP_TRACE_STATUS); hdr.data_length = sizeof(buf); ret = dog_exec_req(&sd_nid, &hdr, buf); if (ret < 0) return EXIT_SYSFAIL; switch (rsp->result) { sd_err("%s", sd_strerror(rsp->result)); return EXIT_FAILURE; } printf("%s", buf); return EXIT_SUCCESS; } static void *map_trace_file(struct stat *st) { int fd = open(tracefile, O_RDONLY); void *map; if (fd < 0) { sd_err("%m"); return NULL; } if (fstat(fd, st) < 0) { sd_err("%m"); close(fd); return NULL; } if (st->st_size == 0) { sd_err("trace file is empty"); return NULL; } map = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0); close(fd); if (map == MAP_FAILED) { sd_err("%m"); return NULL; } return map; } static int graph_cat(int argc, char **argv) { struct stat st; void *map = map_trace_file(&st); if (!map) return EXIT_FAILURE; cat_trace_file(map, st.st_size); munmap(map, st.st_size); return EXIT_SUCCESS; } struct graph_stat_entry { struct rb_node rb; struct list_node list; char fname[TRACE_FNAME_LEN]; uint64_t duration; uint16_t nr_calls; }; static struct rb_root stat_tree_root; static LIST_HEAD(stat_list); static int graph_stat_cmp(const struct graph_stat_entry *a, const struct graph_stat_entry *b) { return strcmp(a->fname, b->fname); } static struct graph_stat_entry * stat_tree_insert(struct graph_stat_entry *new) { struct graph_stat_entry *entry; entry = rb_insert(&stat_tree_root, new, rb, graph_stat_cmp); if (entry) { entry->duration += new->duration; entry->nr_calls++; } return entry; } static void prepare_stat_tree(struct trace_graph_item *item) { struct graph_stat_entry *new; if (item->type != TRACE_GRAPH_RETURN) return; new = xmalloc(sizeof(*new)); pstrcpy(new->fname, sizeof(new->fname), item->fname); new->duration = item->return_time - item->entry_time; new->nr_calls = 1; if (stat_tree_insert(new)) { free(new); return; } list_add(&new->list, &stat_list); } static void stat_list_print(void) { struct graph_stat_entry *entry; list_for_each_entry(entry, &stat_list, list) { float total = (float)entry->duration / 1000000000; float per = (float)entry->duration / entry->nr_calls / 1000000; printf("%10.3f %10.3f %5"PRIu16" %-*s\n", total, per, entry->nr_calls, TRACE_FNAME_LEN, entry->fname); } } static int stat_list_cmp(void *priv, struct list_node *a, struct list_node *b) { struct graph_stat_entry *ga = container_of(a, struct graph_stat_entry, list); struct graph_stat_entry *gb = container_of(b, struct graph_stat_entry, list); /* '-' is for reverse sort, largest first */ return -intcmp(ga->duration, gb->duration); } static void stat_trace_file(void *buf, size_t size) { struct trace_graph_item *item = (struct trace_graph_item *)buf; size_t sz = size / sizeof(struct trace_graph_item), i; printf(" Total (s) Per Call (ms) Calls Name\n"); for (i = 0; i < sz; i++) prepare_stat_tree(item++); list_sort(NULL, &stat_list, stat_list_cmp); stat_list_print(); } static int graph_stat(int argc, char **argv) { struct stat st; void *map = map_trace_file(&st); if (!map) return EXIT_FAILURE; stat_trace_file(map, st.st_size); munmap(map, st.st_size); return EXIT_SUCCESS; } static int trace_parser(int ch, 
const char *opt) { return 0; } static struct subcommand graph_cmd[] = { {"cat", NULL, NULL, "cat the output of graph tracer", NULL, 0, graph_cat}, {"stat", NULL, NULL, "get the stat of the graph calls", NULL, 0, graph_stat}, {NULL,}, }; static int trace_graph(int argc, char **argv) { return do_generic_subcommand(graph_cmd, argc, argv); } /* Subcommand list of trace */ static struct subcommand trace_cmd[] = { {"enable", "", "aph", "enable tracer", NULL, CMD_NEED_ARG, trace_enable}, {"disable", "", "aph", "disable tracer", NULL, CMD_NEED_ARG, trace_disable}, {"status", NULL, "aph", "show tracer statuses", NULL, 0, trace_status}, {"graph", NULL, "aph", "run dog trace graph for more information", graph_cmd, CMD_NEED_ARG, trace_graph}, {NULL}, }; struct command trace_command = { "trace", trace_cmd, trace_parser }; sheepdog-0.8.3/dog/treeview.c000066400000000000000000000074551237656255000161150ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include "util.h" #include "treeview.h" #ifndef MAX_DEPTH #define MAX_DEPTH 100 #endif struct vdi_tree { char name[1024]; char label[256]; uint32_t vid; uint32_t pvid; bool highlight; struct list_head children; struct list_node siblings; }; static int *width, *more; static struct vdi_tree *root; static struct vdi_tree *find_vdi(struct vdi_tree *parent, uint32_t vid, const char *name) { struct vdi_tree *vdi, *ret; list_for_each_entry(vdi, &parent->children, siblings) { if (vdi->vid == vid && !strcmp(vdi->name, name)) return vdi; ret = find_vdi(vdi, vid, name); if (ret) return ret; } return NULL; } static struct vdi_tree *new_vdi(const char *name, const char *label, uint64_t vid, uint64_t pvid, bool highlight) { struct vdi_tree *vdi; vdi = xmalloc(sizeof(struct vdi_tree)); pstrcpy(vdi->name, sizeof(vdi->name), name); pstrcpy(vdi->label, sizeof(vdi->label), label); vdi->vid = vid; vdi->pvid = pvid; vdi->highlight = highlight; INIT_LIST_HEAD(&vdi->children); return vdi; } void init_tree(void) { root = new_vdi("", "", 0, 0, 0); } void add_vdi_tree(const char *name, const char *label, uint32_t vid, uint32_t pvid, bool highlight) { struct vdi_tree *vdi, *parent; vdi = new_vdi(name, label, vid, pvid, highlight); if (!vdi) return; parent = find_vdi(root, pvid, name); if (!parent) parent = root; list_add_tail(&vdi->siblings, &parent->children); } static void compaction(struct vdi_tree *parent) { struct vdi_tree *vdi, *new_parent; list_for_each_entry(vdi, &parent->children, siblings) { new_parent = find_vdi(root, vdi->pvid, vdi->name); if (new_parent && parent != new_parent) list_move_tail(&vdi->siblings, &new_parent->children); compaction(vdi); } } static int get_depth(struct vdi_tree *parent) { struct vdi_tree *vdi; int max_depth = 0, depth; list_for_each_entry(vdi, &parent->children, siblings) { depth = get_depth(vdi); if (max_depth < depth) max_depth = depth; } return max_depth + 1; } static void spaces(int n) { while (n--) putchar(' '); } static void indent(int level, bool first, bool last) { int lvl; if (first) printf(last ? "---" : "-+-"); else { for (lvl = 0; lvl < level - 1; lvl++) { spaces(width[lvl] + 1); printf(more[lvl + 1] ? 
"| " : " "); } spaces(width[level - 1] + 1); printf(last ? "`-" : "|-"); } } static void _dump_tree(struct vdi_tree *current, int level, bool first, bool last) { struct vdi_tree *vdi; indent(level, first, last); if (current->highlight) printf(TEXT_BOLD); printf("%s", current->label); if (current->highlight) printf(TEXT_NORMAL); if (list_empty(¤t->children)) { putchar('\n'); return; } more[level] = !last; width[level] = strlen(current->label); list_for_each_entry(vdi, ¤t->children, siblings) { _dump_tree(vdi, level + 1, &vdi->siblings == current->children.n.next, vdi->siblings.next == ¤t->children.n); } } void dump_tree(void) { struct vdi_tree *vdi; int depth; compaction(root); depth = get_depth(root); width = malloc(sizeof(int) * depth); more = malloc(sizeof(int) * depth); if (!width || !more) { sd_err("Failed to allocate memory"); return; } list_for_each_entry(vdi, &root->children, siblings) { printf("%s", vdi->name); more[0] = 0; width[0] = strlen(vdi->name); _dump_tree(vdi, 1, true, true); } } sheepdog-0.8.3/dog/treeview.h000066400000000000000000000011711237656255000161070ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __TREEVIEW__ #define __TREEVIEW__ #include void init_tree(void); void add_vdi_tree(const char *label, const char *tag, uint32_t vid, uint32_t pvid, bool highlight); void dump_tree(void); #endif sheepdog-0.8.3/dog/vdi.c000066400000000000000000001755121237656255000150450ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include #include #include #include "dog.h" #include "treeview.h" #include "sha1.h" #include "fec.h" static struct sd_option vdi_options[] = { {'P', "prealloc", false, "preallocate all the data objects"}, {'i', "index", true, "specify the index of data objects"}, {'s', "snapshot", true, "specify a snapshot id or tag name"}, {'x', "exclusive", false, "write in an exclusive mode"}, {'d', "delete", false, "delete a key"}, {'w', "writeback", false, "use writeback mode"}, {'c', "copies", true, "specify the data redundancy level"}, {'F', "from", true, "create a differential backup from the snapshot"}, {'f', "force", false, "do operation forcibly"}, {'y', "hyper", false, "create a hyper volume"}, {'o', "oid", true, "specify the object id of the tracking object"}, { 0, NULL, false, NULL }, }; static struct vdi_cmd_data { unsigned int index; int snapshot_id; char snapshot_tag[SD_MAX_VDI_TAG_LEN]; bool exclusive; bool delete; bool prealloc; int nr_copies; bool writeback; int from_snapshot_id; char from_snapshot_tag[SD_MAX_VDI_TAG_LEN]; bool force; uint8_t copy_policy; uint8_t store_policy; uint64_t oid; } vdi_cmd_data = { ~0, }; struct get_vdi_info { const char *name; const char *tag; uint32_t vid; uint32_t snapid; uint8_t nr_copies; uint8_t copy_policy; }; int dog_bnode_writer(uint64_t oid, void *mem, unsigned int len, uint64_t offset, uint32_t flags, int copies, int copy_policy, bool create, bool direct) { return dog_write_object(oid, 0, mem, len, offset, flags, copies, copy_policy, create, direct); } int dog_bnode_reader(uint64_t oid, void **mem, unsigned int len, uint64_t offset) { return dog_read_object(oid, *mem, len, offset, true); } static inline bool is_data_obj_writeable(const struct sd_inode *inode, uint32_t idx) { return inode->vdi_id == INODE_GET_VID(inode, idx); } static void vdi_show_progress(uint64_t done, uint64_t total) { return show_progress(done, total, false); } struct stat_arg { uint64_t *my; uint64_t *cow; uint32_t vid; }; static void stat_cb(void *data, enum btree_node_type type, void *arg) { struct sd_extent *ext; struct stat_arg *sarg = arg; uint64_t *my = sarg->my; uint64_t *cow = sarg->cow; if (type == BTREE_EXT) { ext = (struct sd_extent *)data; if (ext->vdi_id == sarg->vid) (*my)++; else if (ext->vdi_id != 0) (*cow)++; } } static void stat_data_objs_btree(const struct sd_inode *inode, uint64_t *my_objs, uint64_t *cow_objs) { struct stat_arg arg = {my_objs, cow_objs, inode->vdi_id}; traverse_btree(dog_bnode_reader, inode, stat_cb, &arg); } static void stat_data_objs_array(const struct sd_inode *inode, uint64_t *my_objs, uint64_t *cow_objs) { int nr; uint64_t my, cow, *p; uint32_t vid = inode->vdi_id; my = 0; cow = 0; nr = count_data_objs(inode); if (nr % 2 != 0) { if (is_data_obj_writeable(inode, 0)) my++; else if (inode->data_vdi_id[0] != 0) cow++; p = (uint64_t *)(inode->data_vdi_id + 1); } else p = (uint64_t *)inode->data_vdi_id; /* * To boost performance, this function checks data_vdi_id two 32-bit * entries at a time, as one 64-bit integer. */ nr /= 2; for (int i = 0; i < nr; i++) { if (p[i] == 0) continue; if (p[i] == (((uint64_t)vid << 32) | vid)) { my += 2; continue; } /* Check the higher 32 bit */ if (p[i] >> 32 == vid) my++; else if ((p[i] & 0xFFFFFFFF00000000) != 0) cow++; /* Check the lower 32 bit */ if ((p[i] & 0xFFFFFFFF) == vid) my++; else if ((p[i] & 0xFFFFFFFF) != 0) cow++; } *my_objs = my; *cow_objs = cow; } /* * Get the number of objects. * * 'my_objs' means the number of objects that belong to this vdi.
'cow_objs' * means the number of the other objects. */ static void stat_data_objs(const struct sd_inode *inode, uint64_t *my_objs, uint64_t *cow_objs) { if (inode->store_policy == 0) stat_data_objs_array(inode, my_objs, cow_objs); else stat_data_objs_btree(inode, my_objs, cow_objs); } static char *redundancy_scheme(uint8_t copy_nr, uint8_t policy) { static char str[10]; if (policy > 0) { int d, p; ec_policy_to_dp(policy, &d, &p); snprintf(str, sizeof(str), "%d:%d", d, p); } else { snprintf(str, sizeof(str), "%d", copy_nr); } return str; } static void print_vdi_list(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { bool is_clone = false; uint64_t my_objs = 0, cow_objs = 0; time_t ti; struct tm tm; char dbuf[128]; struct get_vdi_info *info = data; if (info && strcmp(name, info->name) != 0) return; ti = i->create_time >> 32; if (raw_output) { snprintf(dbuf, sizeof(dbuf), "%" PRIu64, (uint64_t) ti); } else { localtime_r(&ti, &tm); strftime(dbuf, sizeof(dbuf), "%Y-%m-%d %H:%M", &tm); } stat_data_objs(i, &my_objs, &cow_objs); if (i->snap_id == 1 && i->parent_vdi_id != 0) is_clone = true; if (raw_output) { printf("%c ", vdi_is_snapshot(i) ? 's' : (is_clone ? 'c' : '=')); while (*name) { if (isspace(*name) || *name == '\\') putchar('\\'); putchar(*name++); } printf(" %d %s %s %s %s %" PRIx32 " %s %s\n", snapid, strnumber(i->vdi_size), strnumber(my_objs * SD_DATA_OBJ_SIZE), strnumber(cow_objs * SD_DATA_OBJ_SIZE), dbuf, vid, redundancy_scheme(i->nr_copies, i->copy_policy), i->tag); } else { printf("%c %-8s %5d %7s %7s %7s %s %7" PRIx32 " %6s %13s\n", vdi_is_snapshot(i) ? 's' : (is_clone ? 'c' : ' '), name, snapid, strnumber(i->vdi_size), strnumber(my_objs * SD_DATA_OBJ_SIZE), strnumber(cow_objs * SD_DATA_OBJ_SIZE), dbuf, vid, redundancy_scheme(i->nr_copies, i->copy_policy), i->tag); } } static void print_vdi_tree(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { time_t ti; struct tm tm; char buf[128]; if (vdi_is_snapshot(i)) { ti = i->create_time >> 32; localtime_r(&ti, &tm); strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M]", &tm); } else pstrcpy(buf, sizeof(buf), "(you are here)"); add_vdi_tree(name, buf, vid, i->parent_vdi_id, highlight && !vdi_is_snapshot(i)); } static void print_vdi_graph(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { time_t ti; struct tm tm; char dbuf[128], tbuf[128]; ti = i->create_time >> 32; localtime_r(&ti, &tm); strftime(dbuf, sizeof(dbuf), "%Y-%m-%d", &tm); strftime(tbuf, sizeof(tbuf), "%H:%M:%S", &tm); printf(" \"%x\" -> \"%x\";\n", i->parent_vdi_id, vid); printf(" \"%x\" [\n" " group = \"%s\",\n" " label = \"", vid, name); printf("Name: %10s\\n" "Tag: %10x\\n" "Size: %10s\\n" "Date: %10s\\n" "Time: %10s", name, snapid, strnumber(i->vdi_size), dbuf, tbuf); if (vdi_is_snapshot(i)) printf("\"\n ];\n\n"); else printf("\",\n color=\"red\"\n ];\n\n"); } static void vdi_info_filler(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { struct get_vdi_info *info = data; if (info->name) { if (info->tag && info->tag[0]) { if (!strcmp(name, info->name) && !strcmp(tag, info->tag)) { info->vid = vid; info->nr_copies = i->nr_copies; info->copy_policy = i->copy_policy; } } else if (info->snapid) { if (!strcmp(name, info->name) && snapid == info->snapid) { info->vid = vid; info->nr_copies = i->nr_copies; 
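/* * Note: copy_policy != 0 marks an erasure coded vdi; redundancy_scheme() * above renders such a policy as "x:y" and plain replication as a single * copy count. */ 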
info->copy_policy = i->copy_policy; } } else { if (!strcmp(name, info->name)) { info->vid = vid; info->nr_copies = i->nr_copies; info->copy_policy = i->copy_policy; } } } } typedef int (*obj_parser_func_t)(const char *sheep, uint64_t oid, struct sd_rsp *rsp, char *buf, void *data); static int do_print_obj(const char *sheep, uint64_t oid, struct sd_rsp *rsp, char *buf, void *data) { switch (rsp->result) { case SD_RES_SUCCESS: printf("%s has the object\n", sheep); break; case SD_RES_NO_OBJ: printf("%s doesn't have the object\n", sheep); break; case SD_RES_OLD_NODE_VER: case SD_RES_NEW_NODE_VER: sd_err("The node list has changed: please try again"); break; default: sd_err("%s: hit an unexpected error (%s)", sheep, sd_strerror(rsp->result)); break; } return 0; } struct obj_info_filler_info { bool success; uint64_t data_oid; unsigned idx; }; static int obj_info_filler(const char *sheep, uint64_t oid, struct sd_rsp *rsp, char *buf, void *data) { struct obj_info_filler_info *info = data; struct sd_inode *inode = (struct sd_inode *)buf; uint32_t vdi_id; switch (rsp->result) { case SD_RES_SUCCESS: if (info->success) break; info->success = true; vdi_id = INODE_GET_VID(inode, info->idx); if (vdi_id) { info->data_oid = vid_to_data_oid(vdi_id, info->idx); return 1; } break; case SD_RES_NO_OBJ: break; case SD_RES_OLD_NODE_VER: case SD_RES_NEW_NODE_VER: sd_err("The node list has changed: please try again"); break; default: sd_err("%s: hit an unexpected error (%s)", sheep, sd_strerror(rsp->result)); break; } return 0; } static void parse_objs(uint64_t oid, obj_parser_func_t func, void *data, size_t size) { int ret, cb_ret; struct sd_node *n; char *buf; buf = xzalloc(size); rb_for_each_entry(n, &sd_nroot, rb) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; sd_init_req(&hdr, SD_OP_READ_PEER); hdr.data_length = size; hdr.flags = 0; hdr.epoch = sd_epoch; hdr.obj.oid = oid; hdr.obj.ec_index = SD_MAX_COPIES + 1; /* Ignore index */ ret = dog_exec_req(&n->nid, &hdr, buf); if (ret < 0) continue; switch (rsp->result) { sd_err("%s", sd_strerror(rsp->result)); continue; } cb_ret = func(addr_to_str(n->nid.addr, n->nid.port), oid, rsp, buf, data); if (cb_ret) break; } free(buf); } static int vdi_list(int argc, char **argv) { const char *vdiname = argv[optind]; if (!raw_output) printf(" Name Id Size Used Shared Creation time VDI id Copies Tag\n"); if (vdiname) { struct get_vdi_info info; memset(&info, 0, sizeof(info)); info.name = vdiname; if (parse_vdi(print_vdi_list, SD_INODE_SIZE, &info) < 0) return EXIT_SYSFAIL; return EXIT_SUCCESS; } else { if (parse_vdi(print_vdi_list, SD_INODE_SIZE, NULL) < 0) return EXIT_SYSFAIL; return EXIT_SUCCESS; } } static int vdi_tree(int argc, char **argv) { init_tree(); if (parse_vdi(print_vdi_tree, SD_INODE_HEADER_SIZE, NULL) < 0) return EXIT_SYSFAIL; dump_tree(); return EXIT_SUCCESS; } static int vdi_graph(int argc, char **argv) { /* print a header */ printf("digraph G {\n"); printf(" node [shape = \"box\", fontname = \"Courier\"];\n\n"); printf(" \"0\" [shape = \"ellipse\", label = \"root\"];\n\n"); if (parse_vdi(print_vdi_graph, SD_INODE_HEADER_SIZE, NULL) < 0) return EXIT_SYSFAIL; /* print a footer */ printf("}\n"); return EXIT_SUCCESS; } static int find_vdi_name(const char *vdiname, uint32_t snapid, const char *tag, uint32_t *vid, int for_snapshot) { int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; memset(buf, 0, sizeof(buf)); pstrcpy(buf, SD_MAX_VDI_LEN, vdiname); if (tag) pstrcpy(buf + 
SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, tag); if (for_snapshot) sd_init_req(&hdr, SD_OP_GET_VDI_INFO); else sd_init_req(&hdr, SD_OP_LOCK_VDI); hdr.data_length = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN; hdr.flags = SD_FLAG_CMD_WRITE; hdr.vdi.snapid = snapid; ret = dog_exec_req(&sd_nid, &hdr, buf); if (ret < 0) return -1; if (rsp->result != SD_RES_SUCCESS) { sd_err("Cannot get VDI info for %s %d %s: %s", vdiname, snapid, tag, sd_strerror(rsp->result)); return -1; } *vid = rsp->vdi.vdi_id; return 0; } static int read_vdi_obj(const char *vdiname, int snapid, const char *tag, uint32_t *pvid, struct sd_inode *inode, size_t size) { int ret; uint32_t vid; ret = find_vdi_name(vdiname, snapid, tag, &vid, 0); if (ret < 0) { sd_err("Failed to open VDI %s", vdiname); return EXIT_FAILURE; } ret = dog_read_object(vid_to_vdi_oid(vid), inode, size, 0, true); if (ret != SD_RES_SUCCESS) { if (snapid) { sd_err("Failed to read a snapshot %s:%d", vdiname, snapid); } else if (tag && tag[0]) { sd_err("Failed to read a snapshot %s:%s", vdiname, tag); } else { sd_err("Failed to read a vdi %s", vdiname); } return EXIT_FAILURE; } if (pvid) *pvid = vid; return EXIT_SUCCESS; } int do_vdi_create(const char *vdiname, int64_t vdi_size, uint32_t base_vid, uint32_t *vdi_id, bool snapshot, uint8_t nr_copies, uint8_t copy_policy, uint8_t store_policy) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; char buf[SD_MAX_VDI_LEN]; memset(buf, 0, sizeof(buf)); pstrcpy(buf, SD_MAX_VDI_LEN, vdiname); sd_init_req(&hdr, SD_OP_NEW_VDI); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = SD_MAX_VDI_LEN; hdr.vdi.base_vdi_id = base_vid; hdr.vdi.snapid = snapshot ? 1 : 0; hdr.vdi.vdi_size = vdi_size; hdr.vdi.copies = nr_copies; hdr.vdi.copy_policy = copy_policy; hdr.vdi.store_policy = store_policy; ret = dog_exec_req(&sd_nid, &hdr, buf); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to create VDI %s: %s", vdiname, sd_strerror(rsp->result)); return EXIT_FAILURE; } if (vdi_id) *vdi_id = rsp->vdi.vdi_id; return EXIT_SUCCESS; } static int vdi_create(int argc, char **argv) { const char *vdiname = argv[optind++]; uint64_t size; uint32_t vid; uint64_t oid; uint32_t idx, max_idx; struct sd_inode *inode = NULL; int ret; if (!argv[optind]) { sd_err("Please specify the VDI size"); return EXIT_USAGE; } ret = option_parse_size(argv[optind], &size); if (ret < 0) return EXIT_USAGE; if (size > SD_OLD_MAX_VDI_SIZE && 0 == vdi_cmd_data.store_policy) { sd_err("VDI size is larger than %s bytes, please use '-y' to " "create a hyper volume with size up to %s bytes", strnumber(SD_OLD_MAX_VDI_SIZE), strnumber(SD_MAX_VDI_SIZE)); return EXIT_USAGE; } if (size > SD_MAX_VDI_SIZE) { sd_err("VDI size is too large"); return EXIT_USAGE; } ret = do_vdi_create(vdiname, size, 0, &vid, false, vdi_cmd_data.nr_copies, vdi_cmd_data.copy_policy, vdi_cmd_data.store_policy); if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc) goto out; inode = xmalloc(sizeof(*inode)); ret = dog_read_object(vid_to_vdi_oid(vid), inode, sizeof(*inode), 0, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read a newly created VDI object"); ret = EXIT_FAILURE; goto out; } max_idx = DIV_ROUND_UP(size, SD_DATA_OBJ_SIZE); for (idx = 0; idx < max_idx; idx++) { vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size); oid = vid_to_data_oid(vid, idx); ret = dog_write_object(oid, 0, NULL, 0, 0, 0, inode->nr_copies, inode->copy_policy, true, true); if (ret != SD_RES_SUCCESS) { ret = EXIT_FAILURE; goto out; } INODE_SET_VID(inode, idx, vid); ret = 
sd_inode_write_vid(dog_bnode_writer, inode, idx, vid, vid, 0, false, true); if (ret) { ret = EXIT_FAILURE; goto out; } } vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size); ret = EXIT_SUCCESS; out: if (ret == EXIT_SUCCESS && verbose) { if (raw_output) printf("%x\n", vid); else printf("VDI ID of newly created VDI: %x\n", vid); } free(inode); return ret; } static int vdi_snapshot(int argc, char **argv) { const char *vdiname = argv[optind++]; uint32_t vid; int ret; char buf[SD_INODE_HEADER_SIZE]; struct sd_inode *inode = (struct sd_inode *)buf; if (vdi_cmd_data.snapshot_id != 0) { sd_err("Please specify a non-integer value for " "a snapshot tag name"); return EXIT_USAGE; } ret = read_vdi_obj(vdiname, 0, "", &vid, inode, SD_INODE_HEADER_SIZE); if (ret != EXIT_SUCCESS) return ret; if (inode->store_policy) { sd_err("creating a snapshot of hypervolume is not supported"); return EXIT_FAILURE; } ret = dog_write_object(vid_to_vdi_oid(vid), 0, vdi_cmd_data.snapshot_tag, SD_MAX_VDI_TAG_LEN, offsetof(struct sd_inode, tag), 0, inode->nr_copies, inode->copy_policy, false, false); if (ret != SD_RES_SUCCESS) return EXIT_FAILURE; ret = do_vdi_create(vdiname, inode->vdi_size, vid, NULL, true, inode->nr_copies, inode->copy_policy, inode->store_policy); if (ret == EXIT_SUCCESS && verbose) { if (raw_output) printf("%x\n", vid); else printf("VDI ID of newly created snapshot: %x\n", vid); } return ret; } static int vdi_clone(int argc, char **argv) { const char *src_vdi = argv[optind++], *dst_vdi; uint32_t base_vid, new_vid, vdi_id; uint64_t oid; uint32_t idx, max_idx, ret; struct sd_inode *inode = NULL, *new_inode = NULL; char *buf = NULL; dst_vdi = argv[optind]; if (!dst_vdi) { sd_err("Destination VDI name must be specified"); ret = EXIT_USAGE; goto out; } if (!vdi_cmd_data.snapshot_id && !vdi_cmd_data.snapshot_tag[0]) { sd_err("Only snapshot VDIs can be cloned"); sd_err("Please specify the '-s' option"); ret = EXIT_USAGE; goto out; } inode = xmalloc(sizeof(*inode)); ret = read_vdi_obj(src_vdi, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, &base_vid, inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; ret = do_vdi_create(dst_vdi, inode->vdi_size, base_vid, &new_vid, false, inode->nr_copies, inode->copy_policy, inode->store_policy); if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc) goto out; new_inode = xmalloc(sizeof(*inode)); ret = read_vdi_obj(dst_vdi, 0, "", NULL, new_inode, SD_INODE_HEADER_SIZE); if (ret != EXIT_SUCCESS) goto out; buf = xzalloc(SD_DATA_OBJ_SIZE); max_idx = count_data_objs(inode); for (idx = 0; idx < max_idx; idx++) { size_t size; vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size); vdi_id = INODE_GET_VID(inode, idx); if (vdi_id) { oid = vid_to_data_oid(vdi_id, idx); ret = dog_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0, true); if (ret) { ret = EXIT_FAILURE; goto out; } size = SD_DATA_OBJ_SIZE; } else size = 0; oid = vid_to_data_oid(new_vid, idx); ret = dog_write_object(oid, 0, buf, size, 0, 0, inode->nr_copies, inode->copy_policy, true, true); if (ret != SD_RES_SUCCESS) { ret = EXIT_FAILURE; goto out; } INODE_SET_VID(new_inode, idx, new_vid); ret = sd_inode_write_vid(dog_bnode_writer, new_inode, idx, new_vid, new_vid, 0, false, true); if (ret) { ret = EXIT_FAILURE; goto out; } } vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size); ret = EXIT_SUCCESS; out: if (ret == EXIT_SUCCESS && verbose) { if (raw_output) printf("%x\n", new_vid); else printf("VDI ID of newly created clone: %x\n", new_vid); } free(inode); if (new_inode) free(new_inode); free(buf); return 
ret; } static int vdi_resize(int argc, char **argv) { const char *vdiname = argv[optind++]; uint64_t new_size; uint32_t vid; int ret; char buf[SD_INODE_HEADER_SIZE]; struct sd_inode *inode = (struct sd_inode *)buf; if (!argv[optind]) { sd_err("Please specify the new size for the VDI"); return EXIT_USAGE; } ret = option_parse_size(argv[optind], &new_size); if (ret < 0) return EXIT_USAGE; ret = read_vdi_obj(vdiname, 0, "", &vid, inode, SD_INODE_HEADER_SIZE); if (ret != EXIT_SUCCESS) return ret; if (new_size > SD_OLD_MAX_VDI_SIZE && 0 == inode->store_policy) { sd_err("New VDI size is too large for a non-hyper volume"); return EXIT_USAGE; } if (new_size > SD_MAX_VDI_SIZE) { sd_err("New VDI size is too large"); return EXIT_USAGE; } if (new_size < inode->vdi_size) { sd_err("Shrinking VDIs is not implemented"); return EXIT_USAGE; } inode->vdi_size = new_size; ret = dog_write_object(vid_to_vdi_oid(vid), 0, inode, SD_INODE_HEADER_SIZE, 0, 0, inode->nr_copies, inode->copy_policy, false, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to update an inode header"); return EXIT_FAILURE; } return EXIT_SUCCESS; } static int do_vdi_delete(const char *vdiname, int snap_id, const char *snap_tag) { int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char data[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; uint32_t vid; ret = find_vdi_name(vdiname, snap_id, snap_tag, &vid, 0); if (ret < 0) { sd_err("Failed to open VDI %s", vdiname); return EXIT_FAILURE; } sd_init_req(&hdr, SD_OP_DELETE_CACHE); hdr.obj.oid = vid_to_vdi_oid(vid); ret = send_light_req(&sd_nid, &hdr); if (ret) { sd_err("failed to execute request"); return EXIT_FAILURE; } sd_init_req(&hdr, SD_OP_DEL_VDI); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = sizeof(data); hdr.vdi.snapid = snap_id; memset(data, 0, sizeof(data)); pstrcpy(data, SD_MAX_VDI_LEN, vdiname); if (snap_tag) pstrcpy(data + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag); ret = dog_exec_req(&sd_nid, &hdr, data); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to delete %s: %s", vdiname, sd_strerror(rsp->result)); if (rsp->result == SD_RES_NO_VDI) return EXIT_MISSING; else return EXIT_FAILURE; } return EXIT_SUCCESS; } static int vdi_delete(int argc, char **argv) { const char *vdiname = argv[optind]; return do_vdi_delete(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag); } static int vdi_rollback(int argc, char **argv) { const char *vdiname = argv[optind++]; uint32_t base_vid, new_vid; int ret; char buf[SD_INODE_HEADER_SIZE]; struct sd_inode *inode = (struct sd_inode *)buf; if (!vdi_cmd_data.snapshot_id && !vdi_cmd_data.snapshot_tag[0]) { sd_err("Please specify the '-s' option"); return EXIT_USAGE; } ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, &base_vid, inode, SD_INODE_HEADER_SIZE); if (ret != EXIT_SUCCESS) return ret; if (!vdi_cmd_data.force) confirm("This operation discards any changes made since the" " previous\nsnapshot was taken. Continue? 
[yes/no]: "); ret = do_vdi_delete(vdiname, 0, NULL); if (ret != SD_RES_SUCCESS) { sd_err("Failed to delete the current state"); return EXIT_FAILURE; } ret = do_vdi_create(vdiname, inode->vdi_size, base_vid, &new_vid, false, vdi_cmd_data.nr_copies, inode->copy_policy, inode->store_policy); if (ret == EXIT_SUCCESS && verbose) { if (raw_output) printf("%x\n", new_vid); else printf("New VDI ID of rollbacked VDI: %x\n", new_vid); } return ret; } static int vdi_object(int argc, char **argv) { const char *vdiname = argv[optind]; unsigned idx = vdi_cmd_data.index; struct get_vdi_info info; uint32_t vid; size_t size; memset(&info, 0, sizeof(info)); info.name = vdiname; info.tag = vdi_cmd_data.snapshot_tag; info.vid = 0; info.snapid = vdi_cmd_data.snapshot_id; if (parse_vdi(vdi_info_filler, SD_INODE_HEADER_SIZE, &info) < 0) return EXIT_SYSFAIL; vid = info.vid; if (vid == 0) { sd_err("VDI not found"); return EXIT_MISSING; } if (idx == ~0) { printf("Looking for the inode object 0x%" PRIx32 " with %d nodes\n\n", vid, sd_nodes_nr); parse_objs(vid_to_vdi_oid(vid), do_print_obj, NULL, SD_INODE_SIZE); } else { struct obj_info_filler_info oid_info = {0}; oid_info.success = false; oid_info.idx = idx; if (idx >= MAX_DATA_OBJS) { printf("The offset is too large!\n"); exit(EXIT_FAILURE); } size = get_store_objsize(info.copy_policy, vid_to_data_oid(vid, 0)); parse_objs(vid_to_vdi_oid(vid), obj_info_filler, &oid_info, size); if (oid_info.success) { if (oid_info.data_oid) { printf("Looking for the object 0x%" PRIx64 " (vid 0x%" PRIx32 " idx %u, %u copies) " "with %d nodes\n\n", oid_info.data_oid, vid, idx, info.nr_copies, sd_nodes_nr); parse_objs(oid_info.data_oid, do_print_obj, NULL, size); } else printf("The inode object 0x%" PRIx32 " idx %u is not allocated\n", vid, idx); } else sd_err("Failed to read the inode object 0x%" PRIx32, vid); } return EXIT_SUCCESS; } static int do_track_object(uint64_t oid, uint8_t nr_copies) { int i, j, ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; const struct sd_vnode *vnode_buf[SD_MAX_COPIES]; struct epoch_log *logs; int nr_logs, log_length; log_length = sd_epoch * sizeof(struct epoch_log); logs = xmalloc(log_length); sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; ret = dog_exec_req(&sd_nid, &hdr, logs); if (ret < 0) goto error; if (rsp->result != SD_RES_SUCCESS) { printf("%s\n", sd_strerror(rsp->result)); goto error; } nr_logs = rsp->data_length / sizeof(struct epoch_log); for (i = nr_logs - 1; i >= 0; i--) { struct rb_root vroot = RB_ROOT; struct rb_root nroot = RB_ROOT; printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n", oid, logs[i].epoch, nr_copies); printf("---------------------------------------------------\n"); /* * When # of nodes is less than nr_copies, we only print * remaining nodes that holds all the remaining copies. 
*/ if (logs[i].nr_nodes < nr_copies) { for (j = 0; j < logs[i].nr_nodes; j++) { const struct node_id *n = &logs[i].nodes[j].nid; printf("%s\n", addr_to_str(n->addr, n->port)); } continue; } for (int k = 0; k < logs[i].nr_nodes; k++) rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp); nodes_to_vnodes(&nroot, &vroot); oid_to_vnodes(oid, &vroot, nr_copies, vnode_buf); for (j = 0; j < nr_copies; j++) { const struct node_id *n = &vnode_buf[j]->node->nid; printf("%s\n", addr_to_str(n->addr, n->port)); } rb_destroy(&vroot, struct sd_vnode, rb); } free(logs); return EXIT_SUCCESS; error: free(logs); return EXIT_SYSFAIL; } static int vdi_track(int argc, char **argv) { const char *vdiname = argv[optind]; unsigned idx = vdi_cmd_data.index; struct get_vdi_info info; struct obj_info_filler_info oid_info = {0}; uint32_t vid; uint8_t nr_copies; uint64_t oid = vdi_cmd_data.oid; memset(&info, 0, sizeof(info)); info.name = vdiname; info.tag = vdi_cmd_data.snapshot_tag; info.vid = 0; info.snapid = vdi_cmd_data.snapshot_id; if (parse_vdi(vdi_info_filler, SD_INODE_HEADER_SIZE, &info) < 0) return EXIT_SYSFAIL; vid = info.vid; nr_copies = info.nr_copies; if (vid == 0) { sd_err("VDI not found"); return EXIT_MISSING; } if (!oid) { if (idx == ~0) { printf("Tracking the inode object 0x%" PRIx32 " with %d nodes\n", vid, sd_nodes_nr); return do_track_object(vid_to_vdi_oid(vid), nr_copies); } oid_info.success = false; oid_info.idx = idx; if (idx >= MAX_DATA_OBJS) { printf("The offset is too large!\n"); goto err; } parse_objs(vid_to_vdi_oid(vid), obj_info_filler, &oid_info, get_store_objsize(info.copy_policy, vid_to_data_oid(vid, 0))); if (!oid_info.success) { sd_err("Failed to read the inode object 0x%" PRIx32, vid); goto err; } if (!oid_info.data_oid) { printf("The inode object 0x%"PRIx32 " idx %u is not allocated\n", vid, idx); goto err; } oid = oid_info.data_oid; printf("Tracking the object 0x%" PRIx64 " (the inode vid 0x%" PRIx32 " idx %u)" " with %d nodes\n", oid, vid, idx, sd_nodes_nr); } else printf("Tracking the object 0x%" PRIx64 " (the inode vid 0x%" PRIx32 ")" " with %d nodes\n", oid, vid, sd_nodes_nr); return do_track_object(oid, nr_copies); err: return EXIT_FAILURE; } static int find_vdi_attr_oid(const char *vdiname, const char *tag, uint32_t snapid, const char *key, void *value, unsigned int value_len, uint32_t *vid, uint64_t *oid, unsigned int *nr_copies, bool create, bool excl, bool delete) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; struct sheepdog_vdi_attr vattr; memset(&vattr, 0, sizeof(vattr)); pstrcpy(vattr.name, SD_MAX_VDI_LEN, vdiname); pstrcpy(vattr.tag, SD_MAX_VDI_TAG_LEN, vdi_cmd_data.snapshot_tag); vattr.snap_id = vdi_cmd_data.snapshot_id; pstrcpy(vattr.key, SD_MAX_VDI_ATTR_KEY_LEN, key); if (value && value_len) { vattr.value_len = value_len; memcpy(vattr.value, value, value_len); } sd_init_req(&hdr, SD_OP_GET_VDI_ATTR); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = SD_ATTR_OBJ_SIZE; hdr.vdi.snapid = vdi_cmd_data.snapshot_id; if (create) hdr.flags |= SD_FLAG_CMD_CREAT; if (excl) hdr.flags |= SD_FLAG_CMD_EXCL; if (delete) hdr.flags |= SD_FLAG_CMD_DEL; ret = dog_exec_req(&sd_nid, &hdr, &vattr); if (ret < 0) return SD_RES_EIO; if (rsp->result != SD_RES_SUCCESS) return rsp->result; *vid = rsp->vdi.vdi_id; *oid = vid_to_attr_oid(rsp->vdi.vdi_id, rsp->vdi.attr_id); *nr_copies = rsp->vdi.copies; return SD_RES_SUCCESS; } static int vdi_setattr(int argc, char **argv) { int ret, value_len = 0; uint64_t attr_oid = 0; uint32_t vid = 0, nr_copies = 0; const char *vdiname = 
argv[optind++], *key; char *value; uint64_t offset; key = argv[optind++]; if (!key) { sd_err("Please specify the attribute key"); return EXIT_USAGE; } value = argv[optind++]; if (!value && !vdi_cmd_data.delete) { value = xmalloc(SD_MAX_VDI_ATTR_VALUE_LEN); offset = 0; reread: ret = read(STDIN_FILENO, value + offset, SD_MAX_VDI_ATTR_VALUE_LEN - offset); if (ret < 0) { sd_err("Failed to read attribute value from stdin: %m"); return EXIT_SYSFAIL; } if (ret > 0) { offset += ret; goto reread; } } if (value) value_len = strlen(value); ret = find_vdi_attr_oid(vdiname, vdi_cmd_data.snapshot_tag, vdi_cmd_data.snapshot_id, key, value, value_len, &vid, &attr_oid, &nr_copies, !vdi_cmd_data.delete, vdi_cmd_data.exclusive, vdi_cmd_data.delete); if (ret) { if (ret == SD_RES_VDI_EXIST) { sd_err("The attribute '%s' already exists", key); return EXIT_EXISTS; } else if (ret == SD_RES_NO_OBJ) { sd_err("Attribute '%s' not found", key); return EXIT_MISSING; } else if (ret == SD_RES_NO_VDI) { sd_err("VDI not found"); return EXIT_MISSING; } else sd_err("Failed to set attribute: %s", sd_strerror(ret)); return EXIT_FAILURE; } return EXIT_SUCCESS; } static int vdi_getattr(int argc, char **argv) { int ret; uint64_t oid, attr_oid = 0; uint32_t vid = 0, nr_copies = 0; const char *vdiname = argv[optind++], *key; struct sheepdog_vdi_attr vattr; key = argv[optind++]; if (!key) { sd_err("Please specify the attribute key"); return EXIT_USAGE; } ret = find_vdi_attr_oid(vdiname, vdi_cmd_data.snapshot_tag, vdi_cmd_data.snapshot_id, key, NULL, 0, &vid, &attr_oid, &nr_copies, false, false, false); if (ret == SD_RES_NO_OBJ) { sd_err("Attribute '%s' not found", key); return EXIT_MISSING; } else if (ret == SD_RES_NO_VDI) { sd_err("VDI not found"); return EXIT_MISSING; } else if (ret) { sd_err("Failed to find attribute oid: %s", sd_strerror(ret)); return EXIT_MISSING; } oid = attr_oid; ret = dog_read_object(oid, &vattr, SD_ATTR_OBJ_SIZE, 0, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read attribute oid: %s", sd_strerror(ret)); return EXIT_SYSFAIL; } xwrite(STDOUT_FILENO, vattr.value, vattr.value_len); return EXIT_SUCCESS; } static int vdi_read(int argc, char **argv) { const char *vdiname = argv[optind++]; int ret; struct sd_inode *inode = NULL; uint64_t offset = 0, oid, done = 0, total = (uint64_t) -1; uint32_t vdi_id, idx; unsigned int len; char *buf = NULL; if (argv[optind]) { ret = option_parse_size(argv[optind++], &offset); if (ret < 0) return EXIT_USAGE; if (argv[optind]) { ret = option_parse_size(argv[optind++], &total); if (ret < 0) return EXIT_USAGE; } } inode = malloc(sizeof(*inode)); buf = xmalloc(SD_DATA_OBJ_SIZE); ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, NULL, inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; if (inode->vdi_size < offset) { sd_err("Read offset is beyond the end of the VDI"); ret = EXIT_FAILURE; goto out; } total = min(total, inode->vdi_size - offset); idx = offset / SD_DATA_OBJ_SIZE; offset %= SD_DATA_OBJ_SIZE; while (done < total) { len = min(total - done, SD_DATA_OBJ_SIZE - offset); vdi_id = INODE_GET_VID(inode, idx); if (vdi_id) { oid = vid_to_data_oid(vdi_id, idx); ret = dog_read_object(oid, buf, len, offset, false); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read VDI"); ret = EXIT_FAILURE; goto out; } } else memset(buf, 0, len); ret = xwrite(STDOUT_FILENO, buf, len); if (ret < 0) { sd_err("Failed to write to stdout: %m"); ret = EXIT_SYSFAIL; goto out; } offset = 0; idx++; done += len; } fsync(STDOUT_FILENO); ret = EXIT_SUCCESS; out: 
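/* * Shared exit path: inode and buf are released whether the read * succeeded or bailed out early. Illustrative invocation (hypothetical * image name): "dog vdi read vol0 0 1048576 > dump.bin" streams the * first 1 MiB of vol0 to stdout. */ 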
free(inode); free(buf); return ret; } static int vdi_write(int argc, char **argv) { const char *vdiname = argv[optind++]; uint32_t vid, flags, vdi_id, idx; int ret; struct sd_inode *inode = NULL; uint64_t offset = 0, oid, old_oid, done = 0, total = (uint64_t) -1; unsigned int len; char *buf = NULL; bool create; if (argv[optind]) { ret = option_parse_size(argv[optind++], &offset); if (ret < 0) return EXIT_USAGE; if (argv[optind]) { ret = option_parse_size(argv[optind++], &total); if (ret < 0) return EXIT_USAGE; } } inode = xmalloc(sizeof(*inode)); buf = xmalloc(SD_DATA_OBJ_SIZE); ret = read_vdi_obj(vdiname, 0, "", &vid, inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; if (inode->vdi_size < offset) { sd_err("Write offset is beyond the end of the VDI"); ret = EXIT_FAILURE; goto out; } total = min(total, inode->vdi_size - offset); idx = offset / SD_DATA_OBJ_SIZE; offset %= SD_DATA_OBJ_SIZE; while (done < total) { create = false; old_oid = 0; flags = 0; len = min(total - done, SD_DATA_OBJ_SIZE - offset); vdi_id = INODE_GET_VID(inode, idx); if (!vdi_id) create = true; else if (!is_data_obj_writeable(inode, idx)) { create = true; old_oid = vid_to_data_oid(vdi_id, idx); } if (vdi_cmd_data.writeback) flags |= SD_FLAG_CMD_CACHE; ret = xread(STDIN_FILENO, buf, len); if (ret < 0) { sd_err("Failed to read from stdin: %m"); ret = EXIT_SYSFAIL; goto out; } else if (ret < len) { /* exit after this buffer is sent */ memset(buf + ret, 0, len - ret); total = done + len; } INODE_SET_VID(inode, idx, inode->vdi_id); oid = vid_to_data_oid(inode->vdi_id, idx); ret = dog_write_object(oid, old_oid, buf, len, offset, flags, inode->nr_copies, inode->copy_policy, create, false); if (ret != SD_RES_SUCCESS) { sd_err("Failed to write VDI"); ret = EXIT_FAILURE; goto out; } if (create) { ret = sd_inode_write_vid(dog_bnode_writer, inode, idx, vid, vid, flags, false, false); if (ret) { ret = EXIT_FAILURE; goto out; } } offset += len; if (offset == SD_DATA_OBJ_SIZE) { offset = 0; idx++; } done += len; } ret = EXIT_SUCCESS; out: free(inode); free(buf); return ret; } static void *read_object_from(const struct sd_vnode *vnode, uint64_t oid) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; void *buf; size_t size = get_objsize(oid); buf = xmalloc(size); sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = sd_epoch; hdr.flags = 0; hdr.data_length = size; hdr.obj.oid = oid; ret = dog_exec_req(&vnode->node->nid, &hdr, buf); if (ret < 0) exit(EXIT_SYSFAIL); switch (rsp->result) { case SD_RES_SUCCESS: break; case SD_RES_NO_OBJ: free(buf); return NULL; default: sd_err("FATAL: failed to read %"PRIx64", %s", oid, sd_strerror(rsp->result)); exit(EXIT_FAILURE); } return buf; } static void write_object_to(const struct sd_vnode *vnode, uint64_t oid, void *buf, bool create, uint8_t ec_index) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; if (create) sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_PEER); else sd_init_req(&hdr, SD_OP_WRITE_PEER); hdr.epoch = sd_epoch; hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = get_objsize(oid); hdr.obj.oid = oid; hdr.obj.ec_index = ec_index; ret = dog_exec_req(&vnode->node->nid, &hdr, buf); if (ret < 0) exit(EXIT_SYSFAIL); if (rsp->result != SD_RES_SUCCESS) { sd_err("FATAL: failed to write %"PRIx64", %s", oid, sd_strerror(rsp->result)); exit(EXIT_FAILURE); } } struct vdi_check_work { struct vdi_check_info *info; const struct sd_vnode *vnode; uint8_t hash[SHA1_DIGEST_SIZE]; uint8_t ec_index; uint8_t *buf; bool object_found; struct work work; }; enum 
vdi_check_result { VDI_CHECK_NO_OBJ_FOUND, VDI_CHECK_NO_MAJORITY_FOUND, VDI_CHECK_SUCCESS, }; struct vdi_check_info { uint64_t oid; uint8_t nr_copies; uint8_t copy_policy; uint64_t total; uint64_t *done; int refcnt; struct work_queue *wq; enum vdi_check_result result; struct vdi_check_work *majority; struct vdi_check_work vcw[0]; }; static void free_vdi_check_info(struct vdi_check_info *info) { if (info->done) { *info->done += SD_DATA_OBJ_SIZE; vdi_show_progress(*info->done, info->total); } free(info); } static void vdi_repair_work(struct work *work) { struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, work); struct vdi_check_info *info = vcw->info; void *buf; buf = read_object_from(info->majority->vnode, info->oid); write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0); free(buf); } static void vdi_repair_main(struct work *work) { struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, work); struct vdi_check_info *info = vcw->info; if (vcw->object_found) fprintf(stdout, "fixed replica %"PRIx64"\n", info->oid); else fprintf(stdout, "fixed missing %"PRIx64"\n", info->oid); info->refcnt--; if (info->refcnt == 0) free_vdi_check_info(info); } static void vdi_check_object_work(struct work *work) { struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, work); struct vdi_check_info *info = vcw->info; int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; if (is_erasure_oid(info->oid, info->copy_policy)) { sd_init_req(&hdr, SD_OP_READ_PEER); hdr.data_length = get_store_objsize(info->copy_policy, info->oid); hdr.obj.ec_index = vcw->ec_index; hdr.epoch = sd_epoch; vcw->buf = xmalloc(hdr.data_length); } else sd_init_req(&hdr, SD_OP_GET_HASH); hdr.obj.oid = info->oid; hdr.obj.tgt_epoch = sd_epoch; ret = dog_exec_req(&vcw->vnode->node->nid, &hdr, vcw->buf); if (ret < 0) exit(EXIT_SYSFAIL); switch (rsp->result) { case SD_RES_SUCCESS: vcw->object_found = true; if (!is_erasure_oid(info->oid, info->copy_policy)) memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash)); break; case SD_RES_NO_OBJ: vcw->object_found = false; break; default: sd_err("failed to read %" PRIx64 " from %s, %s", info->oid, addr_to_str(vcw->vnode->node->nid.addr, vcw->vnode->node->nid.port), sd_strerror(rsp->result)); exit(EXIT_FAILURE); } } static void check_replication_object(struct vdi_check_info *info) { if (info->majority == NULL) { switch (info->result) { case VDI_CHECK_NO_OBJ_FOUND: sd_err("no node has %" PRIx64, info->oid); break; case VDI_CHECK_NO_MAJORITY_FOUND: sd_err("no majority of %" PRIx64, info->oid); break; default: sd_err("unknown result of vdi check: %d", info->result); exit(EXIT_FAILURE); break; } /* do nothing */ return; } for (int i = 0; i < info->nr_copies; i++) { if (&info->vcw[i] == info->majority) continue; /* need repair when object not found or consistency broken */ if (!info->vcw[i].object_found || memcmp(info->majority->hash, info->vcw[i].hash, sizeof(info->majority->hash)) != 0) { info->vcw[i].work.fn = vdi_repair_work; info->vcw[i].work.done = vdi_repair_main; info->refcnt++; queue_work(info->wq, &info->vcw[i].work); } } } static void check_erasure_object(struct vdi_check_info *info) { int d = 0, p = 0, i, j, k; int dp = ec_policy_to_dp(info->copy_policy, &d, &p); struct fec *ctx = ec_init(d, dp); int miss_idx[dp], input_idx[dp]; uint64_t oid = info->oid; size_t len = get_store_objsize(info->copy_policy, oid); char *obj = xmalloc(len); uint8_t *input[dp]; for (i = 0; i < dp; i++) miss_idx[i] = -1; for (i = 0, j = 0, k = 0; 
i < info->nr_copies; i++) if (!info->vcw[i].object_found) { miss_idx[j++] = i; } else { input_idx[k] = i; input[k] = info->vcw[i].buf; k++; } if (!j) { /* No object missing */ int idx[d]; for (i = 0; i < d; i++) idx[i] = i; for (k = 0; k < p; k++) { uint8_t *ds[d]; for (j = 0; j < d; j++) ds[j] = info->vcw[j].buf; ec_decode_buffer(ctx, ds, idx, obj, d + k); if (memcmp(obj, info->vcw[d + k].buf, len) != 0) { /* TODO repair the inconsistency */ sd_err("object %"PRIx64" is inconsistent", oid); goto out; } } } else if (j > p) { sd_err("failed to rebuild object %"PRIx64". %d copies get " "lost, more than %d", oid, j, p); goto out; } else { for (k = 0; k < j; k++) { int m = miss_idx[k]; uint8_t *ds[d]; for (i = 0; i < d; i++) ds[i] = input[i]; ec_decode_buffer(ctx, ds, input_idx, obj, m); write_object_to(info->vcw[m].vnode, oid, obj, true, info->vcw[m].ec_index); fprintf(stdout, "fixed missing %"PRIx64", " "copy index %d\n", info->oid, m); } } out: for (i = 0; i < dp; i++) free(info->vcw[i].buf); free(obj); ec_destroy(ctx); } static void vote_majority_object(struct vdi_check_info *info) { /* * Voting majority object from existing ones. * * The linear majority vote algorithm by Boyer and Moore is used: * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/ */ int count = 0, nr_live_copies = 0; struct vdi_check_work *majority = NULL; /* step 1 */ for (int i = 0; i < info->nr_copies; i++) { struct vdi_check_work *vcw = &info->vcw[i]; if (!vcw->object_found) continue; nr_live_copies++; if (!count) majority = vcw; if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash))) count++; else count--; } /* step 2 */ if (count > 0 && count <= nr_live_copies / 2) { count = 0; for (int i = 0; i < info->nr_copies; i++) { struct vdi_check_work *vcw = &info->vcw[i]; if (!vcw->object_found) continue; if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash))) count++; } } if (!majority) info->result = VDI_CHECK_NO_OBJ_FOUND; else if (count > nr_live_copies / 2) info->result = VDI_CHECK_SUCCESS; else { /* no majority found */ majority = NULL; info->result = VDI_CHECK_NO_MAJORITY_FOUND; } info->majority = majority; } static void vdi_check_object_main(struct work *work) { struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, work); struct vdi_check_info *info = vcw->info; info->refcnt--; if (info->refcnt > 0) return; if (is_erasure_oid(info->oid, info->copy_policy)) check_erasure_object(info); else { vote_majority_object(info); check_replication_object(info); } if (info->refcnt == 0) free_vdi_check_info(info); } static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid, uint64_t *done, struct work_queue *wq, int nr_copies) { struct vdi_check_info *info; const struct sd_vnode *tgt_vnodes[SD_MAX_COPIES]; info = xzalloc(sizeof(*info) + sizeof(info->vcw[0]) * nr_copies); info->oid = oid; info->nr_copies = nr_copies; info->total = inode->vdi_size; info->done = done; info->wq = wq; info->copy_policy = inode->copy_policy; oid_to_vnodes(oid, &sd_vroot, nr_copies, tgt_vnodes); for (int i = 0; i < nr_copies; i++) { info->vcw[i].info = info; info->vcw[i].ec_index = i; info->vcw[i].vnode = tgt_vnodes[i]; info->vcw[i].work.fn = vdi_check_object_work; info->vcw[i].work.done = vdi_check_object_main; info->refcnt++; queue_work(info->wq, &info->vcw[i].work); } } struct check_arg { const struct sd_inode *inode; uint64_t *done; struct work_queue *wq; int nr_copies; }; static void check_cb(void *data, enum btree_node_type type, void *arg) { struct sd_extent *ext; struct check_arg *carg = arg; uint64_t oid; 
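/* * Callback for traverse_btree() over a hyper-volume inode: only extent * entries (BTREE_EXT) reference data objects, so every other node type * is ignored below. */ 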
if (type == BTREE_EXT) { ext = (struct sd_extent *)data; if (ext->vdi_id) { oid = vid_to_data_oid(ext->vdi_id, ext->idx); *(carg->done) = (uint64_t)ext->idx * SD_DATA_OBJ_SIZE; vdi_show_progress(*(carg->done), carg->inode->vdi_size); queue_vdi_check_work(carg->inode, oid, NULL, carg->wq, carg->nr_copies); } } } int do_vdi_check(const struct sd_inode *inode) { uint32_t max_idx; uint64_t done = 0, oid; uint32_t vid; struct work_queue *wq; int nr_copies = min((int)inode->nr_copies, sd_zones_nr); if (0 < inode->copy_policy && sd_zones_nr < nr_copies) { sd_err("ABORT: Not enough active zones for consistency-checking" " erasure coded VDI"); return EXIT_FAILURE; } wq = create_work_queue("vdi check", WQ_DYNAMIC); init_fec(); queue_vdi_check_work(inode, vid_to_vdi_oid(inode->vdi_id), NULL, wq, nr_copies); if (inode->store_policy == 0) { max_idx = count_data_objs(inode); vdi_show_progress(done, inode->vdi_size); for (uint32_t idx = 0; idx < max_idx; idx++) { vid = INODE_GET_VID(inode, idx); if (vid) { oid = vid_to_data_oid(vid, idx); queue_vdi_check_work(inode, oid, &done, wq, nr_copies); } else { done += SD_DATA_OBJ_SIZE; vdi_show_progress(done, inode->vdi_size); } } } else { struct check_arg arg = {inode, &done, wq, nr_copies}; traverse_btree(dog_bnode_reader, inode, check_cb, &arg); vdi_show_progress(inode->vdi_size, inode->vdi_size); } work_queue_wait(wq); fprintf(stdout, "finish check&repair %s\n", inode->name); return EXIT_SUCCESS; } static int vdi_check(int argc, char **argv) { const char *vdiname = argv[optind++]; int ret; struct sd_inode *inode = xmalloc(sizeof(*inode)); ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, NULL, inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) { sd_err("FATAL: no inode objects"); return ret; } return do_vdi_check(inode); } /* vdi backup format */ #define VDI_BACKUP_FORMAT_VERSION 1 #define VDI_BACKUP_MAGIC 0x11921192 struct backup_hdr { uint32_t version; uint32_t magic; }; struct obj_backup { uint32_t idx; uint32_t offset; uint32_t length; uint32_t reserved; uint8_t data[SD_DATA_OBJ_SIZE]; }; /* discards redundant area from backup data */ static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data) { uint8_t *p1, *p2; p1 = backup->data; p2 = from_data; while (backup->length > 0 && memcmp(p1, p2, SECTOR_SIZE) == 0) { p1 += SECTOR_SIZE; p2 += SECTOR_SIZE; backup->offset += SECTOR_SIZE; backup->length -= SECTOR_SIZE; } p1 = backup->data + SD_DATA_OBJ_SIZE - SECTOR_SIZE; p2 = from_data + SD_DATA_OBJ_SIZE - SECTOR_SIZE; while (backup->length > 0 && memcmp(p1, p2, SECTOR_SIZE) == 0) { p1 -= SECTOR_SIZE; p2 -= SECTOR_SIZE; backup->length -= SECTOR_SIZE; } } static int get_obj_backup(uint32_t idx, uint32_t from_vid, uint32_t to_vid, struct obj_backup *backup) { int ret; uint8_t *from_data = xzalloc(SD_DATA_OBJ_SIZE); backup->idx = idx; backup->offset = 0; backup->length = SD_DATA_OBJ_SIZE; if (to_vid) { ret = dog_read_object(vid_to_data_oid(to_vid, idx), backup->data, SD_DATA_OBJ_SIZE, 0, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read object %" PRIx32 ", %d", to_vid, idx); return EXIT_FAILURE; } } else memset(backup->data, 0, SD_DATA_OBJ_SIZE); if (from_vid) { ret = dog_read_object(vid_to_data_oid(from_vid, idx), from_data, SD_DATA_OBJ_SIZE, 0, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read object %" PRIx32 ", %d", from_vid, idx); return EXIT_FAILURE; } } compact_obj_backup(backup, from_data); free(from_data); return EXIT_SUCCESS; } static int vdi_backup(int argc, char **argv) { const char *vdiname 
= argv[optind++]; int ret = EXIT_SUCCESS; uint32_t idx, nr_objs; struct sd_inode *from_inode = xzalloc(sizeof(*from_inode)); struct sd_inode *to_inode = xzalloc(sizeof(*to_inode)); struct backup_hdr hdr = { .version = VDI_BACKUP_FORMAT_VERSION, .magic = VDI_BACKUP_MAGIC, }; struct obj_backup *backup = xzalloc(sizeof(*backup)); if ((!vdi_cmd_data.snapshot_id && !vdi_cmd_data.snapshot_tag[0]) || (!vdi_cmd_data.from_snapshot_id && !vdi_cmd_data.from_snapshot_tag[0])) { sd_err("Please specify snapshots with '-F' and '-s' options"); ret = EXIT_USAGE; goto out; } ret = read_vdi_obj(vdiname, vdi_cmd_data.from_snapshot_id, vdi_cmd_data.from_snapshot_tag, NULL, from_inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, NULL, to_inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; nr_objs = count_data_objs(to_inode); ret = xwrite(STDOUT_FILENO, &hdr, sizeof(hdr)); if (ret < 0) { sd_err("failed to write backup header, %m"); ret = EXIT_SYSFAIL; goto out; } for (idx = 0; idx < nr_objs; idx++) { uint32_t from_vid = INODE_GET_VID(from_inode, idx); uint32_t to_vid = INODE_GET_VID(to_inode, idx); if (to_vid == 0 && from_vid == 0) continue; ret = get_obj_backup(idx, from_vid, to_vid, backup); if (ret != EXIT_SUCCESS) goto out; if (backup->length == 0) continue; ret = xwrite(STDOUT_FILENO, backup, sizeof(*backup) - sizeof(backup->data)); if (ret < 0) { sd_err("failed to write backup data, %m"); ret = EXIT_SYSFAIL; goto out; } ret = xwrite(STDOUT_FILENO, backup->data + backup->offset, backup->length); if (ret < 0) { sd_err("failed to write backup data, %m"); ret = EXIT_SYSFAIL; goto out; } } /* write end marker */ memset(backup, 0, sizeof(*backup) - sizeof(backup->data)); backup->idx = UINT32_MAX; ret = xwrite(STDOUT_FILENO, backup, sizeof(*backup) - sizeof(backup->data)); if (ret < 0) { sd_err("failed to write end marker, %m"); ret = EXIT_SYSFAIL; goto out; } fsync(STDOUT_FILENO); ret = EXIT_SUCCESS; out: free(from_inode); free(to_inode); free(backup); return ret; } /* restore backup data to vdi */ static int restore_obj(struct obj_backup *backup, uint32_t vid, struct sd_inode *parent_inode) { int ret; uint32_t parent_vid = INODE_GET_VID(parent_inode, backup->idx); uint64_t parent_oid = 0; if (parent_vid) parent_oid = vid_to_data_oid(parent_vid, backup->idx); /* send a copy-on-write request */ ret = dog_write_object(vid_to_data_oid(vid, backup->idx), parent_oid, backup->data, backup->length, backup->offset, 0, parent_inode->nr_copies, parent_inode->copy_policy, true, true); if (ret != SD_RES_SUCCESS) return ret; return dog_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid), SD_INODE_HEADER_SIZE + sizeof(vid) * backup->idx, 0, parent_inode->nr_copies, parent_inode->copy_policy, false, true); } static uint32_t do_restore(const char *vdiname, int snapid, const char *tag) { int ret; uint32_t vid; struct backup_hdr hdr; struct obj_backup *backup = xzalloc(sizeof(*backup)); struct sd_inode *inode = xzalloc(sizeof(*inode)); ret = xread(STDIN_FILENO, &hdr, sizeof(hdr)); if (ret != sizeof(hdr)) sd_err("failed to read backup header, %m"); if (hdr.version != VDI_BACKUP_FORMAT_VERSION || hdr.magic != VDI_BACKUP_MAGIC) { sd_err("The backup file is corrupted"); ret = EXIT_SYSFAIL; goto out; } ret = read_vdi_obj(vdiname, snapid, tag, NULL, inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; ret = do_vdi_create(vdiname, inode->vdi_size, inode->vdi_id, &vid, false, inode->nr_copies, inode->copy_policy, inode->store_policy); if 
(ret != EXIT_SUCCESS) { sd_err("Failed to read VDI"); goto out; } while (true) { ret = xread(STDIN_FILENO, backup, sizeof(*backup) - sizeof(backup->data)); if (ret != sizeof(*backup) - sizeof(backup->data)) { sd_err("failed to read backup data"); ret = EXIT_SYSFAIL; break; } if (backup->idx == UINT32_MAX) { ret = EXIT_SUCCESS; break; } ret = xread(STDIN_FILENO, backup->data, backup->length); if (ret != backup->length) { sd_err("failed to read backup data"); ret = EXIT_SYSFAIL; break; } ret = restore_obj(backup, vid, inode); if (ret != SD_RES_SUCCESS) { sd_err("failed to restore backup"); do_vdi_delete(vdiname, 0, NULL); ret = EXIT_FAILURE; break; } } out: free(backup); free(inode); return ret; } static int vdi_restore(int argc, char **argv) { const char *vdiname = argv[optind++]; int ret; char buf[SD_INODE_HEADER_SIZE] = {0}; struct sd_inode *inode_for_check = xzalloc(sizeof(*inode_for_check)); struct sd_inode *current_inode = xzalloc(sizeof(*current_inode)); struct sd_inode *parent_inode = (struct sd_inode *)buf; bool need_current_recovery = false; if (!vdi_cmd_data.snapshot_id && !vdi_cmd_data.snapshot_tag[0]) { sd_err("We can restore a backup file only to snapshots"); sd_err("Please specify the '-s' option"); ret = EXIT_USAGE; goto out; } ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, NULL, inode_for_check, SD_INODE_SIZE); if (ret != SD_RES_SUCCESS) { sd_err("Snapshot ID %d or tag %s doesn't exist", vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag); goto out; } /* * delete the current vdi temporarily first to avoid making * the current state become snapshot */ ret = read_vdi_obj(vdiname, 0, "", NULL, current_inode, SD_INODE_HEADER_SIZE); if (ret != EXIT_SUCCESS) goto out; ret = dog_read_object(vid_to_vdi_oid(current_inode->parent_vdi_id), parent_inode, SD_INODE_HEADER_SIZE, 0, true); if (ret != SD_RES_SUCCESS) { printf("error\n"); goto out; } if (is_stdin_console()) { sd_err("stdin must be pipe"); ret = EXIT_USAGE; goto out; } ret = do_vdi_delete(vdiname, 0, NULL); if (ret != EXIT_SUCCESS) { sd_err("Failed to delete the current state"); goto out; } need_current_recovery = true; /* restore backup data */ ret = do_restore(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag); out: if (need_current_recovery) { int recovery_ret; /* recreate the current vdi object */ recovery_ret = do_vdi_create(vdiname, current_inode->vdi_size, current_inode->parent_vdi_id, NULL, true, current_inode->nr_copies, current_inode->copy_policy, current_inode->store_policy); if (recovery_ret != EXIT_SUCCESS) { sd_err("failed to resume the current vdi"); ret = recovery_ret; } } free(current_inode); free(inode_for_check); return ret; } static int vdi_cache_flush(int argc, char **argv) { const char *vdiname; struct sd_req hdr; uint32_t vid; int ret = EXIT_SUCCESS; if (optind < argc) vdiname = argv[optind++]; else { sd_err("please specify VDI name"); ret = EXIT_FAILURE; goto out; } ret = find_vdi_name(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, &vid, 0); if (ret < 0) { sd_err("Failed to open VDI %s", vdiname); ret = EXIT_FAILURE; goto out; } sd_init_req(&hdr, SD_OP_FLUSH_VDI); hdr.obj.oid = vid_to_vdi_oid(vid); ret = send_light_req(&sd_nid, &hdr); if (ret) { sd_err("failed to execute request"); return EXIT_FAILURE; } out: return ret; } static int vdi_cache_delete(int argc, char **argv) { const char *vdiname; struct sd_req hdr; uint32_t vid; int ret = EXIT_SUCCESS; if (optind < argc) vdiname = argv[optind++]; else { sd_err("please specify VDI name"); ret = 
EXIT_FAILURE; goto out; } ret = find_vdi_name(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, &vid, 0); if (ret < 0) { sd_err("Failed to open VDI %s", vdiname); ret = EXIT_FAILURE; goto out; } sd_init_req(&hdr, SD_OP_DELETE_CACHE); hdr.obj.oid = vid_to_vdi_oid(vid); ret = send_light_req(&sd_nid, &hdr); if (ret) { sd_err("failed to execute request"); return EXIT_FAILURE; } out: return ret; } static int vid_to_name_tag(uint32_t vid, char *name, char *tag) { struct sd_inode inode; int ret; ret = dog_read_object(vid_to_vdi_oid(vid), &inode, SD_INODE_HEADER_SIZE, 0, true); if (ret != SD_RES_SUCCESS) return ret; pstrcpy(name, SD_MAX_VDI_LEN, inode.name); pstrcpy(tag, SD_MAX_VDI_TAG_LEN, inode.tag); return SD_RES_SUCCESS; } static int vdi_cache_info(int argc, char **argv) { struct object_cache_info info = {}; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret, i; sd_init_req(&hdr, SD_OP_GET_CACHE_INFO); hdr.data_length = sizeof(info); ret = dog_exec_req(&sd_nid, &hdr, &info); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("failed to get cache information: %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } fprintf(stdout, "Name\tTag\tTotal\tDirty\tClean\n"); for (i = 0; i < info.count; i++) { uint64_t total = info.caches[i].total * SD_DATA_OBJ_SIZE, dirty = info.caches[i].dirty * SD_DATA_OBJ_SIZE, clean = total - dirty; char name[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; ret = vid_to_name_tag(info.caches[i].vid, name, tag); if (ret != SD_RES_SUCCESS) return EXIT_FAILURE; fprintf(stdout, "%s\t%s\t%s\t%s\t%s\n", name, tag, strnumber(total), strnumber(dirty), strnumber(clean)); } fprintf(stdout, "\nCache size %s, used %s, %s\n", strnumber(info.size), strnumber(info.used), info.directio ? "directio" : "non-directio"); return EXIT_SUCCESS; } static int vdi_cache_purge(int argc, char **argv) { const char *vdiname; struct sd_req hdr; uint32_t vid; int ret = EXIT_SUCCESS; sd_init_req(&hdr, SD_OP_CACHE_PURGE); if (optind < argc) { vdiname = argv[optind++]; ret = find_vdi_name(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, &vid, 0); if (ret < 0) { sd_err("Failed to open VDI %s", vdiname); ret = EXIT_FAILURE; goto out; } hdr.obj.oid = vid_to_vdi_oid(vid); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = 0; } else { confirm("This operation purges the cache of all the vdis" ". Continue? 
[yes/no]: "); } ret = send_light_req(&sd_nid, &hdr); if (ret) { sd_err("failed to execute request"); return EXIT_FAILURE; } out: return ret; } static struct subcommand vdi_cache_cmd[] = { {"flush", NULL, NULL, "flush the cache of the vdi specified.", NULL, CMD_NEED_ARG, vdi_cache_flush}, {"delete", NULL, NULL, "delete the cache of the vdi specified in all nodes.", NULL, CMD_NEED_ARG, vdi_cache_delete}, {"info", NULL, NULL, "show usage of the cache", NULL, 0, vdi_cache_info}, {"purge", NULL, NULL, "purge the cache of all vdi (no flush)", NULL, 0, vdi_cache_purge}, {NULL,}, }; static int vdi_cache(int argc, char **argv) { return do_generic_subcommand(vdi_cache_cmd, argc, argv); } static struct subcommand vdi_cmd[] = { {"check", "", "saph", "check and repair image's consistency", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_check, vdi_options}, {"create", " ", "Pycaphrv", "create an image", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_create, vdi_options}, {"snapshot", "", "saphrv", "create a snapshot", NULL, CMD_NEED_ARG, vdi_snapshot, vdi_options}, {"clone", " ", "sPcaphrv", "clone an image", NULL, CMD_NEED_ARG, vdi_clone, vdi_options}, {"delete", "", "saph", "delete an image", NULL, CMD_NEED_ARG, vdi_delete, vdi_options}, {"rollback", "", "saphfrv", "rollback to a snapshot", NULL, CMD_NEED_ARG, vdi_rollback, vdi_options}, {"list", "[vdiname]", "aprh", "list images", NULL, 0, vdi_list, vdi_options}, {"tree", NULL, "aph", "show images in tree view format", NULL, 0, vdi_tree, vdi_options}, {"graph", NULL, "aph", "show images in Graphviz dot format", NULL, 0, vdi_graph, vdi_options}, {"object", "", "isaph", "show object information in the image", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_object, vdi_options}, {"track", "", "isapho", "show the object epoch trace in the image", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_track, vdi_options}, {"setattr", " [value]", "dxaph", "set a VDI attribute", NULL, CMD_NEED_ARG, vdi_setattr, vdi_options}, {"getattr", " ", "aph", "get a VDI attribute", NULL, CMD_NEED_ARG, vdi_getattr, vdi_options}, {"resize", " ", "aph", "resize an image", NULL, CMD_NEED_ARG, vdi_resize, vdi_options}, {"read", " [ []]", "saph", "read data from an image", NULL, CMD_NEED_ARG, vdi_read, vdi_options}, {"write", " [ []]", "apwh", "write data to an image", NULL, CMD_NEED_ARG, vdi_write, vdi_options}, {"backup", " ", "sFaph", "create an incremental backup between two snapshots", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_backup, vdi_options}, {"restore", " ", "saph", "restore snapshot images from a backup", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_restore, vdi_options}, {"cache", "", "saph", "Run 'dog vdi cache' for more information", vdi_cache_cmd, CMD_NEED_ARG, vdi_cache, vdi_options}, {NULL,}, }; static int vdi_parser(int ch, const char *opt) { char *p; switch (ch) { case 'P': vdi_cmd_data.prealloc = true; break; case 'i': vdi_cmd_data.index = strtol(opt, &p, 10); if (opt == p) { sd_err("The index must be an integer"); exit(EXIT_FAILURE); } break; case 's': vdi_cmd_data.snapshot_id = strtol(opt, &p, 10); if (opt == p || *p != '\0') { vdi_cmd_data.snapshot_id = 0; pstrcpy(vdi_cmd_data.snapshot_tag, sizeof(vdi_cmd_data.snapshot_tag), opt); } else if (vdi_cmd_data.snapshot_id == 0) { fprintf(stderr, "The snapshot id must be larger than zero\n"); exit(EXIT_FAILURE); } break; case 'x': vdi_cmd_data.exclusive = true; break; case 'd': vdi_cmd_data.delete = true; break; case 'w': vdi_cmd_data.writeback = true; break; case 'c': vdi_cmd_data.nr_copies = parse_copy(opt, &vdi_cmd_data.copy_policy); 
if (!vdi_cmd_data.nr_copies) { sd_err("Invalid parameter %s\n" "To create replicated vdi, set -c x\n" " x(1 to %d) - number of replicated copies\n" "To create erasure coded vdi, set -c x:y\n" " x(2,4,8,16) - number of data strips\n" " y(1 to 15) - number of parity strips", opt, SD_MAX_COPIES); exit(EXIT_FAILURE); } break; case 'F': vdi_cmd_data.from_snapshot_id = strtol(opt, &p, 10); if (opt == p || *p != '\0') { vdi_cmd_data.from_snapshot_id = 0; pstrcpy(vdi_cmd_data.from_snapshot_tag, sizeof(vdi_cmd_data.from_snapshot_tag), opt); } break; case 'f': vdi_cmd_data.force = true; break; case 'y': vdi_cmd_data.store_policy = 1; break; case 'o': vdi_cmd_data.oid = strtoll(opt, &p, 16); if (opt == p) { sd_err("object id must be a hex integer"); exit(EXIT_FAILURE); } break; } return 0; } struct command vdi_command = { "vdi", vdi_cmd, vdi_parser }; sheepdog-0.8.3/include/000077500000000000000000000000001237656255000147565ustar00rootroot00000000000000sheepdog-0.8.3/include/Makefile.am000066400000000000000000000004261237656255000170140ustar00rootroot00000000000000MAINTAINERCLEANFILES = Makefile.in config.h.in noinst_HEADERS = bitops.h event.h logger.h sheepdog_proto.h util.h \ list.h net.h sheep.h exits.h strbuf.h rbtree.h \ sha1.h option.h internal_proto.h shepherd.h work.h \ sockfd_cache.h compiler.h fec.h sheepdog-0.8.3/include/bitops.h000066400000000000000000000111441237656255000164300ustar00rootroot00000000000000#ifndef __BITOPS_H__ #define __BITOPS_H__ #include #include "util.h" #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) #define BITS_PER_BYTE 8 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #define DECLARE_BITMAP(name, bits) \ unsigned long name[BITS_TO_LONGS(bits)] #define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long)) #define BITS_PER_UINT64 (BITS_PER_BYTE * sizeof(uint64_t)) #define __ffs(x) (x ? __builtin_ffsl(x) - 1 : 0) #define ffz(x) __ffs(~(x)) #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) /* * Iterate over a bitmap * * @nr: the bit number to use as a loop cursor * @addr: the bitmap you iterate over * @bits: the number of bits this bitmap contains */ #define FOR_EACH_BIT(nr, addr, bits) \ for (nr = find_next_bit((addr), (bits), 0); \ nr < (bits); \ nr = find_next_bit((addr), (bits), nr + 1)) /* * Change the size of allocated bitmap * * This doesn't change the contents of the old bitmap pointed to by `old_bmap`, * and initializes the newly allocated area with zeros. 
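 * * A minimal usage sketch (illustrative, assuming xrealloc() follows plain * realloc() semantics): alloc_bitmap(NULL, 0, 128) returns a zero-filled * 128-bit map, and a later alloc_bitmap(bmap, 128, 256) keeps the first * 128 bits while zero-filling the new tail. 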
*/ static inline unsigned long *alloc_bitmap(unsigned long *old_bmap, size_t old_bits, size_t new_bits) { size_t old_size = BITS_TO_LONGS(old_bits) * sizeof(long); size_t new_size = BITS_TO_LONGS(new_bits) * sizeof(long); unsigned long *new_bmap = xrealloc(old_bmap, new_size); if (old_bits < new_bits) memset((char *)new_bmap + old_size, 0, new_size - old_size); return new_bmap; } static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; if (offset >= size) return size; size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp |= ~0UL >> (BITS_PER_LONG - offset); if (size < BITS_PER_LONG) goto found_first; if (~tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG-1)) { tmp = *(p++); if (~tmp) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = *p; found_first: tmp |= ~0UL << size; if (tmp == ~0UL) /* Are any bits zero? */ return result + size; /* Nope. */ found_middle: return result + ffz(tmp); } static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; if (offset >= size) return size; size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp &= (~0UL << offset); if (size < BITS_PER_LONG) goto found_first; if (tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG-1)) { tmp = *(p++); if (tmp) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = *p; found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. */ found_middle: return result + __ffs(tmp); } static inline void set_bit(int nr, unsigned long *addr) { addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG); } static inline void set_bit_64(int nr, uint64_t *addr) { addr[nr / BITS_PER_UINT64] |= 1ULL << (nr % BITS_PER_UINT64); } static inline void atomic_set_bit(int nr, unsigned long *addr) { uatomic_or(addr + nr / BITS_PER_LONG, 1UL << (nr % BITS_PER_LONG)); } static inline int test_bit(unsigned int nr, const unsigned long *addr) { return ((1UL << (nr % BITS_PER_LONG)) & (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; } static inline void clear_bit(unsigned int nr, unsigned long *addr) { addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG)); } /* * fls64 - find last set bit in a 64-bit word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffsll, but returns the position of the most significant set bit. * * fls64(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 64. 
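 * * For example: fls64(0) = 0, fls64(1) = 1, fls64(8) = 4 and * fls64(1ULL << 63) = 64. 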
*/ #if SIZEOF_LONG == 4 static __always_inline int fls64(uint64_t x) { uint32_t h = x >> 32; if (x == 0) return 0; if (h) return 64 - __builtin_clzl(h); return 32 - __builtin_clzl(x); } #elif SIZEOF_LONG == 8 static __always_inline int fls64(uint64_t x) { if (x == 0) return 0; return 64 - __builtin_clzl(x); } #else #error SIZEOF_LONG not 4 or 8 #endif #endif /* __BITOPS_H__ */ sheepdog-0.8.3/include/compiler.h000066400000000000000000000101141237656255000167360ustar00rootroot00000000000000/* * Copyright (C) 2009-2013 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef SD_COMPILER_H #define SD_COMPILER_H #include #include #include #include #include #include #include "config.h" #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define __LOCAL(var, line) __ ## var ## line #define _LOCAL(var, line) __LOCAL(var, line) #define LOCAL(var) _LOCAL(var, __LINE__) #define container_of(ptr, type, member) ({ \ const typeof(((type *)0)->member) *__mptr = (ptr); \ (type *)((char *)__mptr - offsetof(type, member)); }) #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define __packed __attribute((packed)) #define asmlinkage __attribute__((regparm(0))) #define __printf(a, b) __attribute__((format(printf, a, b))) /* Force a compilation error if the condition is true */ #define BUILD_BUG_ON(condition) ((void)sizeof(struct { int: -!!(condition); })) #ifdef HAVE_SYS_SIGNALFD_H #include #else #define SFD_NONBLOCK (04000) struct signalfd_siginfo { uint32_t ssi_signo; int32_t ssi_errno; int32_t ssi_code; uint32_t ssi_pid; uint32_t ssi_uid; int32_t ssi_fd; uint32_t ssi_tid; uint32_t ssi_band; uint32_t ssi_overrun; uint32_t ssi_trapno; int32_t ssi_status; int32_t ssi_int; uint64_t ssi_ptr; uint64_t ssi_utime; uint64_t ssi_stime; uint64_t ssi_addr; uint16_t ssi_addr_lsb; uint8_t __pad[46]; }; static inline int signalfd(int __fd, const sigset_t *__mask, int __flags) { return syscall(__NR_signalfd4, __fd, __mask, _NSIG / 8, __flags); } #endif #ifdef HAVE_SYS_EVENTFD_H #include #else #define EFD_SEMAPHORE (1) #define EFD_NONBLOCK (04000) #define eventfd_t uint64_t static inline int eventfd_write(int fd, eventfd_t value) { return write(fd, &value, sizeof(eventfd_t)) != sizeof(eventfd_t) ? -1 : 0; } static inline int eventfd_read(int fd, eventfd_t *value) { return read(fd, value, sizeof(eventfd_t)) != sizeof(eventfd_t) ? 
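The container_of() macro in compiler.h above is the idiom that list.h, rbtree.h and the vnode code later in this tree all build on: given a pointer to a member embedded in a struct, it recovers the enclosing struct. A self-contained illustration (editor's example; the macro is restated only so the snippet stands alone):

#include <stdio.h>
#include <stddef.h>

/* restated from compiler.h for a standalone build */
#define container_of(ptr, type, member) ({ \
	const typeof(((type *)0)->member) *__mptr = (ptr); \
	(type *)((char *)__mptr - offsetof(type, member)); })

struct link { struct link *next; };

struct request {
	int id;
	struct link node;	/* embedded, as sheepdog embeds rb/list nodes */
};

int main(void)
{
	struct request req = { .id = 7 };
	struct link *n = &req.node;

	/* recover the enclosing request from its embedded link */
	struct request *back = container_of(n, struct request, node);
	printf("id=%d same=%d\n", back->id, back == &req);	/* id=7 same=1 */
	return 0;
}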
-1 : 0; } static inline int eventfd(unsigned int initval, int flags) { return syscall(__NR_eventfd2, initval, flags); } #endif #ifdef HAVE_SYS_TIMERFD_H #include #else #define TFD_NONBLOCK (04000) static inline int timerfd_create(clockid_t __clock_id, int __flags) { return syscall(__NR_timerfd_create, __clock_id, __flags); } static inline int timerfd_settime(int __ufd, int __flags, __const struct itimerspec *__utmr, struct itimerspec *__otmr) { return syscall(__NR_timerfd_settime, __ufd, __flags, __utmr, __otmr); } #endif #ifndef HAVE_FALLOCATE static inline int fallocate(int fd, int mode, __off_t offset, __off_t len) { return syscall(__NR_fallocate, fd, mode, offset, len); } #endif #ifdef __x86_64__ #define X86_FEATURE_SSSE3 (4 * 32 + 9) /* Supplemental SSE-3 */ #define X86_FEATURE_OSXSAVE (4 * 32 + 27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4 * 32 + 28) /* Advanced Vector Extensions */ #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 #define XSTATE_YMM 0x4 #define XCR_XFEATURE_ENABLED_MASK 0x00000000 static inline int cpu_has(int flag) { uint32_t eax, ebx, ecx, edx; eax = (flag & 0x100) ? 7 : (flag & 0x20) ? 0x80000001 : 1; ecx = 0; asm volatile("cpuid" : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); return ((flag & 0x100 ? ebx : (flag & 0x80) ? ecx : edx) >> (flag & 31)) & 1; } static inline uint64_t xgetbv(uint32_t idx) { uint32_t eax, edx; asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ : "=a" (eax), "=d" (edx) : "c" (idx)); return eax + ((uint64_t)edx << 32); } #define cpu_has_ssse3 cpu_has(X86_FEATURE_SSSE3) #define cpu_has_avx cpu_has(X86_FEATURE_AVX) #define cpu_has_osxsave cpu_has(X86_FEATURE_OSXSAVE) #endif /* __x86_64__ */ #endif /* SD_COMPILER_H */ sheepdog-0.8.3/include/event.h000066400000000000000000000014421237656255000162510ustar00rootroot00000000000000#ifndef __EVENT_H__ #define __EVENT_H__ #include "list.h" #include struct event_info; typedef void (*event_handler_t)(int fd, int events, void *data); int init_event(int nr); int register_event_prio(int fd, event_handler_t h, void *data, int prio); void unregister_event(int fd); int modify_event(int fd, unsigned int events); void event_loop(int timeout); void event_loop_prio(int timeout); void event_force_refresh(void); struct timer { void (*callback)(void *); void *data; }; void add_timer(struct timer *t, unsigned int mseconds); #define EVENT_PRIO_MAX INT_MAX #define EVENT_PRIO_DEFAULT 0 #define EVENT_PRIO_MIN INT_MIN static inline int register_event(int fd, event_handler_t h, void *data) { return register_event_prio(fd, h, data, EVENT_PRIO_DEFAULT); } #endif sheepdog-0.8.3/include/exits.h000066400000000000000000000010041237656255000162560ustar00rootroot00000000000000#ifndef __EXITS_H__ #define __EXITS_H__ #define EXIT_SUCCESS 0 /* command executed successfully */ #define EXIT_FAILURE 1 /* command failed to execute */ #define EXIT_SYSFAIL 2 /* something is wrong with the cluster or local host */ #define EXIT_EXISTS 3 /* the object already exists so cannot be created */ #define EXIT_FULL 4 /* no more space is left in the cluster */ #define EXIT_MISSING 5 /* the specified object does not exist */ #define EXIT_USAGE 64 /* invalid command, arguments or options */ #endif sheepdog-0.8.3/include/fec.h000066400000000000000000000151401237656255000156650ustar00rootroot00000000000000#ifndef __FEC_H__ #define __FEC_H__ /* * zfec -- fast forward error correction library * * Copyright (C) 2007-2008 Allmyds, Inc. * Author: Zooko Wilcox-O'Hearn * * This file is part of zfec. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* * Much of this work is derived from the "fec" software by Luigi Rizzo, et * al., the copyright notice and licence terms of which are included below * for reference. * * fec.h -- forward error correction based on Vandermonde matrices * 980614 * (C) 1997-98 Luigi Rizzo (luigi@iet.unipi.it) * * Portions derived from code by Phil Karn (karn@ka9q.ampr.org), * Robert Morelos-Zaragoza (robert@spectra.eng.hawaii.edu) and Hari * Thirumoorthy (harit@spectra.eng.hawaii.edu), Aug 1995 * * Modifications by Dan Rubenstein (see Modifications.txt for * their description. * Modifications (C) 1998 Dan Rubenstein (drubenst@cs.umass.edu) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. */ #include #include #include "util.h" #include "sheepdog_proto.h" struct fec { unsigned long magic; unsigned short d, dp; /* parameters of the code */ uint8_t *enc_matrix; }; void init_fec(void); /* * param d the number of blocks required to reconstruct * param dp the total number of blocks created */ struct fec *fec_new(unsigned short d, unsigned short dp); void fec_free(struct fec *p); /* * @param inpkts the "primary blocks" i.e. the chunks of the input data * @param fecs buffers into which the secondary blocks will be written * @param block_nums the numbers of the desired check blocks (the id >= k) which * fec_encode() will produce and store into the buffers of the fecs parameter * @param num_block_nums the length of the block_nums array * @param sz size of a packet in bytes */ void fec_encode(const struct fec *code, const uint8_t *const *const src, uint8_t *const *const fecs, const int *const block_nums, size_t num_block_nums, size_t sz); /* * @param inpkts an array of packets (size k); If a primary block, i, is present * then it must be at index i. Secondary blocks can appear anywhere. 
* @param outpkts an array of buffers into which the reconstructed output * packets will be written (only packets which are not present in the inpkts * input will be reconstructed and written to outpkts) * @param index an array of the blocknums of the packets in inpkts * @param sz size of a packet in bytes */ void fec_decode(const struct fec *code, const uint8_t *const *const inpkts, uint8_t *const *const outpkts, const int *const index, size_t sz); /* * A data stripe <= 1K is a safe value for running VMs, found after some * experimentation. * * Though most OSes' file systems operate on 4K blocks, some software such as * grub operates on 512 bytes, and the Linux kernel itself will sometimes * operate on 1K blocks. I have tried 4K alignment and the centos6 installation * failed (grub got screwed up), so 1K is probably the biggest value we can use * if we want VMs to run on an erasure coded volume. */ #define SD_EC_DATA_STRIPE_SIZE (1024) /* 1K */ #define SD_EC_NR_STRIPE_PER_OBJECT (SD_DATA_OBJ_SIZE / SD_EC_DATA_STRIPE_SIZE) #define SD_EC_MAX_STRIP (16) static inline int ec_policy_to_dp(uint8_t policy, int *d, int *p) { int ed = 0, ep = 0; ep = policy & 0b1111; ed = policy >> 4; if (unlikely(!ep)) panic("invalid policy %d", policy); if (d) *d = ed * 2; if (p) *p = ep; return ed * 2 + ep; } /* * Stripe: data strips + parity strips, spread over all replicas * DS: data strip * PS: parity strip * R: Replica * * +--------------------stripe ----------------------+ * v data stripe parity stripe v * +----+----+----+----+----+-----+----+----+-----+----+ * | ds | ds | ds | ds | ds | ... | ps | ps | ... | ps | * +----+----+----+----+----+-----+----+----+-----+----+ * | .. | .. | .. | .. | .. | ... | .. | .. | ... | .. | * +----+----+----+----+----+ ... +----+----+-----+----+ * R1 R2 R3 R4 R5 ... Rn Rn+1 Rn+2 Rn+3 */ /* Return the erasure code context to encode|decode */ static inline struct fec *ec_init(int d, int dp) { return fec_new(d, dp); } /* * This function encodes the data strips and returns the parity strips * * @ds: data strips from which to generate parity strips * @ps: parity strips to return */ static inline void ec_encode(struct fec *ctx, const uint8_t *ds[], uint8_t *ps[]) { int p = ctx->dp - ctx->d; int pidx[p]; for (int i = 0; i < p; i++) pidx[i] = ctx->d + i; fec_encode(ctx, ds, ps, pidx, p, SD_EC_DATA_STRIPE_SIZE / ctx->d); } /* * This function takes input strips and returns the lost strip * * @input: strips (either ds or ps) that are used to generate lost strips * @inidx: indexes of each input strip in the whole stripe, must be in numeric * order such as { 0, 2, 4, 5 } * @output: the lost ds or ps to return * @idx: index of output which is lost */ void ec_decode(struct fec *ctx, const uint8_t *input[], const int inidx[], uint8_t output[], int idx); /* Destroy the erasure code context */ static inline void ec_destroy(struct fec *ctx) { fec_free(ctx); } void ec_decode_buffer(struct fec *ctx, uint8_t *input[], const int in_idx[], char *buf, int idx); #endif sheepdog-0.8.3/include/internal_proto.h000066400000000000000000000200531237656255000201660ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>.
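To make the copy-policy byte concrete: ec_policy_to_dp() above unpacks a single byte whose low nibble is the number of parity strips and whose high nibble is half the number of data strips. The packing helper below is inferred from that unpack logic for illustration only; it is not a function from the tree:

#include <stdio.h>
#include <stdint.h>

/* hypothetical inverse of ec_policy_to_dp(); d must be 2, 4, 8 or 16 */
static uint8_t ec_policy_pack(int d, int p)
{
	return (uint8_t)(((d / 2) << 4) | (p & 0x0f));
}

int main(void)
{
	uint8_t policy = ec_policy_pack(4, 2);	/* "dog vdi create -c 4:2" */
	int d = (policy >> 4) * 2;		/* data strips: 4 */
	int p = policy & 0x0f;			/* parity strips: 2 */

	printf("policy=0x%02x d=%d p=%d total=%d\n", policy, d, p, d + p);
	/* each data strip then covers SD_EC_DATA_STRIPE_SIZE / d = 256 bytes */
	return 0;
}

Note that a policy byte of 0 is not valid here: ec_policy_to_dp() panics on a zero parity nibble, and plain replication is expressed with a zero copy_policy plus nr_copies instead.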
*/ #ifndef __INTERNAL_PROTO_H__ #define __INTERNAL_PROTO_H__ /* * This file specifies the sheepdog-internal protocol, which is spoken between * sheepdog daemons, as well as between dog and the sheepdog daemon for internal * operations. */ #include #include #include "sheepdog_proto.h" #include "rbtree.h" #include "fec.h" #define SD_SHEEP_PROTO_VER 0x09 #define SD_DEFAULT_COPIES 3 /* * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and * (SD_EC_MAX_STRIP - 1) for parity strips */ #define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1) /* * The max number of nodes the sheep daemon can support is constrained by * the number of nodes in the struct cluster_info, but the actual max * number is determined by the cluster driver, because we have to pass * sys->cinfo around the cluster to handle membership management. * * Currently, only the zookeeper driver supports SD_MAX_NODES nodes, because * its message buffer size is large enough to hold nodes[SD_MAX_NODES]. */ #define SD_MAX_NODES 6144 #define SD_DEFAULT_VNODES 128 /* * Operations with opcodes above 0x80 are considered part of the inter-sheep * (and internal dog-to-sheep) protocol and are versioned using * SD_SHEEP_PROTO_VER instead of SD_PROTO_VER. * * The same applies to the above-0x80 flags and error values below. */ #define SD_OP_GET_NODE_LIST 0x82 #define SD_OP_MAKE_FS 0x84 #define SD_OP_SHUTDOWN 0x85 #define SD_OP_STAT_SHEEP 0x86 #define SD_OP_STAT_CLUSTER 0x87 #define SD_OP_GET_VDI_ATTR 0x89 #define SD_OP_FORCE_RECOVER 0x8a #define SD_OP_GET_STORE_LIST 0x90 #define SD_OP_SNAPSHOT 0x91 #define SD_OP_RESTORE 0x92 #define SD_OP_GET_SNAP_FILE 0x93 #define SD_OP_CLEANUP 0x94 #define SD_OP_TRACE_STATUS 0x95 #define SD_OP_TRACE_READ_BUF 0x96 #define SD_OP_STAT_RECOVERY 0x97 #define SD_OP_FLUSH_DEL_CACHE 0x98 #define SD_OP_NOTIFY_VDI_DEL 0x99 #define SD_OP_KILL_NODE 0x9A #define SD_OP_TRACE_ENABLE 0x9B #define SD_OP_TRACE_DISABLE 0x9C #define SD_OP_GET_OBJ_LIST 0xA1 #define SD_OP_GET_EPOCH 0xA2 #define SD_OP_CREATE_AND_WRITE_PEER 0xA3 #define SD_OP_READ_PEER 0xA4 #define SD_OP_WRITE_PEER 0xA5 #define SD_OP_REMOVE_PEER 0xA6 /* #define SD_OP_SET_CACHE_SIZE 0xA7 deleted */ #define SD_OP_ENABLE_RECOVER 0xA8 #define SD_OP_DISABLE_RECOVER 0xA9 #define SD_OP_GET_VDI_COPIES 0xAB #define SD_OP_COMPLETE_RECOVERY 0xAC #define SD_OP_FLUSH_NODES 0xAD #define SD_OP_FLUSH_PEER 0xAE #define SD_OP_NOTIFY_VDI_ADD 0xAF #define SD_OP_DELETE_CACHE 0xB0 #define SD_OP_MD_INFO 0xB1 #define SD_OP_MD_PLUG 0xB2 #define SD_OP_MD_UNPLUG 0xB3 #define SD_OP_GET_HASH 0xB4 #define SD_OP_REWEIGHT 0xB5 #define SD_OP_GET_CACHE_INFO 0xB6 #define SD_OP_CACHE_PURGE 0xB7 #define SD_OP_STAT 0xB8 #define SD_OP_GET_LOGLEVEL 0xB9 #define SD_OP_SET_LOGLEVEL 0xBA /* internal flags for hdr.flags, must be above 0x80 */ #define SD_FLAG_CMD_RECOVERY 0x0080 /* flags for VDI attribute operations */ #define SD_FLAG_CMD_CREAT 0x0100 #define SD_FLAG_CMD_EXCL 0x0200 #define SD_FLAG_CMD_DEL 0x0400 /* internal error return values, must be above 0x80 */ #define SD_RES_OLD_NODE_VER 0x81 /* Request has an old epoch */ #define SD_RES_NEW_NODE_VER 0x82 /* Request has a new epoch */ #define SD_RES_NOT_FORMATTED 0x83 /* Sheepdog is not formatted yet */ #define SD_RES_INVALID_CTIME 0x84 /* Creation time of sheepdog is different */ #define SD_RES_INVALID_EPOCH 0x85 /* Invalid epoch */ #define SD_RES_NETWORK_ERROR 0x86 /* Network error between sheep */ #define SD_RES_NO_CACHE 0x87 /* No cache object found */ #define SD_RES_BUFFER_SMALL 0x88 /* The buffer is too small */ #define SD_RES_FORCE_RECOVER 0x89 /* Users should
not force recover this cluster */ #define SD_RES_NO_STORE 0x8A /* No targeted backend store */ #define SD_RES_NO_SUPPORT 0x8B /* Operation is not supported by backend store */ #define SD_RES_NODE_IN_RECOVERY 0x8C /* Targeted node is in recovery */ #define SD_RES_KILLED 0x8D /* Node is killed */ #define SD_RES_OID_EXIST 0x8E /* Object ID exists already */ #define SD_RES_AGAIN 0x8F /* Ask to try again */ #define SD_RES_STALE_OBJ 0x90 /* Object may be stale */ #define SD_RES_CLUSTER_ERROR 0x91 /* Cluster driver error */ #define SD_RES_VDI_NOT_EMPTY 0x92 /* VDI is not empty */ #define SD_CLUSTER_FLAG_STRICT 0x0001 /* Strict mode for write */ enum sd_status { SD_STATUS_OK = 1, SD_STATUS_WAIT, SD_STATUS_SHUTDOWN, SD_STATUS_KILLED, }; struct node_id { uint8_t addr[16]; uint16_t port; uint8_t io_addr[16]; uint16_t io_port; uint8_t pad[4]; }; #define SD_NODE_SIZE 80 struct sd_node { struct rb_node rb; struct node_id nid; uint16_t nr_vnodes; uint32_t zone; uint64_t space; }; /* * A joining sheep multicasts the local cluster info. Then, the existing nodes * reply with the latest cluster info, which is unique among all of the nodes. */ struct cluster_info { uint8_t proto_ver; /* the version number of the internal protocol */ uint8_t disable_recovery; int16_t nr_nodes; uint32_t epoch; uint64_t ctime; uint16_t flags; uint8_t nr_copies; uint8_t copy_policy; enum sd_status status : 8; uint32_t __pad; uint8_t store[STORE_LEN]; /* Node list at cluster_info->epoch */ struct sd_node nodes[SD_MAX_NODES]; }; struct epoch_log { uint64_t ctime; uint64_t time; /* treated as time_t */ uint32_t epoch; uint32_t nr_nodes; uint8_t disable_recovery; uint8_t nr_copies; uint8_t copy_policy; uint8_t __pad[1]; char drv_name[STORE_LEN]; struct sd_node nodes[SD_MAX_NODES]; }; struct vdi_op_message { struct sd_req req; struct sd_rsp rsp; uint8_t data[0]; }; struct md_info { int idx; uint64_t free; uint64_t used; char path[PATH_MAX]; }; #define MD_MAX_DISK 64 /* FIXME remove roof and make it dynamic */ struct sd_md_info { struct md_info disk[MD_MAX_DISK]; int nr; }; static inline __attribute__((used)) void __sd_epoch_format_build_bug_ons(void) { /* never called, only for checking BUILD_BUG_ON()s */ BUILD_BUG_ON(sizeof(struct sd_node) != SD_NODE_SIZE); } enum rw_state { RW_PREPARE_LIST, /* the recovery thread is preparing the object list */ RW_RECOVER_OBJ, /* the thread is recovering objects */ RW_NOTIFY_COMPLETION, /* the thread is notifying recovery completion */ }; struct recovery_state { uint8_t in_recovery; enum rw_state state; uint64_t nr_finished; uint64_t nr_total; }; #define CACHE_MAX 1024 struct cache_info { uint32_t vid; uint32_t dirty; uint32_t total; }; struct object_cache_info { uint64_t size; uint64_t used; struct cache_info caches[CACHE_MAX]; int count; uint8_t directio; }; struct sd_stat { struct s_request { uint64_t gway_active_nr; /* nr of running requests */ uint64_t peer_active_nr; uint64_t gway_total_nr; /* Total nr of requests received */ uint64_t peer_total_nr; uint64_t gway_total_rx; /* Data in */ uint64_t gway_total_tx; /* Data out */ uint64_t peer_total_rx; uint64_t peer_total_tx; uint64_t gway_total_remove_nr; uint64_t gway_total_read_nr; uint64_t gway_total_write_nr; uint64_t gway_total_flush_nr; uint64_t peer_total_remove_nr; uint64_t peer_total_read_nr; uint64_t peer_total_write_nr; } r; }; #ifdef HAVE_TRACE #define TRACE_GRAPH_ENTRY 0x01 #define TRACE_GRAPH_RETURN 0x02 #define TRACE_FNAME_LEN 36 #define TRACE_THREAD_LEN MAX_THREAD_NAME_LEN struct trace_graph_item { char tname[TRACE_THREAD_LEN]; int type; char
fname[TRACE_FNAME_LEN]; int depth; uint64_t entry_time; uint64_t return_time; }; #else /* * Some functions e.g. trace_buffer_push() can declare a pointer of struct * trace_graph_item in its parameters, so we need the below empty * declaration. */ struct trace_graph_item; #endif /* HAVE_TRACE */ #endif /* __INTERNAL_PROTO_H__ */ sheepdog-0.8.3/include/list.h000066400000000000000000000134661237656255000161140ustar00rootroot00000000000000#ifndef __LIST_H__ #define __LIST_H__ /* taken from linux kernel */ #include #include "compiler.h" struct list_node { struct list_node *next; struct list_node *prev; }; struct list_head { struct list_node n; }; #define LIST_HEAD_INIT(name) { { &(name.n), &(name.n) } } #define LIST_NODE_INIT { NULL, NULL } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) #define LIST_NODE(name) \ struct list_node name = LIST_NODE_INIT static inline void INIT_LIST_HEAD(struct list_head *list) { list->n.next = &list->n; list->n.prev = &list->n; } static inline void INIT_LIST_NODE(struct list_node *list) { list->next = NULL; list->prev = NULL; } #define list_first_entry(head, type, member) \ list_entry((head)->n.next, type, member) static inline bool list_empty(const struct list_head *head) { return head->n.next == &head->n; } static inline bool list_linked(const struct list_node *node) { return node->next != NULL; } #define list_entry(ptr, type, member) \ container_of(ptr, type, member) #define list_for_each(pos, head) \ for (typeof(pos) LOCAL(n) = (pos = (head)->n.next, pos->next); \ pos != &(head)->n; \ pos = LOCAL(n), LOCAL(n) = pos->next) #define list_for_each_entry(pos, head, member) \ for (typeof(pos) LOCAL(n) = (pos = list_entry((head)->n.next, \ typeof(*pos), \ member), \ list_entry(pos->member.next, \ typeof(*pos), \ member)); \ &pos->member != &(head)->n; \ pos = LOCAL(n), LOCAL(n) = list_entry(LOCAL(n)->member.next, \ typeof(*LOCAL(n)), \ member)) static inline void __list_add(struct list_node *new, struct list_node *prev, struct list_node *next) { next->prev = new; new->next = next; new->prev = prev; prev->next = new; } static inline void list_add(struct list_node *new, struct list_head *head) { __list_add(new, &head->n, head->n.next); } static inline void list_add_tail(struct list_node *new, struct list_head *head) { __list_add(new, head->n.prev, &head->n); } static inline void __list_del(struct list_node *prev, struct list_node *next) { next->prev = prev; prev->next = next; } static inline void __list_del_entry(struct list_node *entry) { __list_del(entry->prev, entry->next); } static inline void list_del(struct list_node *entry) { __list_del(entry->prev, entry->next); entry->next = entry->prev = NULL; } static inline void list_move(struct list_node *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } static inline void list_move_tail(struct list_node *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } static inline void __list_splice(const struct list_head *list, struct list_node *prev, struct list_node *next) { struct list_node *first = list->n.next; struct list_node *last = list->n.prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, &head->n, head->n.next); INIT_LIST_HEAD(list); } } static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, 
head->n.prev, &head->n); INIT_LIST_HEAD(list); } } /* hlist, mostly useful for hash tables */ #define LIST_POISON1 ((void *) 0x00100100) #define LIST_POISON2 ((void *) 0x00200200) struct hlist_head { struct hlist_node *first; }; struct hlist_node { struct hlist_node *next, **pprev; }; #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } static inline bool hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } static inline bool hlist_empty(const struct hlist_head *h) { return !h->first; } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; *pprev = next; if (next) next->pprev = pprev; } static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; if (first) first->pprev = &n->next; h->first = n; n->pprev = &h->first; } /* next must be != NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { n->pprev = next->pprev; n->next = next; next->pprev = &n->next; *(n->pprev) = n; } static inline void hlist_add_after(struct hlist_node *n, struct hlist_node *next) { next->next = n->next; n->next = next; next->pprev = &n->next; if (next->next) next->next->pprev = &next->next; } #define hlist_entry(ptr, type, member) container_of(ptr, type, member) #define hlist_for_each(pos, head) \ for (typeof(pos) LOCAL(n) = (pos = (head)->first, NULL); \ pos && (LOCAL(n) = pos->next, 1); \ pos = LOCAL(n)) \ /* * hlist_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(tpos, pos, head, member) \ for (typeof(pos) LOCAL(n) = (pos = (head)->first, NULL); \ pos && (LOCAL(n) = pos->next, 1) && \ (tpos = hlist_entry(pos, typeof(*tpos), member), 1); \ pos = LOCAL(n)) void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_node *a, struct list_node *b)); #endif /* __LIST_H__ */ sheepdog-0.8.3/include/logger.h000066400000000000000000000062341237656255000164130ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * * This code is based on log.h from Linux target framework (tgt). 
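A minimal sketch of the intrusive list API above (editor's example, not part of the tree). Entries embed a struct list_node, and list_for_each_entry() caches the next element, so the current entry may be unlinked while iterating:

#include <stdio.h>
#include "list.h"

struct job {
	int id;
	struct list_node list;	/* embedded link */
};

int main(void)
{
	LIST_HEAD(queue);
	struct job a = { .id = 1, .list = LIST_NODE_INIT };
	struct job b = { .id = 2, .list = LIST_NODE_INIT };
	struct job *pos;

	list_add_tail(&a.list, &queue);
	list_add_tail(&b.list, &queue);

	list_for_each_entry(pos, &queue, list) {
		printf("job %d\n", pos->id);
		list_del(&pos->list);	/* safe: the iterator cached the next node */
	}
	printf("empty: %d\n", list_empty(&queue));	/* 1 */
	return 0;
}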
* Copyright (C) 2004 Dmitry Yusupov, Alex Aizman */ #ifndef LOGGER_H #define LOGGER_H #include #include #include "compiler.h" #define LOG_SPACE_SIZE (1 * 1024 * 1024) #define LOG_SPACE_DEBUG_SIZE (32 * 1024 * 1024) #define MAX_MSG_SIZE 1024 #define MAX_THREAD_NAME_LEN 20 struct logger_user_info { int port; }; extern int sd_log_level; enum log_dst_type { LOG_DST_DEFAULT, LOG_DST_STDOUT, LOG_DST_SYSLOG, }; void early_log_init(const char *format_name, struct logger_user_info *user_info); int log_init(const char *progname, enum log_dst_type type, int level, char *outfile); void log_close(void); void dump_logmsg(void *); void log_write(int prio, const char *func, int line, const char *fmt, ...) __printf(4, 5); void set_thread_name(const char *name, bool show_idx); void get_thread_name(char *name); #define sd_dump_variable(var) ({ \ __sd_dump_variable(#var); \ }) int __sd_dump_variable(const char *var); void sd_backtrace(void); /* sheep log priorities, compliant with the syslog spec */ #define SDOG_EMERG LOG_EMERG #define SDOG_ALERT LOG_ALERT #define SDOG_CRIT LOG_CRIT #define SDOG_ERR LOG_ERR #define SDOG_WARNING LOG_WARNING #define SDOG_NOTICE LOG_NOTICE #define SDOG_INFO LOG_INFO #define SDOG_DEBUG LOG_DEBUG #define sd_emerg(fmt, args...) \ log_write(SDOG_EMERG, __func__, __LINE__, fmt, ##args) #define sd_alert(fmt, args...) \ log_write(SDOG_ALERT, __func__, __LINE__, fmt, ##args) #define sd_crit(fmt, args...) \ log_write(SDOG_CRIT, __func__, __LINE__, fmt, ##args) #define sd_err(fmt, args...) \ log_write(SDOG_ERR, __func__, __LINE__, fmt, ##args) #define sd_warn(fmt, args...) \ log_write(SDOG_WARNING, __func__, __LINE__, fmt, ##args) #define sd_notice(fmt, args...) \ log_write(SDOG_NOTICE, __func__, __LINE__, fmt, ##args) #define sd_info(fmt, args...) \ log_write(SDOG_INFO, __func__, __LINE__, fmt, ##args) /* * 'args' must not contain an operation/function with a side-effect. It won't * be evaluated when the log level is not SDOG_DEBUG. */ #define sd_debug(fmt, args...) \ ({ \ if (unlikely(sd_log_level == SDOG_DEBUG)) \ log_write(SDOG_DEBUG, __func__, __LINE__, fmt, ##args); \ }) #define panic(fmt, args...) \ ({ \ sd_emerg("PANIC: " fmt, ##args); \ abort(); \ }) static inline int loglevel_str2num(const char *str) { static const char * const loglevel_table[] = { "emerg", "alert", "crit", "err", "warning", "notice", "info", "debug", }; int i, max = ARRAY_SIZE(loglevel_table); for (i = 0; i < max; i++) { if (!strcmp(loglevel_table[i], str)) break; } return i == max ? -1 : i; } void set_loglevel(int new_loglevel); int get_loglevel(void); extern pid_t logger_pid; #endif /* LOGGER_H */ sheepdog-0.8.3/include/net.h000066400000000000000000000044671237656255000157260ustar00rootroot00000000000000#ifndef __NET_H__ #define __NET_H__ #include #include #include "sheepdog_proto.h" /* * We can't always retry, because if only the IO NIC is down, we would retry * forever. * * We observed that for a busy node the response could take as long as 15s, so * waiting 30s should be a safe value. Even if we time out falsely, the gateway * will retry the request and the sockfd cache module will repair the false * closes.
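The SDOG_* levels above map one-to-one onto the syslog priorities, which is what lets the runtime "node log level" dog subcommands pass level names over the wire as small integers. A standalone sketch (editor's example; loglevel_str2num() is restated so the snippet compiles alone):

#include <stdio.h>
#include <string.h>
#include <syslog.h>

/* restated from logger.h above */
static int loglevel_str2num(const char *str)
{
	static const char * const tbl[] = {
		"emerg", "alert", "crit", "err",
		"warning", "notice", "info", "debug",
	};
	for (int i = 0; i < 8; i++)
		if (!strcmp(tbl[i], str))
			return i;
	return -1;
}

int main(void)
{
	printf("debug=%d err=%d bogus=%d\n",
	       loglevel_str2num("debug"),	/* 7 == LOG_DEBUG */
	       loglevel_str2num("err"),		/* 3 == LOG_ERR */
	       loglevel_str2num("bogus"));	/* -1: unknown name */
	return 0;
}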
*/ #define MAX_POLLTIME 30 /* seconds */ #define POLL_TIMEOUT 5 /* seconds */ #define MAX_RETRY_COUNT (MAX_POLLTIME / POLL_TIMEOUT) enum conn_state { C_IO_HEADER = 0, C_IO_DATA_INIT, C_IO_DATA, C_IO_END, C_IO_CLOSED, }; struct connection { int fd; unsigned int events; uint16_t port; char ipstr[INET6_ADDRSTRLEN]; bool dead; }; int conn_tx_off(struct connection *conn); int conn_tx_on(struct connection *conn); int conn_rx_off(struct connection *conn); int conn_rx_on(struct connection *conn); int do_read(int sockfd, void *buf, int len, bool (*need_retry)(uint32_t), uint32_t, uint32_t); int rx(struct connection *conn, enum conn_state next_state); int tx(struct connection *conn, enum conn_state next_state); int connect_to(const char *name, int port); int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen, bool (*need_retry)(uint32_t), uint32_t, uint32_t); int exec_req(int sockfd, struct sd_req *hdr, void *, bool (*need_retry)(uint32_t), uint32_t, uint32_t); int create_listen_ports(const char *bindaddr, int port, int (*callback)(int fd, void *), void *data); int create_unix_domain_socket(const char *unix_path, int (*callback)(int, void *), void *data); const char *addr_to_str(const uint8_t *addr, uint16_t port); uint8_t *str_to_addr(const char *ipstr, uint8_t *addr); char *sockaddr_in_to_str(struct sockaddr_in *sockaddr); int set_nodelay(int fd); int set_keepalive(int fd); int set_snd_timeout(int fd); int set_rcv_timeout(int fd); int get_local_addr(uint8_t *bytes); bool inetaddr_is_valid(char *addr); int do_writev2(int fd, void *hdr, size_t hdr_len, void *body, size_t body_len); /* for typical usage of do_writev2() */ #define writev2(fd, hdr, body, body_len) \ do_writev2(fd, hdr, sizeof(*hdr), body, body_len) static inline int connect_to_addr(const uint8_t *addr, int port) { return connect_to(addr_to_str(addr, 0), port); } #endif sheepdog-0.8.3/include/option.h000066400000000000000000000021241237656255000164360ustar00rootroot00000000000000/* * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
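Putting the declarations above together, a client round trip looks roughly like the sketch below. This is an editor's sketch, not tree code: the meaning of exec_req()'s two unnamed uint32_t parameters (taken here as the epoch and the retry budget) and the reuse of the 48-byte request header as the response buffer are assumptions inferred from this header, not documented guarantees:

#include <stdio.h>
#include "sheep.h"	/* sd_init_req(), sd_strerror(); pulls in net.h */

static int read_vdis_sketch(void)
{
	struct sd_req hdr;
	static char buf[512];
	int fd = connect_to("localhost", SD_LISTEN_PORT);

	if (fd < 0)
		return -1;

	sd_init_req(&hdr, SD_OP_READ_VDIS);
	hdr.data_length = sizeof(buf);	/* read: buf receives the payload */

	/* no retry callback, epoch 0, poll at most MAX_RETRY_COUNT times */
	if (exec_req(fd, &hdr, buf, NULL, 0, MAX_RETRY_COUNT) != 0)
		return -1;

	/* assumption: the response overwrites hdr (sd_req/sd_rsp are both 48 bytes) */
	printf("result: %s\n", sd_strerror(((struct sd_rsp *)&hdr)->result));
	return 0;
}

int main(void)
{
	return read_vdis_sketch();
}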
*/ #ifndef __SD_OPTION_H__ #define __SD_OPTION_H__ #include #include struct sd_option { int ch; const char *name; bool has_arg; const char *desc; const char *help; }; struct option_parser { const char *option; int (*parser)(const char *); }; char *build_short_options(const struct sd_option *opts); struct option *build_long_options(const struct sd_option *opts); const char *option_get_help(const struct sd_option *, int); int option_parse(char *arg, const char *delim, struct option_parser *parsers); int option_parse_size(const char *value, uint64_t *ret); #define sd_for_each_option(opt, opts) \ for (opt = (opts); opt->name; opt++) #endif /* __SD_OPTION_H__ */ sheepdog-0.8.3/include/rbtree.h000066400000000000000000000151651237656255000164220ustar00rootroot00000000000000#ifndef __RBTREE_H_ #define __RBTREE_H_ #include "compiler.h" /* We have to be 64-bytes aligned to get 32/64 bits compatibility */ struct rb_node { unsigned long rb_parent_color __attribute__ ((aligned (8))); #define RB_RED 0 #define RB_BLACK 1 struct rb_node *rb_right __attribute__ ((aligned (8))); struct rb_node *rb_left __attribute__ ((aligned (8))); }; struct rb_root { struct rb_node *rb_node; }; #define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) #define rb_color(r) ((r)->rb_parent_color & 1) #define rb_is_red(r) (!rb_color(r)) #define rb_is_black(r) rb_color(r) #define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) #define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; } static inline void rb_set_color(struct rb_node *rb, int color) { rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; } #define RB_ROOT { .rb_node = NULL } static inline void INIT_RB_ROOT(struct rb_root *root) { root->rb_node = NULL; } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) #define RB_EMPTY_NODE(node) (rb_parent(node) == node) #define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) static inline void rb_init_node(struct rb_node *rb) { rb->rb_parent_color = 0; rb->rb_right = NULL; rb->rb_left = NULL; RB_CLEAR_NODE(rb); } void rb_insert_color(struct rb_node *, struct rb_root *); void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ struct rb_node *rb_next(const struct rb_node *); struct rb_node *rb_prev(const struct rb_node *); struct rb_node *rb_first(const struct rb_root *); struct rb_node *rb_last(const struct rb_root *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } /* * Search for a value in the rbtree. This returns NULL when the key is not * found in the rbtree. */ #define rb_search(root, key, member, compar) \ ({ \ struct rb_node *__n = (root)->rb_node; \ typeof(key) __ret = NULL, __data; \ \ while (__n) { \ __data = rb_entry(__n, typeof(*key), member); \ int __cmp = compar(key, __data); \ \ if (__cmp < 0) \ __n = __n->rb_left; \ else if (__cmp > 0) \ __n = __n->rb_right; \ else { \ __ret = __data; \ break; \ } \ } \ __ret; \ }) /* * Insert a new node into the rbtree. 
This returns NULL on success, or the * existing node on error. */ #define rb_insert(root, new, member, compar) \ ({ \ struct rb_node **__n = &(root)->rb_node, *__parent = NULL; \ typeof(new) __old = NULL, __data; \ \ while (*__n) { \ __data = rb_entry(*__n, typeof(*new), member); \ int __cmp = compar(new, __data); \ \ __parent = *__n; \ if (__cmp < 0) \ __n = &((*__n)->rb_left); \ else if (__cmp > 0) \ __n = &((*__n)->rb_right); \ else { \ __old = __data; \ break; \ } \ } \ \ if (__old == NULL) { \ /* Add new node and rebalance tree. */ \ rb_link_node(&((new)->member), __parent, __n); \ rb_insert_color(&((new)->member), root); \ } \ \ __old; \ }) /* * Search for a value in the rbtree. When the key is not found in the rbtree, * this returns the next greater node. Note, if key > greatest node, we'll * return first node. * * For an empty tree, we return NULL. */ #define rb_nsearch(root, key, member, compar) \ ({ \ struct rb_node *__n = (root)->rb_node; \ typeof(key) __ret = NULL, __data; \ \ while (__n) { \ __data = rb_entry(__n, typeof(*key), member); \ int __cmp = compar(key, __data); \ \ if (__cmp < 0) { \ __ret = __data; \ __n = __n->rb_left; \ } else if (__cmp > 0) \ __n = __n->rb_right; \ else { \ __ret = __data; \ break; \ } \ } \ if (!__ret && !RB_EMPTY_ROOT(root)) \ __ret = rb_entry(rb_first(root), typeof(*key), member); \ __ret; \ }) /* Iterate over a rbtree safe against removal of rbnode */ #define rb_for_each(pos, root) \ for (struct rb_node *LOCAL(n) = (pos = rb_first(root), NULL); \ pos && (LOCAL(n) = rb_next(pos), 1); \ pos = LOCAL(n)) /* Iterate over a rbtree of given type safe against removal of rbnode */ #define rb_for_each_entry(pos, root, member) \ for (struct rb_node *LOCAL(p) = rb_first(root), *LOCAL(n); \ LOCAL(p) && (LOCAL(n) = rb_next(LOCAL(p)), 1) && \ (pos = rb_entry(LOCAL(p), typeof(*pos), member), 1); \ LOCAL(p) = LOCAL(n)) /* Destroy the tree and free the memory */ #define rb_destroy(root, type, member) \ ({ \ type *__dummy; \ rb_for_each_entry(__dummy, root, member) { \ rb_erase(&__dummy->member, root); \ free(__dummy); \ } \ }) /* Copy the tree 'root' as 'outroot' */ #define rb_copy(root, type, member, outroot, compar) \ ({ \ type *__src, *__dst; \ rb_for_each_entry(__src, root, member) { \ __dst = xmalloc(sizeof(*__dst)); \ *__dst = *__src; \ rb_insert(outroot, __dst, member, compar); \ } \ }) #endif /* __RBTREE_H_ */ sheepdog-0.8.3/include/sha1.h000066400000000000000000000017071237656255000157700ustar00rootroot00000000000000/* * sha1.h - SHA1 Secure Hash Algorithm used for CHAP authentication. * copied from the Linux kernel's Cryptographic API and slightly adjusted to * fit IET's needs * * This file is (c) 2004 Xiranet Communications GmbH * and licensed under the GPL. */ #ifndef SHA1_H #define SHA1_H #include #include #include #define SHA1_DIGEST_SIZE 20 #define SHA1_BLOCK_SIZE 64 struct sha1_ctx { uint64_t count; uint32_t state[SHA1_DIGEST_SIZE / 4]; uint8_t buffer[SHA1_BLOCK_SIZE]; }; typedef void (*sha1_init_func_t)(void *); typedef void (*sha1_update_func_t)(void *, const uint8_t *, unsigned int); typedef void (*sha1_final_func_t)(void *, uint8_t *); sha1_init_func_t sha1_init; sha1_update_func_t sha1_update; sha1_final_func_t sha1_final; const char *sha1_to_hex(const unsigned char *sha1); void get_buffer_sha1(unsigned char *buf, unsigned len, unsigned char *sha1); #endif sheepdog-0.8.3/include/sheep.h000066400000000000000000000170471237656255000162440ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. 
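A compact usage sketch for the rb_insert()/rb_search()/rb_nsearch() macros above (editor's example, not from the tree; intcmp() comes from util.h):

#include <stdio.h>
#include <stdlib.h>
#include "rbtree.h"
#include "util.h"	/* intcmp() */

struct item {
	int key;
	struct rb_node rb;	/* embedded node, as in struct sd_vnode */
};

static int item_cmp(const struct item *a, const struct item *b)
{
	return intcmp(a->key, b->key);
}

int main(void)
{
	struct rb_root root = RB_ROOT;

	for (int k = 0; k < 40; k += 10) {
		struct item *it = malloc(sizeof(*it));
		it->key = k;
		if (rb_insert(&root, it, rb, item_cmp))
			free(it);	/* rb_insert returns the clashing entry */
	}

	struct item needle = { .key = 20 };
	struct item *hit = rb_search(&root, &needle, rb, item_cmp);
	printf("search 20 -> %d\n", hit ? hit->key : -1);	/* 20 */

	needle.key = 15;	/* rb_nsearch rounds up to the next entry */
	hit = rb_nsearch(&root, &needle, rb, item_cmp);
	printf("nsearch 15 -> %d\n", hit ? hit->key : -1);	/* 20 */

	rb_destroy(&root, struct item, rb);
	return 0;
}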
* Copyright (C) 2012-2013 Taobao Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SHEEP_H__ #define __SHEEP_H__ #include #include "internal_proto.h" #include "util.h" #include "bitops.h" #include "list.h" #include "net.h" #include "rbtree.h" struct sd_vnode { struct rb_node rb; const struct sd_node *node; uint64_t hash; }; struct vnode_info { struct rb_root vroot; struct rb_root nroot; int nr_nodes; int nr_zones; refcnt_t refcnt; }; static inline void sd_init_req(struct sd_req *req, uint8_t opcode) { memset(req, 0, sizeof(*req)); req->opcode = opcode; req->proto_ver = opcode < 0x80 ? SD_PROTO_VER : SD_SHEEP_PROTO_VER; } static inline int same_zone(const struct sd_vnode *v1, const struct sd_vnode *v2) { return v1->node->zone == v2->node->zone; } static inline int vnode_cmp(const struct sd_vnode *node1, const struct sd_vnode *node2) { return intcmp(node1->hash, node2->hash); } /* If v1_hash < oid_hash <= v2_hash, then oid is resident on v2 */ static inline struct sd_vnode * oid_to_first_vnode(uint64_t oid, struct rb_root *root) { struct sd_vnode dummy = { .hash = sd_hash_oid(oid), }; return rb_nsearch(root, &dummy, rb, vnode_cmp); } /* Replica are placed along the ring one by one with different zones */ static inline void oid_to_vnodes(uint64_t oid, struct rb_root *root, int nr_copies, const struct sd_vnode **vnodes) { const struct sd_vnode *next = oid_to_first_vnode(oid, root); vnodes[0] = next; for (int i = 1; i < nr_copies; i++) { next: next = rb_entry(rb_next(&next->rb), struct sd_vnode, rb); if (!next) /* Wrap around */ next = rb_entry(rb_first(root), struct sd_vnode, rb); if (unlikely(next == vnodes[0])) panic("can't find a valid vnode"); for (int j = 0; j < i; j++) if (same_zone(vnodes[j], next)) goto next; vnodes[i] = next; } } static inline const struct sd_vnode * oid_to_vnode(uint64_t oid, struct rb_root *root, int copy_idx) { const struct sd_vnode *vnodes[SD_MAX_COPIES]; oid_to_vnodes(oid, root, copy_idx + 1, vnodes); return vnodes[copy_idx]; } static inline const struct sd_node * oid_to_node(uint64_t oid, struct rb_root *root, int copy_idx) { const struct sd_vnode *vnode; vnode = oid_to_vnode(oid, root, copy_idx); return vnode->node; } static inline void oid_to_nodes(uint64_t oid, struct rb_root *root, int nr_copies, const struct sd_node **nodes) { const struct sd_vnode *vnodes[SD_MAX_COPIES]; oid_to_vnodes(oid, root, nr_copies, vnodes); for (int i = 0; i < nr_copies; i++) nodes[i] = vnodes[i]->node; } static inline const char *sd_strerror(int err) { static const char *descs[256] = { /* from sheepdog_proto.h */ [SD_RES_SUCCESS] = "Success", [SD_RES_UNKNOWN] = "Unknown error", [SD_RES_NO_OBJ] = "No object found", [SD_RES_EIO] = "I/O error", [SD_RES_VDI_EXIST] = "VDI exists already", [SD_RES_INVALID_PARMS] = "Invalid parameters", [SD_RES_SYSTEM_ERROR] = "System error", [SD_RES_VDI_LOCKED] = "VDI is already locked", [SD_RES_NO_VDI] = "No VDI found", [SD_RES_NO_BASE_VDI] = "No base VDI found", [SD_RES_VDI_READ] = "Failed to read from requested VDI", [SD_RES_VDI_WRITE] = "Failed to write to requested VDI", [SD_RES_BASE_VDI_READ] = "Failed to read from base VDI", [SD_RES_BASE_VDI_WRITE] = "Failed to write to base VDI", [SD_RES_NO_TAG] = "Failed to find requested tag", [SD_RES_STARTUP] = "System is still booting", 
[SD_RES_VDI_NOT_LOCKED] = "VDI is not locked", [SD_RES_SHUTDOWN] = "System is shutting down", [SD_RES_NO_MEM] = "Out of memory on server", [SD_RES_FULL_VDI] = "Maximum number of VDIs reached", [SD_RES_VER_MISMATCH] = "Protocol version mismatch", [SD_RES_NO_SPACE] = "Server has no space for new objects", [SD_RES_WAIT_FOR_FORMAT] = "Waiting for cluster to be formatted", [SD_RES_WAIT_FOR_JOIN] = "Waiting for other nodes to join cluster", [SD_RES_JOIN_FAILED] = "Node has failed to join cluster", [SD_RES_HALT] = "IO has halted as there are not enough living nodes", [SD_RES_READONLY] = "Object is read-only", /* from internal_proto.h */ [SD_RES_OLD_NODE_VER] = "Request has an old epoch", [SD_RES_NEW_NODE_VER] = "Request has a new epoch", [SD_RES_NOT_FORMATTED] = "Cluster has not been formatted", [SD_RES_INVALID_CTIME] = "Creation times differ", [SD_RES_INVALID_EPOCH] = "Invalid epoch", [SD_RES_NETWORK_ERROR] = "Network error between sheep", [SD_RES_NO_CACHE] = "No cache object found", [SD_RES_BUFFER_SMALL] = "The buffer is too small", [SD_RES_FORCE_RECOVER] = "Cluster is running/halted and cannot be force recovered", [SD_RES_NO_STORE] = "Targeted backend store is not found", [SD_RES_NO_SUPPORT] = "Operation is not supported", [SD_RES_NODE_IN_RECOVERY] = "Targeted node is in recovery", [SD_RES_KILLED] = "Node is killed", [SD_RES_OID_EXIST] = "Object ID exists already", [SD_RES_AGAIN] = "Ask to try again", [SD_RES_STALE_OBJ] = "Object may be stale", [SD_RES_CLUSTER_ERROR] = "Cluster driver error", }; if (!(0 <= err && err < ARRAY_SIZE(descs)) || descs[err] == NULL) { static __thread char msg[32]; snprintf(msg, sizeof(msg), "Invalid error code %x", err); return msg; } return descs[err]; } static inline int oid_cmp(const uint64_t *oid1, const uint64_t *oid2) { return intcmp(*oid1, *oid2); } static inline int node_id_cmp(const struct node_id *node1, const struct node_id *node2) { int cmp = memcmp(node1->addr, node2->addr, sizeof(node1->addr)); if (cmp != 0) return cmp; return intcmp(node1->port, node2->port); } static inline int node_cmp(const struct sd_node *node1, const struct sd_node *node2) { return node_id_cmp(&node1->nid, &node2->nid); } static inline bool node_eq(const struct sd_node *a, const struct sd_node *b) { return node_cmp(a, b) == 0; } static inline void node_to_vnodes(const struct sd_node *n, struct rb_root *vroot) { uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid), io_addr)); for (int i = 0; i < n->nr_vnodes; i++) { struct sd_vnode *v = xmalloc(sizeof(*v)); hval = sd_hash_next(hval); v->hash = hval; v->node = n; if (unlikely(rb_insert(vroot, v, rb, vnode_cmp))) panic("vdisk hash collision"); } } static inline void nodes_to_vnodes(struct rb_root *nroot, struct rb_root *vroot) { struct sd_node *n; rb_for_each_entry(n, nroot, rb) node_to_vnodes(n, vroot); } static inline void nodes_to_buffer(struct rb_root *nroot, void *buffer) { struct sd_node *n, *buf = buffer; rb_for_each_entry(n, nroot, rb) { memcpy(buf++, n, sizeof(*n)); } } #define MAX_NODE_STR_LEN 256 static inline const char *node_to_str(const struct sd_node *id) { static __thread char str[MAX_NODE_STR_LEN]; int af = AF_INET6; const uint8_t *addr = id->nid.addr; /* Find address family type */ if (addr[12]) { int oct_no = 0; while (!addr[oct_no] && oct_no++ < 12) ; if (oct_no == 12) af = AF_INET; } snprintf(str, sizeof(str), "%s ip:%s port:%d", (af == AF_INET) ?
"IPv4" : "IPv6", addr_to_str(id->nid.addr, 0), id->nid.port); return str; } static inline struct sd_node *str_to_node(const char *str, struct sd_node *id) { int port; char v[8], ip[MAX_NODE_STR_LEN]; sscanf(str, "%s ip:%s port:%d", v, ip, &port); id->nid.port = port; if (!str_to_addr(ip, id->nid.addr)) return NULL; return id; } #endif sheepdog-0.8.3/include/sheepdog_proto.h000066400000000000000000000310241237656255000201500ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SHEEPDOG_PROTO_H__ #define __SHEEPDOG_PROTO_H__ #include #include #include #include #include #include "compiler.h" #include "bitops.h" #define SD_PROTO_VER 0x02 /* This or later version supports trimming zero sectors from read response */ #define SD_PROTO_VER_TRIM_ZERO_SECTORS 0x02 #define SD_LISTEN_PORT 7000 #define SD_OP_CREATE_AND_WRITE_OBJ 0x01 #define SD_OP_READ_OBJ 0x02 #define SD_OP_WRITE_OBJ 0x03 #define SD_OP_REMOVE_OBJ 0x04 #define SD_OP_DISCARD_OBJ 0x05 #define SD_OP_NEW_VDI 0x11 #define SD_OP_LOCK_VDI 0x12 #define SD_OP_RELEASE_VDI 0x13 #define SD_OP_GET_VDI_INFO 0x14 #define SD_OP_READ_VDIS 0x15 #define SD_OP_FLUSH_VDI 0x16 #define SD_OP_DEL_VDI 0x17 #define SD_FLAG_CMD_WRITE 0x01 #define SD_FLAG_CMD_COW 0x02 #define SD_FLAG_CMD_CACHE 0x04 #define SD_FLAG_CMD_DIRECT 0x08 /* don't use object cache */ /* flags above 0x80 are sheepdog-internal */ #define SD_RES_SUCCESS 0x00 /* Success */ #define SD_RES_UNKNOWN 0x01 /* Unknown error */ #define SD_RES_NO_OBJ 0x02 /* No object found */ #define SD_RES_EIO 0x03 /* I/O error */ #define SD_RES_VDI_EXIST 0x04 /* VDI exists already */ #define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */ #define SD_RES_SYSTEM_ERROR 0x06 /* System error */ #define SD_RES_VDI_LOCKED 0x07 /* VDI is locked */ #define SD_RES_NO_VDI 0x08 /* No VDI found */ #define SD_RES_NO_BASE_VDI 0x09 /* No base VDI found */ #define SD_RES_VDI_READ 0x0A /* Cannot read requested VDI */ #define SD_RES_VDI_WRITE 0x0B /* Cannot write requested VDI */ #define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base VDI */ #define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base VDI */ #define SD_RES_NO_TAG 0x0E /* Requested tag is not found */ #define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */ #define SD_RES_VDI_NOT_LOCKED 0x10 /* VDI is not locked */ #define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */ #define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */ #define SD_RES_FULL_VDI 0x13 /* we already have the maximum VDIs */ #define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */ #define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */ #define SD_RES_WAIT_FOR_FORMAT 0x16 /* Sheepdog is waiting for a format operation */ #define SD_RES_WAIT_FOR_JOIN 0x17 /* Sheepdog is waiting for other nodes joining */ #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ #define SD_RES_HALT 0x19 /* Sheepdog is stopped doing IO */ #define SD_RES_READONLY 0x1A /* Object is read-only */ #define SD_RES_BTREE_NOT_FOUND 0x1B /* Cannot found node in btree */ #define SD_RES_BTREE_FOUND 0x1C /* Found node in btree */ #define SD_RES_BTREE_REPEAT 0x1D /* Should repeat op in btree */ /* errors above 0x80 are sheepdog-internal */ 
/* * Object ID rules * * 0 - 31 (32 bits): data object space * 32 - 55 (24 bits): VDI object space * 56 - 59 ( 4 bits): reserved VDI object space * 60 - 63 ( 4 bits): object type identifier space */ #define VDI_SPACE_SHIFT 32 #define SD_VDI_MASK 0x00FFFFFF00000000 #define VDI_BIT (UINT64_C(1) << 63) #define VMSTATE_BIT (UINT64_C(1) << 62) #define VDI_ATTR_BIT (UINT64_C(1) << 61) #define VDI_BTREE_BIT (UINT64_C(1) << 60) #define OLD_MAX_DATA_OBJS (1ULL << 20) #define MAX_DATA_OBJS (1ULL << 32) #define MAX_CHILDREN 1024U #define SD_MAX_VDI_LEN 256U #define SD_MAX_VDI_TAG_LEN 256U #define SD_MAX_VDI_ATTR_KEY_LEN 256U #define SD_MAX_VDI_ATTR_VALUE_LEN 65536U #define SD_MAX_SNAPSHOT_TAG_LEN 256U #define SD_NR_VDIS (1U << 24) #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) #define SD_OLD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * OLD_MAX_DATA_OBJS) #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) #define SD_INODE_SIZE (sizeof(struct sd_inode)) #define SD_INODE_INDEX_SIZE (sizeof(uint32_t) * MAX_DATA_OBJS) #define SD_INODE_DATA_INDEX (1ULL << 20) #define SD_INODE_DATA_INDEX_SIZE (sizeof(uint32_t) * SD_INODE_DATA_INDEX) #define SD_INODE_HEADER_SIZE offsetof(struct sd_inode, data_vdi_id) #define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr)) #define CURRENT_VDI_ID 0 #define STORE_LEN 16 #define SD_REQ_SIZE 48 #define SD_RSP_SIZE 48 struct sd_req { uint8_t proto_ver; uint8_t opcode; uint16_t flags; uint32_t epoch; uint32_t id; uint32_t data_length; union { struct { uint64_t oid; uint64_t cow_oid; uint8_t copies; uint8_t copy_policy; uint8_t ec_index; uint8_t reserved; uint32_t tgt_epoch; uint32_t offset; uint32_t __pad; } obj; struct { uint64_t vdi_size; uint32_t base_vdi_id; uint8_t copies; uint8_t copy_policy; uint8_t store_policy; uint8_t reserved; uint32_t snapid; } vdi; /* sheepdog-internal */ struct { uint64_t oid; uint64_t ctime; uint8_t copies; uint8_t copy_policy; uint16_t flags; uint32_t tag; } cluster; struct { uint32_t old_vid; uint32_t new_vid; uint8_t copies; uint8_t set_bitmap; /* 0 means false */ /* others mean true */ uint8_t copy_policy; } vdi_state; uint32_t __pad[8]; }; }; struct sd_rsp { uint8_t proto_ver; uint8_t opcode; uint16_t flags; uint32_t epoch; uint32_t id; uint32_t data_length; union { uint32_t result; struct { uint32_t __pad; uint8_t copies; uint8_t reserved[3]; uint64_t offset; } obj; struct { uint32_t __pad; uint32_t rsvd; uint32_t vdi_id; uint32_t attr_id; uint8_t copies; uint8_t reserved[3]; } vdi; /* sheepdog-internal */ struct { uint32_t __pad; uint32_t nr_nodes; uint32_t __reserved[2]; uint64_t store_size; uint64_t store_free; } node; struct { uint32_t __pad1; uint32_t __pad2; uint8_t digest[20]; } hash; uint32_t __pad[8]; }; }; struct sd_inode { char name[SD_MAX_VDI_LEN]; char tag[SD_MAX_VDI_TAG_LEN]; uint64_t create_time; uint64_t snap_ctime; uint64_t vm_clock_nsec; uint64_t vdi_size; uint64_t vm_state_size; uint8_t copy_policy; uint8_t store_policy; uint8_t nr_copies; uint8_t block_size_shift; uint32_t snap_id; uint32_t vdi_id; uint32_t parent_vdi_id; uint32_t child_vdi_id[MAX_CHILDREN]; uint32_t data_vdi_id[SD_INODE_DATA_INDEX]; uint32_t btree_counter; }; struct sd_extent { uint32_t idx; uint32_t vdi_id; }; struct sd_extent_idx { uint32_t idx; uint64_t oid; }; #define INODE_BTREE_MAGIC 0x6274 struct sd_extent_header { uint16_t magic; uint16_t depth; /* 1 -- ext node; 2 -- idx node */ uint32_t entries; }; enum btree_node_type { BTREE_HEAD = 1, BTREE_EXT, BTREE_IDX, }; typedef int (*write_node_fn)(uint64_t id, void *mem, unsigned int len, uint64_t offset,
uint32_t flags, int copies, int copy_policy, bool create, bool direct); typedef int (*read_node_fn)(uint64_t id, void **mem, unsigned int len, uint64_t offset); struct sheepdog_vdi_attr { char name[SD_MAX_VDI_LEN]; char tag[SD_MAX_VDI_TAG_LEN]; uint64_t ctime; uint32_t snap_id; uint32_t value_len; char key[SD_MAX_VDI_ATTR_KEY_LEN]; char value[SD_MAX_VDI_ATTR_VALUE_LEN]; }; extern void sd_inode_init(void *data, int depth); extern uint32_t sd_inode_get_vid(read_node_fn reader, const struct sd_inode *inode, uint32_t idx); extern void sd_inode_set_vid(write_node_fn writer, read_node_fn reader, struct sd_inode *inode, uint32_t idx, uint32_t vdi_id); extern int sd_inode_write(write_node_fn writer, struct sd_inode *inode, int flags, bool create, bool direct); extern int sd_inode_write_vid(write_node_fn writer, struct sd_inode *inode, uint32_t idx, uint32_t vid, uint32_t value, int flags, bool create, bool direct); extern uint32_t sd_inode_get_meta_size(struct sd_inode *inode, size_t size); extern void sd_inode_copy_vdis(write_node_fn writer, read_node_fn reader, uint32_t *data_vdi_id, uint8_t store_policy, uint8_t nr_copies, uint8_t copy_policy, struct sd_inode *newi); typedef void (*btree_cb_fn)(void *data, enum btree_node_type type, void *arg); extern void traverse_btree(read_node_fn reader, const struct sd_inode *inode, btree_cb_fn fn, void *arg); /* 64 bit FNV-1a non-zero initial basis */ #define FNV1A_64_INIT ((uint64_t) 0xcbf29ce484222325ULL) #define FNV_64_PRIME ((uint64_t) 0x100000001b3ULL) /* 64 bit Fowler/Noll/Vo FNV-1a hash code */ static inline uint64_t fnv_64a_buf(const void *buf, size_t len, uint64_t hval) { const unsigned char *p = (const unsigned char *) buf; for (int i = 0; i < len; i++) { hval ^= (uint64_t) p[i]; hval *= FNV_64_PRIME; } return hval; } /* * The result is same as fnv_64a_buf(&oid, sizeof(oid), hval) but this function * is a bit faster. */ static inline uint64_t fnv_64a_64(uint64_t oid, uint64_t hval) { hval ^= oid & 0xff; hval *= FNV_64_PRIME; hval ^= oid >> 8 & 0xff; hval *= FNV_64_PRIME; hval ^= oid >> 16 & 0xff; hval *= FNV_64_PRIME; hval ^= oid >> 24 & 0xff; hval *= FNV_64_PRIME; hval ^= oid >> 32 & 0xff; hval *= FNV_64_PRIME; hval ^= oid >> 40 & 0xff; hval *= FNV_64_PRIME; hval ^= oid >> 48 & 0xff; hval *= FNV_64_PRIME; hval ^= oid >> 56 & 0xff; hval *= FNV_64_PRIME; return hval; } static inline uint64_t sd_hash(const void *buf, size_t len) { uint64_t hval = fnv_64a_buf(buf, len, FNV1A_64_INIT); return fnv_64a_64(hval, hval); } static inline uint64_t sd_hash_64(uint64_t oid) { uint64_t hval = fnv_64a_64(oid, FNV1A_64_INIT); return fnv_64a_64(hval, hval); } static inline uint64_t sd_hash_next(uint64_t hval) { return fnv_64a_64(hval, hval); } /* * Create a hash value from an object id. The result is same as sd_hash(&oid, * sizeof(oid)) but this function is a bit faster. */ static inline uint64_t sd_hash_oid(uint64_t oid) { return sd_hash_64(oid); } /* * Create a hash value from a vdi name. We cannot use sd_hash_buf for this * purpose because of backward compatibility. 
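A small check of the claim in the comments above: the unrolled fnv_64a_64() agrees with the byte-wise fnv_64a_buf() on a little-endian host, where memory order is LSB-first (editor's example):

#include <stdio.h>
#include <inttypes.h>
#include "sheepdog_proto.h"

int main(void)
{
	uint64_t oid = 0x8000000012345678ULL;

	uint64_t bytewise = fnv_64a_buf(&oid, sizeof(oid), FNV1A_64_INIT);
	uint64_t unrolled = fnv_64a_64(oid, FNV1A_64_INIT);

	/* equal on little-endian hosts, where &oid yields the LSB first */
	printf("bytewise=%016" PRIx64 " unrolled=%016" PRIx64 " equal=%d\n",
	       bytewise, unrolled, bytewise == unrolled);
	printf("sd_hash_oid(oid)=%016" PRIx64 "\n", sd_hash_oid(oid));
	return 0;
}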
*/ static inline uint32_t sd_hash_vdi(const char *name) { uint64_t hval = fnv_64a_buf(name, strlen(name), FNV1A_64_INIT); return (uint32_t)(hval & (SD_NR_VDIS - 1)); } static inline uint64_t hash_64(uint64_t val, unsigned int bits) { return sd_hash_64(val) >> (64 - bits); } static inline bool is_vdi_obj(uint64_t oid) { return !!(oid & VDI_BIT); } static inline bool is_vmstate_obj(uint64_t oid) { return !!(oid & VMSTATE_BIT); } static inline bool is_vdi_attr_obj(uint64_t oid) { return !!(oid & VDI_ATTR_BIT); } static inline bool is_vdi_btree_obj(uint64_t oid) { return !!(oid & VDI_BTREE_BIT); } static inline bool is_data_obj(uint64_t oid) { return !is_vdi_obj(oid) && !is_vmstate_obj(oid) && !is_vdi_attr_obj(oid) && !is_vdi_btree_obj(oid); } static inline size_t count_data_objs(const struct sd_inode *inode) { return DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE); } static inline size_t get_objsize(uint64_t oid) { if (is_vdi_obj(oid)) return SD_INODE_SIZE; if (is_vdi_attr_obj(oid)) return SD_ATTR_OBJ_SIZE; if (is_vdi_btree_obj(oid)) return SD_INODE_DATA_INDEX_SIZE; return SD_DATA_OBJ_SIZE; } static inline uint64_t data_oid_to_idx(uint64_t oid) { return oid & (MAX_DATA_OBJS - 1); } static inline uint64_t vid_to_vdi_oid(uint32_t vid) { return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); } static inline uint64_t vid_to_data_oid(uint32_t vid, uint64_t idx) { return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; } static inline uint32_t oid_to_vid(uint64_t oid) { return (oid & SD_VDI_MASK) >> VDI_SPACE_SHIFT; } static inline uint64_t vid_to_attr_oid(uint32_t vid, uint32_t attrid) { return ((uint64_t)vid << VDI_SPACE_SHIFT) | VDI_ATTR_BIT | attrid; } static inline uint64_t vid_to_btree_oid(uint32_t vid, uint32_t btreeid) { return ((uint64_t)vid << VDI_SPACE_SHIFT) | VDI_BTREE_BIT | btreeid; } static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx) { return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; } static inline bool vdi_is_snapshot(const struct sd_inode *inode) { return !!inode->snap_ctime; } static inline __attribute__((used)) void __sd_proto_build_bug_ons(void) { /* never called, only for checking BUILD_BUG_ON()s */ BUILD_BUG_ON(sizeof(struct sd_req) != SD_REQ_SIZE); BUILD_BUG_ON(sizeof(struct sd_rsp) != SD_RSP_SIZE); } #endif sheepdog-0.8.3/include/shepherd.h000066400000000000000000000053501237656255000167340ustar00rootroot00000000000000#ifndef SHEPHERD_H #define SHEPHERD_H enum sph_cli_msg_type { /* messages sent by a cluster driver, received by shepherd */ SPH_CLI_MSG_JOIN = 0, SPH_CLI_MSG_ACCEPT, SPH_CLI_MSG_NOTIFY, SPH_CLI_MSG_BLOCK, SPH_CLI_MSG_LEAVE, }; enum sph_srv_msg_type { /* messages sent by shepherd, received by a cluster driver */ SPH_SRV_MSG_JOIN_REPLY = 0, SPH_SRV_MSG_JOIN_RETRY, SPH_SRV_MSG_NEW_NODE, SPH_SRV_MSG_NEW_NODE_FINISH, SPH_SRV_MSG_NOTIFY_FORWARD, SPH_SRV_MSG_BLOCK_FORWARD, SPH_SRV_MSG_LEAVE_FORWARD, SPH_SRV_MSG_REMOVE, }; struct sph_msg { /* * original type of uint32_t type: * enum sph_cli_msg_type or enum sph_srv_msg_type */ uint32_t type; uint32_t body_len; }; #include "internal_proto.h" struct sph_msg_join { struct sd_node new_node; struct sd_node nodes[SD_MAX_NODES]; uint32_t nr_nodes; uint8_t opaque[0]; }; struct sph_msg_join_reply { struct sd_node nodes[SD_MAX_NODES]; uint32_t nr_nodes; uint8_t opaque[0]; }; struct sph_msg_join_node_finish { struct sd_node new_node; struct sd_node nodes[SD_MAX_NODES]; uint32_t nr_nodes; uint8_t opaque[0]; }; struct sph_msg_notify { uint8_t unblock; uint8_t notify_msg[0]; }; struct sph_msg_notify_forward 
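A worked example for the object-ID helpers above, showing how a 24-bit vid and an object index pack into the 64-bit layout described at the top of this header (editor's sketch, not tree code):

#include <stdio.h>
#include <inttypes.h>
#include "sheepdog_proto.h"

int main(void)
{
	uint32_t vid = 0x123456;		/* a 24-bit VDI id */
	uint64_t inode = vid_to_vdi_oid(vid);
	uint64_t data = vid_to_data_oid(vid, 7);

	printf("inode oid = %016" PRIx64 "\n", inode);	/* 8000123456000000 */
	printf("data oid  = %016" PRIx64 "\n", data);	/* 0000123456000007 */

	printf("vid=%06" PRIx32 " idx=%" PRIu64 "\n",
	       oid_to_vid(data), data_oid_to_idx(data));	/* 123456, 7 */
	printf("is_vdi=%d is_data=%d\n",
	       is_vdi_obj(inode), is_data_obj(data));		/* 1, 1 */
	return 0;
}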
{
	struct sd_node from_node;

	uint8_t unblock;
	uint8_t notify_msg[0];
};

#define SHEPHERD_PORT 2501

static inline const char *sph_cli_msg_to_str(enum sph_cli_msg_type msg)
/* CAUTION: non reentrant */
{
	int i;
	static char unknown[64];
	static const struct {
		enum sph_cli_msg_type msg;
		const char *desc;
	} msgs[] = {
		{ SPH_CLI_MSG_JOIN, "SPH_CLI_MSG_JOIN" },
		{ SPH_CLI_MSG_ACCEPT, "SPH_CLI_MSG_ACCEPT" },
		{ SPH_CLI_MSG_NOTIFY, "SPH_CLI_MSG_NOTIFY" },
		{ SPH_CLI_MSG_BLOCK, "SPH_CLI_MSG_BLOCK" },
		{ SPH_CLI_MSG_LEAVE, "SPH_CLI_MSG_LEAVE" },
	};

	for (i = 0; i < ARRAY_SIZE(msgs); i++) {
		if (msgs[i].msg == msg)
			return msgs[i].desc;
	}

	memset(unknown, 0, 64);
	snprintf(unknown, 64, "<unknown message: %d>", msg);
	return unknown;
}

static inline const char *sph_srv_msg_to_str(enum sph_srv_msg_type msg)
/* CAUTION: non reentrant */
{
	int i;
	static char unknown[64];
	static const struct {
		enum sph_srv_msg_type msg;
		const char *desc;
	} msgs[] = {
		{ SPH_SRV_MSG_JOIN_RETRY, "SPH_SRV_MSG_JOIN_RETRY" },
		{ SPH_SRV_MSG_NEW_NODE, "SPH_SRV_MSG_NEW_NODE" },
		{ SPH_SRV_MSG_NEW_NODE_FINISH, "SPH_SRV_MSG_NEW_NODE_FINISH" },
		{ SPH_SRV_MSG_NOTIFY_FORWARD, "SPH_SRV_MSG_NOTIFY_FORWARD" },
		{ SPH_SRV_MSG_BLOCK_FORWARD, "SPH_SRV_MSG_BLOCK_FORWARD" },
		{ SPH_SRV_MSG_REMOVE, "SPH_SRV_MSG_REMOVE" },
	};

	for (i = 0; i < ARRAY_SIZE(msgs); i++) {
		if (msgs[i].msg == msg)
			return msgs[i].desc;
	}

	memset(unknown, 0, 64);
	snprintf(unknown, 64, "<unknown message: %d>", msg);
	return unknown;
}

#endif /* SHEPHERD_H */
sheepdog-0.8.3/include/sockfd_cache.h000066400000000000000000000010701237656255000175210ustar00rootroot00000000000000#ifndef SOCKFD_CACHE_H
#define SOCKFD_CACHE_H

#include "internal_proto.h"
#include "work.h"

struct sockfd *sockfd_cache_get(const struct node_id *nid);
void sockfd_cache_put(const struct node_id *nid, struct sockfd *sfd);
void sockfd_cache_del_node(const struct node_id *nid);
void sockfd_cache_del(const struct node_id *nid, struct sockfd *sfd);
void sockfd_cache_add(const struct node_id *nid);
void sockfd_cache_add_group(const struct rb_root *nroot);
int sockfd_init(void);

/* sockfd_cache */
struct sockfd {
	int fd;
	int idx;
};

#endif /* SOCKFD_CACHE_H */
sheepdog-0.8.3/include/strbuf.h000066400000000000000000000051201237656255000164320ustar00rootroot00000000000000#ifndef STRBUF_H
#define STRBUF_H

#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include "util.h"

struct strbuf {
	size_t alloc;
	size_t len;
	int eof;
	char *buf;
};

#define alloc_nr(x) (((x)+16)*3/2)

/*
 * Realloc the buffer pointed at by variable 'x' so that it can hold
 * at least 'nr' entries; the number of entries currently allocated
 * is 'alloc', using the standard growing factor alloc_nr() macro.
 *
 * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
 */
#define ALLOC_GROW(x, nr, alloc) \
	do { \
		if ((nr) > alloc) { \
			if (alloc_nr(alloc) < (nr)) \
				alloc = (nr); \
			else \
				alloc = alloc_nr(alloc); \
			x = xrealloc((x), alloc * sizeof(*(x))); \
		} \
	} while (0)

#define STRBUF_INIT { 0, 0, 0, NULL }

/*----- strbuf life cycle -----*/
void strbuf_init(struct strbuf *, size_t);
void strbuf_release(struct strbuf *);
void strbuf_reset(struct strbuf *);
char *strbuf_detach(struct strbuf *);
void strbuf_attach(struct strbuf *, void *, size_t, size_t);

/*----- strbuf size related -----*/
static inline size_t strbuf_avail(struct strbuf *sb)
{
	return sb->alloc ?
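	       /* one byte is always reserved for the trailing '\0' */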
	       sb->alloc - sb->len - 1 : 0;
}

static inline void strbuf_setlen(struct strbuf *sb, size_t len)
{
	assert(len < sb->alloc);
	sb->len = len;
	sb->buf[len] = '\0';
}

void strbuf_grow(struct strbuf *, size_t);

/*----- content related -----*/
void strbuf_rtrim(struct strbuf *);

/*----- add data in your buffer -----*/
static inline void strbuf_addch(struct strbuf *sb, int c)
{
	strbuf_grow(sb, 1);
	sb->buf[sb->len++] = c;
	sb->buf[sb->len] = '\0';
}

/* inserts after pos, or appends if pos >= sb->len */
void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
void strbuf_remove(struct strbuf *, size_t pos, size_t len);

/* splice pos..pos+len with given data */
void strbuf_splice(struct strbuf *, size_t pos, size_t len, const void *,
		   size_t);

void strbuf_add(struct strbuf *, const void *, size_t);
static inline void strbuf_addstr(struct strbuf *sb, const char *s)
{
	strbuf_add(sb, s, strlen(s));
}
static inline void strbuf_addbuf(struct strbuf *sb, struct strbuf *sb2)
{
	strbuf_add(sb, sb2->buf, sb2->len);
}
void strbuf_addf(struct strbuf *sb, const char *fmt, ...) __printf(2, 3);

size_t strbuf_fread(struct strbuf *, size_t, FILE *);
/* XXX: if read fails, any partial read is undone */
ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);

int strbuf_getline(struct strbuf *sb, FILE *fp, int term);

int strbuf_copyout(struct strbuf *sb, void *buf, size_t len);
int strbuf_stripout(struct strbuf *sb, void *buf, size_t len);

#endif
sheepdog-0.8.3/include/util.h000066400000000000000000000266521237656255000161150ustar00rootroot00000000000000#ifndef __UTIL_H__
#define __UTIL_H__

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <limits.h>
#include <unistd.h>
#include <search.h>
#include <byteswap.h>
#include <pthread.h>
#include <sys/types.h>
#include <urcu/uatomic.h>
#include "logger.h"
#include "list.h"
#include "compiler.h"

#define SECTOR_SIZE (1U << 9)
#define BLOCK_SIZE (1U << 12)

#define round_up(x, y) roundup(x, y)
#define round_down(x, y) (((x) / (y)) * (y))

#if __BYTE_ORDER == __LITTLE_ENDIAN
#define __cpu_to_be16(x) bswap_16(x)
#define __cpu_to_be32(x) bswap_32(x)
#define __cpu_to_be64(x) bswap_64(x)
#define __be16_to_cpu(x) bswap_16(x)
#define __be32_to_cpu(x) bswap_32(x)
#define __be64_to_cpu(x) bswap_64(x)
#define __cpu_to_le32(x) (x)
#else
#define __cpu_to_be16(x) (x)
#define __cpu_to_be32(x) (x)
#define __cpu_to_be64(x) (x)
#define __be16_to_cpu(x) (x)
#define __be32_to_cpu(x) (x)
#define __be64_to_cpu(x) (x)
#define __cpu_to_le32(x) bswap_32(x)
#endif

#define uninitialized_var(x) x = x

static inline int before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

static inline int after(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq2 - seq1) < 0;
}

#define min(x, y) ({ \
	typeof(x) _x = (x);	\
	typeof(y) _y = (y);	\
	(void) (&_x == &_y);	\
	_x < _y ? _x : _y; })

#define max(x, y) ({ \
	typeof(x) _x = (x);	\
	typeof(y) _y = (y);	\
	(void) (&_x == &_y);	\
	_x > _y ? _x : _y; })

static inline void *zalloc(size_t size)
{
	return calloc(1, size);
}

/*
 * Compares two integer values
 *
 * If the first argument is larger than the second one, intcmp() returns 1. If
 * the two are equal, it returns 0. Otherwise, it returns -1.
 */
#define intcmp(x, y) \
({ \
	typeof(x) _x = (x); \
	typeof(y) _y = (y); \
	(void) (&_x == &_y); \
	_x < _y ? -1 : _x > _y ? \
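	/* unlike the common (x - y) idiom, this cannot overflow or wrap */ \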
	1 : 0; \
})

typedef void (*try_to_free_t)(size_t);
try_to_free_t set_try_to_free_routine(try_to_free_t);

void *xmalloc(size_t size);
void *xzalloc(size_t size);
void *xrealloc(void *ptr, size_t size);
void *xcalloc(size_t nmemb, size_t size);
void *xvalloc(size_t size);
ssize_t xread(int fd, void *buf, size_t len);
ssize_t xwrite(int fd, const void *buf, size_t len);
ssize_t xpread(int fd, void *buf, size_t count, off_t offset);
ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset);
int xmkdir(const char *pathname, mode_t mode);
int xfallocate(int fd, int mode, off_t offset, off_t len);
int xftruncate(int fd, off_t length);
int eventfd_xread(int efd);
void eventfd_xwrite(int efd, int value);
void pstrcpy(char *buf, int buf_size, const char *str);
char *chomp(char *str);
int rmdir_r(const char *dir_path);
int purge_directory(const char *dir_path);
bool is_numeric(const char *p);
const char *data_to_str(void *data, size_t data_length);
int install_sighandler(int signum, void (*handler)(int), bool once);
int install_crash_handler(void (*handler)(int));
void reraise_crash_signal(int signo, int status);
pid_t gettid(void);
int tkill(int tid, int sig);
bool is_xattr_enabled(const char *path);
const char *my_exe_path(void);
int split_path(const char *path, size_t nr_segs, char **segs);
void make_path(char *path, size_t size, size_t nr_segs, const char **segs);
int atomic_create_and_write(const char *path, const char *buf, size_t len,
			    bool force_create);

/* a type safe version of qsort() */
#define xqsort(base, nmemb, compar)					\
({									\
	if (nmemb > 1) {						\
		qsort(base, nmemb, sizeof(*(base)),			\
		      (comparison_fn_t)compar);				\
		assert(compar(base, base + 1) <= 0);			\
	}								\
})

/* a type safe version of bsearch() */
#define xbsearch(key, base, nmemb, compar)				\
({									\
	typeof(&(base)[0]) __ret = NULL;				\
	if (nmemb > 0) {						\
		assert(compar(key, key) == 0);				\
		assert(compar(base, base) == 0);			\
		__ret = bsearch(key, base, nmemb, sizeof(*(base)),	\
				(comparison_fn_t)compar);		\
	}								\
	__ret;								\
})

/* a type safe version of lfind() */
#define xlfind(key, base, nmemb, compar)				\
({									\
	typeof(&(base)[0]) __ret = NULL;				\
	if (nmemb > 0) {						\
		size_t __n = nmemb;					\
		assert(compar(key, key) == 0);				\
		assert(compar(base, base) == 0);			\
		__ret = lfind(key, base, &__n, sizeof(*(base)),		\
			      (comparison_fn_t)compar);			\
	}								\
	__ret;								\
})

/*
 * Search 'key' in the array 'base' linearly and remove it if found.
 *
 * If 'key' is found in 'base', this function decrements *nmemb and returns
 * true.
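 *
 * A usage sketch (with a hypothetical comparator 'intcmp_fn'):
 *
 *   size_t nr = nr_ids;
 *   if (xlremove(&key, ids, &nr, intcmp_fn))
 *       ;  // found: trailing entries were shifted down, nr was decremented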
*/ #define xlremove(key, base, nmemb, compar) \ ({ \ bool __removed = false; \ typeof(&(base)[0]) __e; \ \ __e = xlfind(key, base, *(nmemb), compar); \ if (__e != NULL) { \ (*(nmemb))--; \ memmove(__e, __e + 1, \ sizeof(*(base)) * (*(nmemb) - (__e - (base)))); \ __removed = true; \ } \ __removed; \ }) #ifdef assert #error "Don't include assert.h, use util.h for assert()" #endif #ifndef NDEBUG #define assert(expr) \ ({ \ if (!(expr)) { \ sd_emerg("Asserting `%s' failed.", #expr); \ abort(); \ } \ }) #else #define assert(expr) ((void)0) #endif /* NDEBUG */ #define SWAP(a, b) { typeof(a) tmp; tmp = a; a = b; b = tmp; } /* urcu helpers */ /* Boolean data type which can be accessed by multiple threads */ typedef struct { unsigned long val; } uatomic_bool; static inline bool uatomic_is_true(uatomic_bool *val) { return uatomic_read(&val->val) == 1; } /* success if the old value is false */ static inline bool uatomic_set_true(uatomic_bool *val) { return uatomic_cmpxchg(&val->val, 0, 1) == 0; } static inline void uatomic_set_false(uatomic_bool *val) { uatomic_set(&val->val, 0); } /* * uatomic_xchg_ptr - uatomic_xchg for pointers * * Swaps the old value stored at location p with new value given by * val. Returns old value. */ #define uatomic_xchg_ptr(p, val) \ ({ \ uintptr_t ret; \ ret = uatomic_xchg((uintptr_t *)(p), (val)); \ (typeof(*(p)))ret; \ }) /* * refcnt_t: reference counter which can be manipulated by multiple threads * safely */ typedef struct { int val; } refcnt_t; static inline void refcount_set(refcnt_t *rc, int val) { uatomic_set(&rc->val, val); } static inline int refcount_read(refcnt_t *rc) { return uatomic_read(&rc->val); } static inline int refcount_inc(refcnt_t *rc) { return uatomic_add_return(&rc->val, 1); } static inline int refcount_dec(refcnt_t *rc) { assert(1 <= uatomic_read(&rc->val)); return uatomic_sub_return(&rc->val, 1); } /* wrapper for pthread_mutex */ #define SD_MUTEX_INITIALIZER { .mutex = PTHREAD_MUTEX_INITIALIZER } struct sd_mutex { pthread_mutex_t mutex; }; static inline void sd_init_mutex(struct sd_mutex *mutex) { int ret; do { ret = pthread_mutex_init(&mutex->mutex, NULL); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to initialize a lock, %s", strerror(ret)); } static inline void sd_init_mutex_attr(struct sd_mutex *mutex, pthread_mutexattr_t *attr) { int ret; do { ret = pthread_mutex_init(&mutex->mutex, attr); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to initialize a lock with attr, %s", strerror(ret)); } static inline void sd_destroy_mutex(struct sd_mutex *mutex) { int ret; do { ret = pthread_mutex_destroy(&mutex->mutex); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to destroy a lock, %s", strerror(ret)); } static inline void sd_mutex_lock(struct sd_mutex *mutex) { int ret; do { ret = pthread_mutex_lock(&mutex->mutex); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to lock for reading, %s", strerror(ret)); } static inline int sd_mutex_trylock(struct sd_mutex *mutex) { return pthread_mutex_trylock(&mutex->mutex); } static inline void sd_mutex_unlock(struct sd_mutex *mutex) { int ret; do { ret = pthread_mutex_unlock(&mutex->mutex); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to unlock, %s", strerror(ret)); } /* wrapper for pthread_cond */ #define SD_COND_INITIALIZER { .cond = PTHREAD_COND_INITIALIZER } struct sd_cond { pthread_cond_t cond; }; static inline void sd_cond_init(struct sd_cond *cond) { int ret; do { ret = pthread_cond_init(&cond->cond, NULL); } while (ret == 
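		 /* the wrappers retry while the system is transiently out of resources */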
EAGAIN); if (unlikely(ret != 0)) panic("failed to initialize a lock, %s", strerror(ret)); } static inline void sd_destroy_cond(struct sd_cond *cond) { int ret; do { ret = pthread_cond_destroy(&cond->cond); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to destroy a lock, %s", strerror(ret)); } static inline int sd_cond_signal(struct sd_cond *cond) { return pthread_cond_signal(&cond->cond); } static inline int sd_cond_wait(struct sd_cond *cond, struct sd_mutex *mutex) { return pthread_cond_wait(&cond->cond, &mutex->mutex); } static inline int sd_cond_wait_timeout(struct sd_cond *cond, struct sd_mutex *mutex, int second) { struct timespec wait_time; wait_time.tv_sec = second; wait_time.tv_nsec = 0; return pthread_cond_timedwait(&cond->cond, &mutex->mutex, &wait_time); } static inline int sd_cond_broadcast(struct sd_cond *cond) { return pthread_cond_broadcast(&cond->cond); } /* wrapper for pthread_rwlock */ #define SD_RW_LOCK_INITIALIZER { .rwlock = PTHREAD_RWLOCK_INITIALIZER } struct sd_rw_lock { pthread_rwlock_t rwlock; }; static inline void sd_init_rw_lock(struct sd_rw_lock *lock) { int ret; do { ret = pthread_rwlock_init(&lock->rwlock, NULL); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to initialize a lock, %s", strerror(ret)); } static inline void sd_destroy_rw_lock(struct sd_rw_lock *lock) { int ret; do { ret = pthread_rwlock_destroy(&lock->rwlock); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to destroy a lock, %s", strerror(ret)); } static inline void sd_read_lock(struct sd_rw_lock *lock) { int ret; do { ret = pthread_rwlock_rdlock(&lock->rwlock); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to lock for reading, %s", strerror(ret)); } /* * Even though POSIX manual it doesn't return EAGAIN, we indeed have met the * case that it returned EAGAIN */ static inline void sd_write_lock(struct sd_rw_lock *lock) { int ret; do { ret = pthread_rwlock_wrlock(&lock->rwlock); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to lock for writing, %s", strerror(ret)); } static inline void sd_rw_unlock(struct sd_rw_lock *lock) { int ret; do { ret = pthread_rwlock_unlock(&lock->rwlock); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to unlock, %s", strerror(ret)); } /* colors */ #define TEXT_NORMAL "\033[0m" #define TEXT_BOLD "\033[1m" #define TEXT_RED "\033[0;31m" #define TEXT_BOLD_RED "\033[1;31m" #define TEXT_GREEN "\033[0;32m" #define TEXT_BOLD_GREEN "\033[1;32m" #define TEXT_YELLOW "\033[0;33m" #define TEXT_BOLD_YELLOW "\033[1;33m" #define TEXT_BLUE "\033[0;34m" #define TEXT_BOLD_BLUE "\033[1;34m" #define TEXT_MAGENTA "\033[0;35m" #define TEXT_BOLD_MAGENTA "\033[1;35m" #define TEXT_CYAN "\033[0;36m" #define TEXT_BOLD_CYAN "\033[1;36m" #define CLEAR_SCREEN "\033[2J" #define RESET_CURSOR "\033[1;1H" static inline bool is_stdin_console(void) { return isatty(STDIN_FILENO); } static inline bool is_stdout_console(void) { return isatty(STDOUT_FILENO); } static inline void clear_screen(void) { printf(CLEAR_SCREEN); printf(RESET_CURSOR); } extern mode_t sd_def_fmode; extern mode_t sd_def_dmode; #endif sheepdog-0.8.3/include/work.h000066400000000000000000000032541237656255000161150ustar00rootroot00000000000000#ifndef __WORK_H__ #define __WORK_H__ #include #include "list.h" #include "util.h" struct work; typedef void (*work_func_t)(struct work *); struct work { struct list_node w_list; work_func_t fn; work_func_t done; }; struct work_queue { int wq_state; struct list_head pending_list; }; enum wq_thread_control 
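/*
 * Usage sketch for the work queue API declared below (hypothetical
 * handlers): 'fn' runs on a worker thread, 'done' is the completion
 * callback invoked afterwards:
 *
 *	static void do_io(struct work *w)   { ... heavy lifting ... }
 *	static void io_done(struct work *w) { ... free(w) ... }
 *
 *	struct work_queue *wq = create_work_queue("io", WQ_DYNAMIC);
 *	struct work *w = xmalloc(sizeof(*w));
 *	w->fn = do_io;
 *	w->done = io_done;
 *	queue_work(wq, w);
 */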
{ WQ_ORDERED, /* Only 1 thread created for work queue */ WQ_DYNAMIC, /* # of threads proportional to nr_nodes created */ WQ_UNLIMITED, /* Unlimited # of threads created */ }; static inline bool is_main_thread(void) { return gettid() == getpid(); } static inline bool is_worker_thread(void) { return !is_main_thread(); } /* * Helper macros to guard variables from being accessed out of the * main thread. Note that we can use these only for pointers. */ #define main_thread(type) struct { type __val; } #define main_thread_get(var) \ ({ \ assert(is_main_thread()); \ (var).__val; \ }) #define main_thread_set(var, val) \ ({ \ assert(is_main_thread()); \ (var).__val = (val); \ }) /* * 'get_nr_nodes' is the function to get the current number of nodes and used * for dynamic work queues. 'create_cb' will be called when worker threads are * created and 'destroy_cb' will be called when worker threads are destroyed. */ int init_work_queue(size_t (*get_nr_nodes)(void)); struct work_queue *create_work_queue(const char *name, enum wq_thread_control); struct work_queue *create_ordered_work_queue(const char *name); void queue_work(struct work_queue *q, struct work *work); bool work_queue_empty(struct work_queue *q); #ifdef HAVE_TRACE void suspend_worker_threads(void); void resume_worker_threads(void); #endif /* HAVE_TRACE */ #endif sheepdog-0.8.3/lib/000077500000000000000000000000001237656255000141015ustar00rootroot00000000000000sheepdog-0.8.3/lib/Makefile.am000066400000000000000000000010341237656255000161330ustar00rootroot00000000000000MAINTAINERCLEANFILES = Makefile.in AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include noinst_LIBRARIES = libsheepdog.a libsheepdog_a_SOURCES = event.c logger.c net.c util.c rbtree.c strbuf.c \ sha1.c option.c work.c sockfd_cache.c fec.c sd_inode.c if BUILD_SHA1_HW libsheepdog_a_SOURCES += sha1_ssse3.S endif if BUILD_TRACE AM_CPPFLAGS += -DENABLE_TRACE endif # support for GNU Flymake check-syntax: $(COMPILE) -fsyntax-only $(CHK_SOURCES) check-style: @$(CHECK_STYLE) $(libsheepdog_a_SOURCES) sheepdog-0.8.3/lib/event.c000066400000000000000000000105701237656255000153710ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include #include #include #include #include "rbtree.h" #include "util.h" #include "event.h" static int efd; static struct rb_root events_tree = RB_ROOT; static void timer_handler(int fd, int events, void *data) { struct timer *t = data; uint64_t val; if (read(fd, &val, sizeof(val)) < 0) return; t->callback(t->data); unregister_event(fd); close(fd); } void add_timer(struct timer *t, unsigned int mseconds) { struct itimerspec it; int tfd; tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); if (tfd < 0) { sd_err("timerfd_create: %m"); return; } memset(&it, 0, sizeof(it)); it.it_value.tv_sec = mseconds / 1000; it.it_value.tv_nsec = (mseconds % 1000) * 1000000; if (timerfd_settime(tfd, 0, &it, NULL) < 0) { sd_err("timerfd_settime: %m"); return; } if (register_event(tfd, timer_handler, t) < 0) sd_err("failed to register timer fd"); } struct event_info { event_handler_t handler; int fd; void *data; struct rb_node rb; int prio; }; static struct epoll_event *events; static int nr_events; static int event_cmp(const struct event_info *e1, const struct event_info *e2) { return intcmp(e1->fd, e2->fd); } int init_event(int nr) { nr_events = nr; events = xcalloc(nr_events, sizeof(struct epoll_event)); efd = epoll_create(nr); if (efd < 0) { sd_err("failed to create epoll fd"); return -1; } return 0; } static struct event_info *lookup_event(int fd) { struct event_info key = { .fd = fd }; return rb_search(&events_tree, &key, rb, event_cmp); } int register_event_prio(int fd, event_handler_t h, void *data, int prio) { int ret; struct epoll_event ev; struct event_info *ei; ei = xzalloc(sizeof(*ei)); ei->fd = fd; ei->handler = h; ei->data = data; ei->prio = prio; memset(&ev, 0, sizeof(ev)); ev.events = EPOLLIN; ev.data.ptr = ei; ret = epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev); if (ret) { sd_err("failed to add epoll event for fd %d: %m", fd); free(ei); } else rb_insert(&events_tree, ei, rb, event_cmp); return ret; } void unregister_event(int fd) { int ret; struct event_info *ei; ei = lookup_event(fd); if (!ei) return; ret = epoll_ctl(efd, EPOLL_CTL_DEL, fd, NULL); if (ret) sd_err("failed to delete epoll event for fd %d: %m", fd); rb_erase(&ei->rb, &events_tree); free(ei); /* * Although ei is no longer valid pointer, ei->handler() might be about * to be called in do_event_loop(). Refreshing the event loop is safe. 
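 * (Setting the flag makes do_event_loop() jump back to epoll_wait()
 * instead of dereferencing any remaining, possibly stale, event_info
 * pointers from the previous epoll_wait() batch.)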
*/ event_force_refresh(); } int modify_event(int fd, unsigned int new_events) { int ret; struct epoll_event ev; struct event_info *ei; ei = lookup_event(fd); if (!ei) { sd_err("event info for fd %d not found", fd); return 1; } memset(&ev, 0, sizeof(ev)); ev.events = new_events; ev.data.ptr = ei; ret = epoll_ctl(efd, EPOLL_CTL_MOD, fd, &ev); if (ret) { sd_err("failed to modify epoll event for fd %d: %m", fd); return 1; } return 0; } static bool event_loop_refresh; void event_force_refresh(void) { event_loop_refresh = true; } static int epoll_event_cmp(const struct epoll_event *_a, struct epoll_event *_b) { struct event_info *a, *b; a = (struct event_info *)_a->data.ptr; b = (struct event_info *)_b->data.ptr; /* we need sort event_info array in reverse order */ return intcmp(b->prio, a->prio); } static void do_event_loop(int timeout, bool sort_with_prio) { int i, nr; refresh: event_loop_refresh = false; nr = epoll_wait(efd, events, nr_events, timeout); if (sort_with_prio) xqsort(events, nr, epoll_event_cmp); if (nr < 0) { if (errno == EINTR) return; sd_err("epoll_wait failed: %m"); exit(1); } else if (nr) { for (i = 0; i < nr; i++) { struct event_info *ei; ei = (struct event_info *)events[i].data.ptr; ei->handler(ei->fd, events[i].events, ei->data); if (event_loop_refresh) goto refresh; } } } void event_loop(int timeout) { do_event_loop(timeout, false); } void event_loop_prio(int timeout) { do_event_loop(timeout, true); } sheepdog-0.8.3/lib/fec.c000066400000000000000000000471561237656255000150170ustar00rootroot00000000000000/* * zfec -- fast forward error correction * * Copyright (C) 2007-2010 Zooko Wilcox-O'Hearn * Author: Zooko Wilcox-O'Hearn * * This file is part of zfec. * * Imported by Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* * This work is derived from the "fec" software by Luigi Rizzo, et al., the * copyright notice and licence terms of which are included below for reference. * fec.c -- forward error correction based on Vandermonde matrices 980624 (C) * 1997-98 Luigi Rizzo (luigi@iet.unipi.it) * * Portions derived from code by Phil Karn (karn@ka9q.ampr.org), * Robert Morelos-Zaragoza (robert@spectra.eng.hawaii.edu) and Hari * Thirumoorthy (harit@spectra.eng.hawaii.edu), Aug 1995 * * Modifications by Dan Rubenstein (see Modifications.txt for * their description. * Modifications (C) 1998 Dan Rubenstein (drubenst@cs.umass.edu) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. */ #include #include #include #include "fec.h" #include "util.h" /* * Primitive polynomials - see Lin & Costello, Appendix A, * and Lee & Messerschmitt, p. 453. */ static const char *const Pp = "101110001"; /* * To speed up computations, we have tables for logarithm, exponent and * inverse of a number. We use a table for multiplication as well (it takes * 64K, no big deal even on a PDA, especially because it can be * pre-initialized an put into a ROM!), otherwhise we use a table of * logarithms. In any case the macro gf_mul(x,y) takes care of * multiplications. */ static uint8_t gf_exp[510]; /* idx->poly form conversion table */ static int gf_log[256]; /* Poly->idx form conversion table */ static uint8_t inverse[256]; /* inverse of field elem. */ /* inv[\alpha**i]=\alpha**(GF_SIZE-i-1) */ /* * modnn(x) computes x % GF_SIZE, where GF_SIZE is 2**GF_BITS - 1, * without a slow divide. */ static uint8_t modnn(int x) { while (x >= 255) { x -= 255; x = (x >> 8) + (x & 255); } return x; } /* * gf_mul(x,y) multiplies two numbers. It is much faster to use a * multiplication table. * * USE_GF_MULC, GF_MULC0(c) and GF_ADDMULC(x) can be used when multiplying * many numbers by the same constant. In this case the first call sets the * constant, and others perform the multiplications. A value related to the * multiplication is held in a local variable declared with USE_GF_MULC . See * usage in _addmul1(). */ static uint8_t gf_mul_table[256][256]; #define gf_mul(x, y) gf_mul_table[x][y] #define USE_GF_MULC register uint8_t *__gf_mulc_ #define GF_MULC0(c) __gf_mulc_ = gf_mul_table[c] #define GF_ADDMULC(dst, x) dst ^= __gf_mulc_[x] /* * Generate GF(2**m) from the irreducible polynomial p(X) in p[0]..p[m] * Lookup tables: * idx->polynomial form gf_exp[] contains j= \alpha^i; * polynomial form -> idx form gf_log[ j = \alpha^i ] = i * \alpha=x is the primitive element of GF(2^m) * * For efficiency, gf_exp[] has size 2*GF_SIZE, so that a simple * multiplication of two numbers can be resolved without calling modnn */ static void _init_mul_table(void) { int i, j; for (i = 0; i < 256; i++) for (j = 0; j < 256; j++) gf_mul_table[i][j] = gf_exp[modnn(gf_log[i] + gf_log[j])]; for (j = 0; j < 256; j++) gf_mul_table[0][j] = gf_mul_table[j][0] = 0; } #define NEW_GF_MATRIX(rows, cols) \ (uint8_t *)xmalloc(rows * cols) /* initialize the data structures used for computations in GF. */ static void generate_gf(void) { int i; uint8_t mask; mask = 1; /* x ** 0 = 1 */ gf_exp[8] = 0; /* will be updated at the end of the 1st loop */ /* * first, generate the (polynomial representation of) powers of \alpha, * which are stored in gf_exp[i] = \alpha ** i . * At the same time build gf_log[gf_exp[i]] = i . * The first 8 powers are simply bits shifted to the left. 
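 *
 * Once both tables are built, a GF(2^8) product is two lookups and an add:
 * for non-zero a and b,
 * gf_mul(a, b) == gf_exp[modnn(gf_log[a] + gf_log[b])], and
 * _init_mul_table() forces the row and column for 0 to 0.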
*/ for (i = 0; i < 8; i++, mask <<= 1) { gf_exp[i] = mask; gf_log[gf_exp[i]] = i; /* * If Pp[i] == 1 then \alpha ** i occurs in poly-repr * gf_exp[8] = \alpha ** 8 */ if (Pp[i] == '1') gf_exp[8] ^= mask; } /* * now gf_exp[8] = \alpha ** 8 is complete, so can also * compute its inverse. */ gf_log[gf_exp[8]] = 8; /* * Poly-repr of \alpha ** (i+1) is given by poly-repr of * \alpha ** i shifted left one-bit and accounting for any * \alpha ** 8 term that may occur when poly-repr of * \alpha ** i is shifted. */ mask = 1 << 7; for (i = 9; i < 255; i++) { if (gf_exp[i - 1] >= mask) gf_exp[i] = gf_exp[8] ^ ((gf_exp[i - 1] ^ mask) << 1); else gf_exp[i] = gf_exp[i - 1] << 1; gf_log[gf_exp[i]] = i; } /* log(0) is not defined, so use a special value */ gf_log[0] = 255; /* set the extended gf_exp values for fast multiply */ for (i = 0; i < 255; i++) gf_exp[i + 255] = gf_exp[i]; /* * again special cases. 0 has no inverse. This used to * be initialized to 255, but it should make no difference * since noone is supposed to read from here. */ inverse[0] = 0; inverse[1] = 1; for (i = 2; i <= 255; i++) inverse[i] = gf_exp[255 - gf_log[i]]; } /* Various linear algebra operations that i use often. */ /* * addmul() computes dst[] = dst[] + c * src[] * This is used often, so better optimize it! Currently the loop is * unrolled 16 times, a good value for 486 and pentium-class machines. * The case c=0 is also optimized, whereas c=1 is not. These * calls are unfrequent in my typical apps so I did not bother. */ #define addmul(dst, src, c, sz) \ if (c != 0) \ _addmul1(dst, src, c, sz) #define UNROLL 16 /* 1, 4, 8, 16 */ static void _addmul1(register uint8_t *dst, const register uint8_t *src, uint8_t c, size_t sz) { USE_GF_MULC; const uint8_t *lim = &dst[sz - UNROLL + 1]; GF_MULC0(c); #if (UNROLL > 1) /* unrolling by 8/16 is quite effective on the pentium */ for (; dst < lim; dst += UNROLL, src += UNROLL) { GF_ADDMULC(dst[0], src[0]); GF_ADDMULC(dst[1], src[1]); GF_ADDMULC(dst[2], src[2]); GF_ADDMULC(dst[3], src[3]); #if (UNROLL > 4) GF_ADDMULC(dst[4], src[4]); GF_ADDMULC(dst[5], src[5]); GF_ADDMULC(dst[6], src[6]); GF_ADDMULC(dst[7], src[7]); #endif #if (UNROLL > 8) GF_ADDMULC(dst[8], src[8]); GF_ADDMULC(dst[9], src[9]); GF_ADDMULC(dst[10], src[10]); GF_ADDMULC(dst[11], src[11]); GF_ADDMULC(dst[12], src[12]); GF_ADDMULC(dst[13], src[13]); GF_ADDMULC(dst[14], src[14]); GF_ADDMULC(dst[15], src[15]); #endif } #endif lim += UNROLL - 1; for (; dst < lim; dst++, src++) /* final components */ GF_ADDMULC(*dst, *src); } /* computes C = AB where A is dp*d, B is d*m, C is dp*m */ static void _matmul(uint8_t *a, uint8_t *b, uint8_t *c, unsigned dp, unsigned d, unsigned m) { unsigned row, col, i; for (row = 0; row < dp; row++) { for (col = 0; col < m; col++) { uint8_t *pa = &a[row * d]; uint8_t *pb = &b[col]; uint8_t acc = 0; for (i = 0; i < d; i++, pa++, pb += m) acc ^= gf_mul(*pa, *pb); c[row * m + col] = acc; } } } /* * _invert_mat() takes a matrix and produces its inverse * d is the size of the matrix. * (Gauss-Jordan, adapted from Numerical Recipes in C) * Return non-zero if singular. */ static void _invert_mat(uint8_t *src, unsigned d) { uint8_t c, *p; unsigned irow = 0; unsigned icol = 0; unsigned row, col, i, ix; unsigned *indxc = (unsigned *)xmalloc(d * sizeof(unsigned)); unsigned *indxr = (unsigned *)xmalloc(d * sizeof(unsigned)); unsigned *ipiv = (unsigned *)xmalloc(d * sizeof(unsigned)); uint8_t *id_row = NEW_GF_MATRIX(1, d); memset(id_row, '\0', d * sizeof(uint8_t)); /* ipiv marks elements already used as pivots. 
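 * Each column is chosen as pivot exactly once, so every entry stays
 * 0 or 1 (asserted below).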
*/ for (i = 0; i < d; i++) ipiv[i] = 0; for (col = 0; col < d; col++) { uint8_t *pivot_row; /* * Zeroing column 'col', look for a non-zero element. * First try on the diagonal, if it fails, look elsewhere. */ if (ipiv[col] != 1 && src[col * d + col] != 0) { irow = col; icol = col; goto found_piv; } for (row = 0; row < d; row++) { if (ipiv[row] != 1) { for (ix = 0; ix < d; ix++) { if (ipiv[ix] == 0) { if (src[row * d + ix] != 0) { irow = row; icol = ix; goto found_piv; } } else assert(ipiv[ix] <= 1); } } } found_piv: ++(ipiv[icol]); /* * swap rows irow and icol, so afterwards the diagonal * element will be correct. Rarely done, not worth * optimizing. */ if (irow != icol) for (ix = 0; ix < d; ix++) SWAP(src[irow*d + ix], src[icol*d + ix]); indxr[col] = irow; indxc[col] = icol; pivot_row = &src[icol * d]; c = pivot_row[icol]; assert(c != 0); if (c != 1) { /* otherwhise this is a NOP */ /* * this is done often , but optimizing is not so * fruitful, at least in the obvious ways (unrolling) */ c = inverse[c]; pivot_row[icol] = 1; for (ix = 0; ix < d; ix++) pivot_row[ix] = gf_mul(c, pivot_row[ix]); } /* * from all rows, remove multiples of the selected row * to zero the relevant entry (in fact, the entry is not zero * because we know it must be zero). * (Here, if we know that the pivot_row is the identity, * we can optimize the addmul). */ id_row[icol] = 1; if (memcmp(pivot_row, id_row, d * sizeof(uint8_t)) != 0) { for (p = src, ix = 0; ix < d; ix++, p += d) { if (ix != icol) { c = p[icol]; p[icol] = 0; addmul(p, pivot_row, c, d); } } } id_row[icol] = 0; } /* done all columns */ for (col = d; col > 0; col--) if (indxr[col-1] != indxc[col-1]) for (row = 0; row < d; row++) SWAP(src[row * d + indxr[col-1]], src[row * d + indxc[col-1]]); free(indxc); free(indxr); free(ipiv); free(id_row); } /* * fast code for inverting a vandermonde matrix. * * NOTE: It assumes that the matrix is not singular and _IS_ a vandermonde * matrix. Only uses the second column of the matrix, containing the p_i's. * * Algorithm borrowed from "Numerical recipes in C" -- sec.2.8, but largely * revised for my purposes. * p = coefficients of the matrix (p_i) * q = values of the polynomial (known) */ static void _invert_vdm(uint8_t *src, unsigned d) { unsigned i, j, row, col; uint8_t *b, *c, *p; uint8_t t, xx; if (d == 1) /* degenerate case, matrix must be p^0 = 1 */ return; /* * c holds the coefficient of P(x) = Prod (x - p_i), i=0..d-1 * b holds the coefficient for the matrix inversion */ c = NEW_GF_MATRIX(1, d); b = NEW_GF_MATRIX(1, d); p = NEW_GF_MATRIX(1, d); for (j = 1, i = 0; i < d; i++, j += d) { c[i] = 0; p[i] = src[j]; /* p[i] */ } /* * construct coeffs. recursively. We know c[d] = 1 (implicit) * and start P_0 = x - p_0, then at each stage multiply by * x - p_i generating P_i = x P_{i-1} - p_i P_{i-1} * After d steps we are done. */ c[d - 1] = p[0]; /* really -p(0), but x = -x in GF(2^m) */ for (i = 1; i < d; i++) { uint8_t p_i = p[i]; /* see above comment */ for (j = d - 1 - (i - 1); j < d - 1; j++) c[j] ^= gf_mul(p_i, c[j + 1]); c[d - 1] ^= p_i; } for (row = 0; row < d; row++) { /* synthetic division etc. */ xx = p[row]; t = 1; b[d - 1] = 1; /* this is in fact c[d] */ for (i = d - 1; i > 0; i--) { b[i-1] = c[i] ^ gf_mul(xx, b[i]); t = gf_mul(xx, t) ^ b[i-1]; } for (col = 0; col < d; col++) src[col * d + row] = gf_mul(inverse[t], b[col]); } free(c); free(b); free(p); return; } void init_fec(void) { generate_gf(); _init_mul_table(); } /* * This section contains the proper FEC encoding/decoding routines. 
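 * The resulting code is systematic: the top d rows of the dp x d encoding
 * matrix form the identity, so the first d blocks of a stripe are the data
 * itself and only the remaining dp - d blocks carry parity.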
* The encoding matrix is computed starting with a Vandermonde matrix, * and then transforming it into a systematic matrix. */ #define FEC_MAGIC 0xFECC0DEC void fec_free(struct fec *p) { assert(p != NULL && p->magic == (((FEC_MAGIC ^ p->d) ^ p->dp) ^ (unsigned long) (p->enc_matrix))); free(p->enc_matrix); free(p); } struct fec *fec_new(unsigned short d, unsigned short dp) { unsigned row, col; uint8_t *p, *tmp_m; struct fec *retval; retval = (struct fec *)xmalloc(sizeof(struct fec)); retval->d = d; retval->dp = dp; retval->enc_matrix = NEW_GF_MATRIX(dp, d); retval->magic = ((FEC_MAGIC^d)^dp)^(unsigned long)(retval->enc_matrix); tmp_m = NEW_GF_MATRIX(dp, d); /* * fill the matrix with powers of field elements, starting from 0. * The first row is special, cannot be computed with exp. table. */ tmp_m[0] = 1; for (col = 1; col < d; col++) tmp_m[col] = 0; for (p = tmp_m + d, row = 0; row < dp - 1; row++, p += d) for (col = 0; col < d; col++) p[col] = gf_exp[modnn(row * col)]; /* * quick code to build systematic matrix: invert the top * d*d vandermonde matrix, multiply right the bottom dp-d rows * by the inverse, and construct the identity matrix at the top. */ _invert_vdm(tmp_m, d); /* much faster than _invert_mat */ _matmul(tmp_m + d * d, tmp_m, retval->enc_matrix + d * d, dp - d, d, d); /* the upper matrix is I so do not bother with a slow multiply */ memset(retval->enc_matrix, '\0', d * d * sizeof(uint8_t)); for (p = retval->enc_matrix, col = 0; col < d; col++, p += d + 1) *p = 1; free(tmp_m); return retval; } /* * To make sure that we stay within cache in the inner loops of fec_encode(). * (It would probably help to also do this for fec_decode(). */ #ifndef STRIDE #define STRIDE 8192 #endif void fec_encode(const struct fec *code, const uint8_t *const *const src, uint8_t *const *const fecs, const int *const block_nums, size_t num_block_nums, size_t sz) { unsigned char i, j; size_t d; unsigned fecnum; const uint8_t *p; for (d = 0; d < sz; d += STRIDE) { size_t stride = ((sz-d) < STRIDE) ? (sz-d) : STRIDE; for (i = 0; i < num_block_nums; i++) { fecnum = block_nums[i]; assert(fecnum >= code->d); memset(fecs[i]+d, 0, stride); p = &(code->enc_matrix[fecnum * code->d]); for (j = 0; j < code->d; j++) addmul(fecs[i]+d, src[j]+d, p[j], stride); } } } /* * Build decode matrix into some memory space. * * @param matrix a space allocated for a d by d matrix */ static void build_decode_matrix_into_space(const struct fec *const code, const int *const idx, const unsigned d, uint8_t *const matrix) { unsigned char i; uint8_t *p; for (i = 0, p = matrix; i < d; i++, p += d) { if (idx[i] < d) { memset(p, 0, d); p[i] = 1; } else { memcpy(p, &(code->enc_matrix[idx[i] * code->d]), d); } } _invert_mat(matrix, d); } void fec_decode(const struct fec *code, const uint8_t *const *const inpkts, uint8_t *const *const outpkts, const int *const idx, size_t sz) { uint8_t m_dec[code->d * code->d]; unsigned char outix = 0; unsigned char row = 0; unsigned char col = 0; assert(code->d * code->d < 8 * 1024 * 1024); build_decode_matrix_into_space(code, idx, code->d, m_dec); for (row = 0; row < code->d; row++) { /* * If the block whose number is i is present, then it is * required to be in the i'th element. 
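		 * A slot holding a parity block instead (idx[row] >= d)
		 * marks a missing data block that is reconstructed here;
		 * decode_prepare() below arranges the inputs this way.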
*/ assert((idx[row] >= code->d) || (idx[row] == row)); if (idx[row] >= code->d) { memset(outpkts[outix], 0, sz); for (col = 0; col < code->d; col++) addmul(outpkts[outix], inpkts[col], m_dec[row * code->d + col], sz); outix++; } } } /* * fec_decode need primary(data) strips in the numeric place, e,g, we have * indexes passed as { 0, 2, 4, 5 } and 4, 5 are parity strip, we need to pass * { 0, 4, 2, 5 } (we choose this form) or { 0, 5, 2, 4} to it. * * Return out and outidx as fec_decode requested. */ static inline void decode_prepare(struct fec *ctx, const uint8_t *dp[], const uint8_t *out[], int outidx[]) { int i, p = 0; for (i = ctx->d; i < ctx->dp; i++) { if (dp[i]) { p = i; break; } } for (i = 0; i < ctx->d; i++) { if (dp[i]) { out[i] = dp[i]; outidx[i] = i; } else { assert(p < ctx->dp); out[i] = dp[p]; outidx[i] = p; while (++p < ctx->dp && !dp[p]) ; } } } static inline bool data_is_missing(const uint8_t *dp[], int d) { for (int i = 0; i < d; i++) if (!dp[i]) return true; return false; } /* * This function takes input strips and return the lost strip * * @input: strips (either ds or ps) that are used to generate lost strips * @inidx: indexes of each input strip in the whole stripe, must be in numeric * order such as { 0, 2, 4, 5 } * @output: the lost ds or ps to return * @idx: index of output which is lost */ void ec_decode(struct fec *ctx, const uint8_t *input[], const int inidx[], uint8_t output[], int idx) { int edp = ctx->dp, ep = ctx->dp - ctx->d, ed = ctx->d; const uint8_t *dp[edp]; const uint8_t *oin[ed]; int oidx[ed], i; int strip_size = SD_EC_DATA_STRIPE_SIZE / ed; uint8_t m0[strip_size], m1[strip_size], m2[strip_size], m3[strip_size], m4[strip_size], m5[strip_size], m6[strip_size], m7[strip_size], m8[strip_size], m9[strip_size], m10[strip_size], m11[strip_size], m12[strip_size], m13[strip_size], m14[strip_size], m15[strip_size], p0[strip_size], p1[strip_size], p2[strip_size], p3[strip_size], p4[strip_size], p5[strip_size], p6[strip_size], p7[strip_size], p8[strip_size], p9[strip_size], p10[strip_size], p11[strip_size], p12[strip_size], p13[strip_size], p14[strip_size]; uint8_t *missing[SD_EC_MAX_STRIP] = { m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15 }; uint8_t *p[SD_EC_MAX_STRIP - 1] = { p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14 }; for (i = 0; i < edp; i++) dp[i] = NULL; for (i = 0; i < ed; i++) oin[i] = NULL; for (i = 0; i < ed; i++) oidx[i] = 0; for (i = 0; i < ed; i++) dp[inidx[i]] = input[i]; decode_prepare(ctx, dp, oin, oidx); /* Fill the data strip if missing */ if (data_is_missing(dp, ed)) { int m = 0; fec_decode(ctx, oin, missing, oidx, strip_size); for (i = 0; i < ed; i++) if (!dp[i]) dp[i] = missing[m++]; } if (idx < ed) goto out; /* Fill the parity strip */ ec_encode(ctx, dp, p); for (i = 0; i < ep; i++) dp[ed + i] = p[i]; out: memcpy(output, dp[idx], strip_size); } void ec_decode_buffer(struct fec *ctx, uint8_t *input[], const int in_idx[], char *buf, int idx) { int i, j, d = ctx->d; size_t strip_size = SD_EC_DATA_STRIPE_SIZE / d; for (i = 0; i < SD_EC_NR_STRIPE_PER_OBJECT; i++) { const uint8_t *in[d]; uint8_t out[strip_size]; for (j = 0; j < d; j++) in[j] = input[j] + strip_size * i; ec_decode(ctx, in, in_idx, out, idx); memcpy(buf + strip_size * i, out, strip_size); } } sheepdog-0.8.3/lib/logger.c000066400000000000000000000437031237656255000155330ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * * This code is based on log.c from Linux target framework (tgt): * Copyright (C) 2002-2003 Ardis Technolgies */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" static bool colorize; static const char * const log_color[] = { [SDOG_EMERG] = TEXT_BOLD_RED, [SDOG_ALERT] = TEXT_BOLD_RED, [SDOG_CRIT] = TEXT_BOLD_RED, [SDOG_ERR] = TEXT_BOLD_RED, [SDOG_WARNING] = TEXT_BOLD_YELLOW, [SDOG_NOTICE] = TEXT_BOLD_CYAN, [SDOG_INFO] = TEXT_CYAN, [SDOG_DEBUG] = TEXT_GREEN, }; static const char * const log_prio_str[] = { [SDOG_EMERG] = "EMERG", [SDOG_ALERT] = "ALERT", [SDOG_CRIT] = "CRIT", [SDOG_ERR] = "ERROR", [SDOG_WARNING] = "WARN", [SDOG_NOTICE] = "NOTICE", [SDOG_INFO] = "INFO", [SDOG_DEBUG] = "DEBUG", }; static struct logger_user_info *logger_user_info; static void dolog(int prio, const char *func, int line, const char *fmt, va_list ap) __printf(4, 0); union semun { int val; struct semid_ds *buf; unsigned short int *array; struct seminfo *__buf; }; struct logarea { bool active; char *tail; char *start; char *end; int semid; union semun semarg; int fd; }; #define FUNC_NAME_SIZE 32 /* according to C89, including '\0' */ struct logmsg { struct timeval tv; int prio; char func[FUNC_NAME_SIZE]; int line; char worker_name[MAX_THREAD_NAME_LEN]; int worker_idx; size_t str_len; char str[0]; }; typedef int (*formatter_fn)(char *, size_t, const struct logmsg *, bool); struct log_format { const char *name; formatter_fn formatter; struct list_node list; }; static LIST_HEAD(log_formats); static struct log_format *format; static int log_fd = -1; static __thread const char *worker_name; static __thread int worker_idx; static struct logarea *la; static const char *log_name; static char *log_nowname; int sd_log_level = SDOG_INFO; static pid_t sheep_pid; pid_t logger_pid = -1; static key_t semkey; static char *log_buff; static int64_t max_logsize = 500 * 1024 * 1024; /*500MB*/ static enum log_dst_type dst_type = LOG_DST_STDOUT; /* * block_sighup() * * used for protecting log_fd from SIGHUP rotation */ static void block_sighup(void) { int ret; sigset_t new, old; sigemptyset(&new); sigemptyset(&old); sigaddset(&new, SIGHUP); ret = sigprocmask(SIG_BLOCK, &new, &old); if (ret < 0) syslog(LOG_ERR, "blocking SIGHUP failed\n"); } static void unblock_sighup(void) { int ret; sigset_t new, old; sigemptyset(&new); sigemptyset(&old); sigaddset(&new, SIGHUP); ret = sigprocmask(SIG_UNBLOCK, &new, &old); if (ret < 0) syslog(LOG_ERR, "unblock SIGHUP failed\n"); } static const char *format_thread_name(char *str, size_t size, const char *name, int idx) { if (name && name[0] && idx) snprintf(str, size, "%s %d", name, idx); else if (name && name[0]) snprintf(str, size, "%s", name); else snprintf(str, size, "main"); return str; } static int server_log_formatter(char *buff, size_t size, const struct logmsg *msg, bool print_time) { char *p = buff; struct tm tm; ssize_t len; char thread_name[MAX_THREAD_NAME_LEN]; if (print_time) { localtime_r(&msg->tv.tv_sec, &tm); len = strftime(p, size, "%b %2d %H:%M:%S ", (const struct tm *)&tm); p += len; size -= len; } len = snprintf(p, 
size, "%s%6s %s[%s] %s(%d) %s%s%s", colorize ? log_color[msg->prio] : "", log_prio_str[msg->prio], colorize ? TEXT_YELLOW : "", format_thread_name(thread_name, sizeof(thread_name), msg->worker_name, msg->worker_idx), msg->func, msg->line, colorize ? log_color[msg->prio] : "", msg->str, colorize ? TEXT_NORMAL : ""); if (len < 0) len = 0; p += min((size_t)len, size - 1); return p - buff; } static int default_log_formatter(char *buff, size_t size, const struct logmsg *msg, bool print_time) { size_t len = min(size, msg->str_len); memcpy(buff, msg->str, len); return len; } static int json_log_formatter(char *buff, size_t size, const struct logmsg *msg, bool print_time) { char *p = buff; ssize_t len; assert(logger_user_info); len = snprintf(p, size, "{ \"user_info\": " "{\"program_name\": \"%s\", \"port\": %d}," "\"body\": {" "\"second\": %lu, \"usecond\": %lu, " "\"worker_name\": \"%s\", \"worker_idx\": %d, " "\"func\": \"%s\", \"line\": %d, " "\"msg\": \"", log_name, logger_user_info->port, msg->tv.tv_sec, msg->tv.tv_usec, msg->worker_name[0] ? msg->worker_name : "main", msg->worker_idx, msg->func, msg->line); if (len < 0) return 0; len = min((size_t)len, size - 1); p += len; size -= len; for (int i = 0; i < msg->str_len; i++) { if (size <= 1) break; if (msg->str[i] == '"') { *p++ = '\\'; size--; } if (size <= 1) break; *p++ = msg->str[i]; size--; } pstrcpy(p, size, "\"} }"); p += strlen(p); return p - buff; } static void log_format_register(const char *name, formatter_fn formatter) { struct log_format *f = xmalloc(sizeof(struct log_format)); f->name = name; f->formatter = formatter; list_add(&f->list, &log_formats); } /* * We need to set default log formatter because dog doesn't want to call * select_log_formatter(). */ static void __attribute__((constructor)) init_log_formatter(void) { struct log_format *f; log_format_register("json", json_log_formatter); log_format_register("server", server_log_formatter); log_format_register("default", default_log_formatter); list_for_each_entry(f, &log_formats, list) { if (!strcmp(f->name, "default")) { format = f; return; } } syslog(LOG_ERR, "failed to set default formatter\n"); exit(1); } static int logarea_init(int size) { int shmid; shmid = shmget(IPC_PRIVATE, sizeof(struct logarea), 0644 | IPC_CREAT | IPC_EXCL); if (shmid == -1) { syslog(LOG_ERR, "shmget logarea failed: %m"); return 1; } la = shmat(shmid, NULL, 0); if (!la) { syslog(LOG_ERR, "shmat logarea failed: %m"); return 1; } shmctl(shmid, IPC_RMID, NULL); if (size < MAX_MSG_SIZE) size = LOG_SPACE_SIZE; shmid = shmget(IPC_PRIVATE, size, 0644 | IPC_CREAT | IPC_EXCL); if (shmid == -1) { syslog(LOG_ERR, "shmget msg failed: %m"); shmdt(la); return 1; } la->start = shmat(shmid, NULL, 0); if (!la->start) { syslog(LOG_ERR, "shmat msg failed: %m"); shmdt(la); return 1; } memset(la->start, 0, size); shmctl(shmid, IPC_RMID, NULL); la->end = la->start + size; la->tail = la->start; la->semid = semget(semkey, 1, 0666 | IPC_CREAT); if (la->semid < 0) { syslog(LOG_ERR, "semget failed: %m"); shmdt(la->start); shmdt(la); return 1; } la->semarg.val = 1; if (semctl(la->semid, 0, SETVAL, la->semarg) < 0) { syslog(LOG_ERR, "semctl failed: %m"); shmdt(la->start); shmdt(la); return 1; } return 0; } static void free_logarea(void) { if (log_fd >= 0) close(log_fd); semctl(la->semid, 0, IPC_RMID, la->semarg); shmdt(la->start); shmdt(la); } /* this one can block under memory pressure */ static void log_syslog(const struct logmsg *msg) { char str[MAX_MSG_SIZE]; int len; len = format->formatter(str, sizeof(str) - 1, msg, 
log_fd >= 0); if (dst_type == LOG_DST_DEFAULT) str[len++] = '\n'; else /* LOG_DST_SYSLOG */ str[len++] = '\0'; block_sighup(); if (log_fd >= 0) xwrite(log_fd, str, len); else syslog(msg->prio, "%s", str); unblock_sighup(); } static void init_logmsg(struct logmsg *msg, struct timeval *tv, int prio, const char *func, int line) { msg->tv = *tv; msg->prio = prio; pstrcpy(msg->func, FUNC_NAME_SIZE, func); msg->line = line; if (worker_name) pstrcpy(msg->worker_name, MAX_THREAD_NAME_LEN, worker_name); else msg->worker_name[0] = '\0'; msg->worker_idx = worker_idx; } static void dolog(int prio, const char *func, int line, const char *fmt, va_list ap) { char buf[sizeof(struct logmsg) + MAX_MSG_SIZE]; char *str = buf + sizeof(struct logmsg); struct logmsg *msg = (struct logmsg *)buf; int len = 0; struct timeval tv; gettimeofday(&tv, NULL); len = vsnprintf(str, MAX_MSG_SIZE, fmt, ap); if (len < 0) { syslog(LOG_ERR, "vsnprintf failed"); return; } msg->str_len = min(len, MAX_MSG_SIZE - 1); if (la) { struct sembuf ops; ops.sem_num = 0; ops.sem_flg = SEM_UNDO; ops.sem_op = -1; if (semop(la->semid, &ops, 1) < 0) { syslog(LOG_ERR, "semop up failed: %m"); return; } /* not enough space: drop msg */ if (len + sizeof(struct logmsg) + 1 > la->end - la->tail) syslog(LOG_ERR, "enqueue: log area overrun, " "dropping message\n"); else { /* ok, we can stage the msg in the area */ msg = (struct logmsg *)la->tail; init_logmsg(msg, &tv, prio, func, line); memcpy(msg->str, str, len + 1); msg->str_len = len; la->tail += sizeof(struct logmsg) + len + 1; } ops.sem_op = 1; if (semop(la->semid, &ops, 1) < 0) { syslog(LOG_ERR, "semop down failed: %m"); return; } } else { char str_final[MAX_MSG_SIZE]; init_logmsg(msg, &tv, prio, func, line); len = format->formatter(str_final, sizeof(str_final) - 1, msg, true); str_final[len++] = '\n'; xwrite(fileno(stderr), str_final, len); fflush(stderr); } } static void rotate_log(void) { int new_fd; if (access(log_nowname, R_OK) == 0) { char old_logfile[256]; time_t t; struct tm tm; time(&t); localtime_r((const time_t *)&t, &tm); snprintf(old_logfile, sizeof(old_logfile), "%s.%04d-%02d-%02d-%02d-%02d", log_nowname, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min); rename(log_nowname, old_logfile); } new_fd = open(log_nowname, O_RDWR | O_CREAT | O_APPEND, 0644); if (new_fd < 0) { syslog(LOG_ERR, "failed to create new log file\n"); exit(1); } if (dup2(new_fd, log_fd) < 0) { syslog(LOG_ERR, "failed to dup2 the log fd\n"); exit(1); } close(new_fd); } void log_write(int prio, const char *func, int line, const char *fmt, ...) 
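/*
 * The sd_*() logging macros end up here; messages whose priority is
 * numerically above the current log level are dropped immediately.
 */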
{ va_list ap; if (prio > sd_log_level) return; va_start(ap, fmt); dolog(prio, func, line, fmt, ap); va_end(ap); } static void log_flush(void) { struct sembuf ops; size_t size, done = 0; const struct logmsg *msg; if (la->tail == la->start) return; ops.sem_num = 0; ops.sem_flg = SEM_UNDO; ops.sem_op = -1; if (semop(la->semid, &ops, 1) < 0) { syslog(LOG_ERR, "semop up failed: %m"); exit(1); } size = la->tail - la->start; memcpy(log_buff, la->start, size); memset(la->start, 0, size); la->tail = la->start; ops.sem_op = 1; if (semop(la->semid, &ops, 1) < 0) { syslog(LOG_ERR, "semop down failed: %m"); exit(1); } while (done < size) { msg = (const struct logmsg *)(log_buff + done); log_syslog(msg); done += sizeof(*msg) + msg->str_len + 1; } } static bool is_sheep_dead(int signo) { return signo == SIGHUP; } static void crash_handler(int signo) { if (is_sheep_dead(signo)) sd_err("sheep pid %d exited unexpectedly.", sheep_pid); else { sd_err("logger pid %d exits unexpectedly (%s).", getpid(), strsignal(signo)); sd_backtrace(); } log_flush(); closelog(); free_logarea(); /* If the signal isn't caused by the logger crash, we simply exit. */ if (is_sheep_dead(signo)) exit(1); reraise_crash_signal(signo, 1); } static void sighup_handler(int signo) { rotate_log(); } static void logger(char *log_dir, char *outfile) { int fd; log_buff = xzalloc(la->end - la->start); if (dst_type == LOG_DST_DEFAULT) { log_fd = open(outfile, O_CREAT | O_RDWR | O_APPEND, 0644); if (log_fd < 0) { syslog(LOG_ERR, "failed to open %s\n", outfile); exit(1); } } la->active = true; fd = open("/dev/null", O_RDWR); if (fd < 0) { syslog(LOG_ERR, "failed to open /dev/null: %m\n"); exit(1); } dup2(fd, 0); dup2(fd, 1); dup2(fd, 2); setsid(); if (chdir(log_dir) < 0) { syslog(LOG_ERR, "failed to chdir to %s: %m\n", log_dir); exit(1); } /* flush when either the logger or its parent dies */ install_crash_handler(crash_handler); install_sighandler(SIGHUP, sighup_handler, false); /* * we need to check the aliveness of the sheep process since * it could die before the logger call prctl. */ if (kill(sheep_pid, 0) < 0) kill(logger_pid, SIGHUP); while (la->active) { log_flush(); block_sighup(); if (dst_type == LOG_DST_DEFAULT && max_logsize) { off_t offset; offset = lseek(log_fd, 0, SEEK_END); if (offset < 0) { syslog(LOG_ERR, "sheep log error\n"); } else { size_t log_size = (size_t)offset; if (log_size >= max_logsize) rotate_log(); } } unblock_sighup(); if (getppid() == 1) /* My parent (sheep process) is dead. */ break; sleep(1); } log_flush(); free(log_buff); free_logarea(); exit(0); } void early_log_init(const char *format_name, struct logger_user_info *user_info) { struct log_format *f; logger_user_info = user_info; list_for_each_entry(f, &log_formats, list) { if (!strcmp(f->name, format_name)) { format = f; return; } } sd_err("invalid log format: %s", format_name); sd_err("valid options are:"); list_for_each_entry(f, &log_formats, list) { sd_err("\t%s", f->name); } exit(1); } int log_init(const char *program_name, enum log_dst_type type, int level, char *outfile) { char log_dir[PATH_MAX], tmp[PATH_MAX]; int size = level == SDOG_DEBUG ? 
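		   /* debug runs stage many more messages, so use the larger shared area */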
LOG_SPACE_DEBUG_SIZE : LOG_SPACE_SIZE; dst_type = type; sd_log_level = level; log_name = program_name; log_nowname = outfile; pstrcpy(tmp, sizeof(tmp), outfile); pstrcpy(log_dir, sizeof(log_dir), dirname(tmp)); semkey = random(); switch (type) { case LOG_DST_STDOUT: if (is_stdout_console()) colorize = true; break; case LOG_DST_SYSLOG: openlog(program_name, LOG_PID, LOG_DAEMON); /* fall through */ case LOG_DST_DEFAULT: if (logarea_init(size)) { syslog(LOG_ERR, "failed to initialize the logger\n"); return 1; } /* * Store the pid of the sheep process for use by the death * signal handler. By the time the child is notified of * the parents death the parent has been reparanted to init * and getppid() will always return 1. */ sheep_pid = getpid(); logger_pid = fork(); if (logger_pid < 0) { syslog(LOG_ERR, "failed to fork the logger process: %m\n"); return 1; } if (logger_pid) syslog(LOG_WARNING, "logger pid %d starting\n", logger_pid); else logger(log_dir, outfile); break; default: sd_err("unknown type of log destination type: %d", type); return -1; } return 0; } void log_close(void) { pid_t pid; if (!la) return; while (true) { la->active = false; pid = waitpid(logger_pid, NULL, WNOHANG); if (pid == 0) { usleep(100000); continue; } else if (pid > 0) { syslog(LOG_WARNING, "logger pid %d stopped\n", logger_pid); closelog(); free_logarea(); break; } else { syslog(LOG_ERR, "waitpid() failure\n"); exit(1); } } } void set_thread_name(const char *name, bool show_idx) { worker_name = name; if (show_idx) worker_idx = gettid(); } void get_thread_name(char *name) { format_thread_name(name, MAX_THREAD_NAME_LEN, worker_name, worker_idx); } #define SD_MAX_STACK_DEPTH 1024 static bool check_gdb(void) { return system("which gdb > /dev/null") == 0; } #define SD_ARG_MAX (sysconf(_SC_ARG_MAX)) static int gdb_cmd(const char *cmd) { char time_str[256], cmd_str[SD_ARG_MAX]; time_t ti; struct tm tm; if (!check_gdb()) { sd_debug("cannot find gdb"); return -1; } time(&ti); localtime_r(&ti, &tm); strftime(time_str, sizeof(time_str), "%b %2d %H:%M:%S ", &tm); snprintf(cmd_str, sizeof(cmd_str), "gdb -nw %s %d -batch >/dev/null 2>&1" " -ex 'set logging on'" " -ex 'echo \\n'" " -ex 'echo ==\\n'" " -ex 'echo == %s\\n'" " -ex 'echo == program: %s\\n'" " -ex 'echo == command: %s\\n'" " -ex 'echo ==\\n'" " -ex '%s'" " -ex 'set logging off'", my_exe_path(), getpid(), time_str, my_exe_path(), cmd, cmd); return system(cmd_str); } int __sd_dump_variable(const char *var) { char cmd[256]; snprintf(cmd, sizeof(cmd), "p %s", var); return gdb_cmd(cmd); } static int dump_stack_frames(void) { return gdb_cmd("thread apply all where full"); } __attribute__ ((__noinline__)) void sd_backtrace(void) { void *addrs[SD_MAX_STACK_DEPTH]; int i, n = backtrace(addrs, ARRAY_SIZE(addrs)); for (i = 1; i < n; i++) { /* addrs[0] is here, so skip it */ void *addr = addrs[i]; char cmd[SD_ARG_MAX], info[256], **str; FILE *f; /* * The called function is at the previous address * because addr contains a return address */ addr = (void *)((char *)addr - 1); /* try to get a line number with addr2line if possible */ snprintf(cmd, sizeof(cmd), "addr2line -s -e %s -f -i %p | " "perl -e '@a=<>; chomp @a; print \"$a[1]: $a[0]\"'", my_exe_path(), addr); f = popen(cmd, "r"); if (!f) goto fallback; if (fgets(info, sizeof(info), f) == NULL) goto fallback_close; if (info[0] != '?' 
&& info[0] != '\0') sd_emerg("%s", chomp(info)); else goto fallback_close; pclose(f); continue; /* * Failed to get a line number, so simply use * backtrace_symbols instead */ fallback_close: pclose(f); fallback: str = backtrace_symbols(&addr, 1); sd_emerg("%s", *str); free(str); } /* dump the stack frames if possible*/ dump_stack_frames(); } void set_loglevel(int new_loglevel) { assert(SDOG_EMERG <= new_loglevel && new_loglevel <= SDOG_DEBUG); sd_log_level = new_loglevel; } int get_loglevel(void) { return sd_log_level; } sheepdog-0.8.3/lib/net.c000066400000000000000000000273431237656255000150440ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sheepdog_proto.h" #include "sheep.h" #include "util.h" #include "event.h" #include "net.h" int conn_tx_off(struct connection *conn) { conn->events &= ~EPOLLOUT; return modify_event(conn->fd, conn->events); } int conn_tx_on(struct connection *conn) { conn->events |= EPOLLOUT; return modify_event(conn->fd, conn->events); } int conn_rx_off(struct connection *conn) { conn->events &= ~EPOLLIN; return modify_event(conn->fd, conn->events); } int conn_rx_on(struct connection *conn) { conn->events |= EPOLLIN; return modify_event(conn->fd, conn->events); } int create_listen_ports(const char *bindaddr, int port, int (*callback)(int fd, void *), void *data) { char servname[64]; int fd, ret, opt; int success = 0; struct addrinfo hints, *res, *res0; memset(servname, 0, sizeof(servname)); snprintf(servname, sizeof(servname), "%d", port); memset(&hints, 0, sizeof(hints)); hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_PASSIVE; ret = getaddrinfo(bindaddr, servname, &hints, &res0); if (ret) { sd_err("failed to get address info: %m"); return 1; } for (res = res0; res; res = res->ai_next) { fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); if (fd < 0) continue; opt = 1; ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); if (ret) sd_err("failed to set SO_REUSEADDR: %m"); opt = 1; if (res->ai_family == AF_INET6) { ret = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &opt, sizeof(opt)); if (ret) { close(fd); continue; } } ret = bind(fd, res->ai_addr, res->ai_addrlen); if (ret) { sd_err("failed to bind server socket: %m"); close(fd); continue; } ret = listen(fd, SOMAXCONN); if (ret) { sd_err("failed to listen on server socket: %m"); close(fd); continue; } ret = callback(fd, data); if (ret) { close(fd); continue; } success++; } freeaddrinfo(res0); if (!success) sd_err("failed to create a listening port"); return !success; } int connect_to(const char *name, int port) { char buf[64]; char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; int fd, ret; struct addrinfo hints, *res, *res0; struct linger linger_opt = {1, 0}; memset(&hints, 0, sizeof(hints)); snprintf(buf, sizeof(buf), "%d", port); hints.ai_socktype = SOCK_STREAM; ret = getaddrinfo(name, buf, &hints, &res0); if (ret) { sd_err("failed to get address info: %m"); return -1; } for (res = res0; res; res = res->ai_next) { ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf), 
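/*
 * For reference, a minimal caller of create_listen_ports() could look
 * like the sketch below; accept_cb and listen_handler are illustrative
 * names, not part of this file:
 *
 *	static int accept_cb(int listen_fd, void *data)
 *	{
 *		return register_event(listen_fd, listen_handler, data);
 *	}
 *
 *	if (create_listen_ports(NULL, port, accept_cb, NULL))
 *		exit(1);
 *
 * A NULL bindaddr together with AI_PASSIVE makes getaddrinfo() return
 * the wildcard addresses, so the callback is invoked once per bound
 * socket and the daemon listens on every interface.
 */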
sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV); if (ret) continue; fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); if (fd < 0) continue; ret = setsockopt(fd, SOL_SOCKET, SO_LINGER, &linger_opt, sizeof(linger_opt)); if (ret) { sd_err("failed to set SO_LINGER: %m"); close(fd); continue; } ret = set_snd_timeout(fd); if (ret) { sd_err("failed to set send timeout: %m"); close(fd); break; } ret = set_rcv_timeout(fd); if (ret) { sd_err("failed to set recv timeout: %m"); close(fd); break; } reconnect: ret = connect(fd, res->ai_addr, res->ai_addrlen); if (ret) { if (errno == EINTR) goto reconnect; sd_err("failed to connect to %s:%d: %m", name, port); close(fd); continue; } ret = set_nodelay(fd); if (ret) { sd_err("%m"); close(fd); break; } else goto success; } fd = -1; success: freeaddrinfo(res0); sd_debug("%d, %s:%d", fd, name, port); return fd; } int do_read(int sockfd, void *buf, int len, bool (*need_retry)(uint32_t epoch), uint32_t epoch, uint32_t max_count) { int ret, repeat = max_count; reread: ret = read(sockfd, buf, len); if (ret == 0) { sd_debug("connection is closed (%d bytes left)", len); return 1; } if (ret < 0) { if (errno == EINTR) goto reread; /* * Since we set timeout for read, we'll get EAGAIN even for * blocking sockfd. */ if (errno == EAGAIN && repeat && (need_retry == NULL || need_retry(epoch))) { repeat--; goto reread; } sd_err("failed to read from socket: %d, %m", ret); return 1; } len -= ret; buf = (char *)buf + ret; if (len) goto reread; return 0; } static void forward_iov(struct msghdr *msg, int len) { while (msg->msg_iov->iov_len <= len) { len -= msg->msg_iov->iov_len; msg->msg_iov++; msg->msg_iovlen--; } msg->msg_iov->iov_base = (char *) msg->msg_iov->iov_base + len; msg->msg_iov->iov_len -= len; } static int do_write(int sockfd, struct msghdr *msg, int len, bool (*need_retry)(uint32_t), uint32_t epoch, uint32_t max_count) { int ret, repeat = max_count; rewrite: ret = sendmsg(sockfd, msg, 0); if (ret < 0) { if (errno == EINTR) goto rewrite; /* * Since we set timeout for write, we'll get EAGAIN even for * blocking sockfd. 
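 *
 * Together with the retry counter this bounds how long a call may
 * stall: each EAGAIN costs at most one send/receive timeout period, so
 * a caller passing max_count = N waits roughly N timeout periods in
 * the worst case before the function gives up (assuming need_retry
 * keeps returning true).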
*/ if (errno == EAGAIN && repeat && (need_retry == NULL || need_retry(epoch))) { repeat--; goto rewrite; } sd_err("failed to write to socket: %m"); return 1; } len -= ret; if (len) { forward_iov(msg, ret); goto rewrite; } return 0; } int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen, bool (*need_retry)(uint32_t epoch), uint32_t epoch, uint32_t max_count) { int ret; struct msghdr msg; struct iovec iov[2]; memset(&msg, 0, sizeof(msg)); msg.msg_iov = iov; msg.msg_iovlen = 1; iov[0].iov_base = hdr; iov[0].iov_len = sizeof(*hdr); if (wlen) { msg.msg_iovlen++; iov[1].iov_base = data; iov[1].iov_len = wlen; } ret = do_write(sockfd, &msg, sizeof(*hdr) + wlen, need_retry, epoch, max_count); if (ret) { sd_err("failed to send request %x, %d: %m", hdr->opcode, wlen); ret = -1; } return ret; } int exec_req(int sockfd, struct sd_req *hdr, void *data, bool (*need_retry)(uint32_t epoch), uint32_t epoch, uint32_t max_count) { int ret; struct sd_rsp *rsp = (struct sd_rsp *)hdr; unsigned int wlen, rlen; if (hdr->flags & SD_FLAG_CMD_WRITE) { wlen = hdr->data_length; rlen = 0; } else { wlen = 0; rlen = hdr->data_length; } if (send_req(sockfd, hdr, data, wlen, need_retry, epoch, max_count)) return 1; ret = do_read(sockfd, rsp, sizeof(*rsp), need_retry, epoch, max_count); if (ret) { sd_err("failed to read a response"); return 1; } if (rlen > rsp->data_length) rlen = rsp->data_length; if (rlen) { ret = do_read(sockfd, data, rlen, need_retry, epoch, max_count); if (ret) { sd_err("failed to read the response data"); return 1; } } return 0; } const char *addr_to_str(const uint8_t *addr, uint16_t port) { static __thread char str[HOST_NAME_MAX + 8]; int af = AF_INET6; int addr_start_idx = 0; const char *ret; /* Find address family type */ if (addr[12]) { int oct_no = 0; while (!addr[oct_no] && oct_no++ < 12) ; if (oct_no == 12) { af = AF_INET; addr_start_idx = 12; } } ret = inet_ntop(af, addr + addr_start_idx, str, sizeof(str)); if (unlikely(ret == NULL)) panic("failed to convert addr to string, %m"); if (port) { int len = strlen(str); snprintf(str + len, sizeof(str) - len, ":%d", port); } return str; } char *sockaddr_in_to_str(struct sockaddr_in *sockaddr) { int i, si; static char str[32]; uint8_t *addr; si = 0; memset(str, 0, 32); addr = (uint8_t *)&sockaddr->sin_addr.s_addr; for (i = 0; i < 4; i++) { si += snprintf(str + si, 32 - si, i != 3 ? "%d." : "%d", addr[i]); } snprintf(str + si, 32 - si, ":%u", sockaddr->sin_port); return str; } uint8_t *str_to_addr(const char *ipstr, uint8_t *addr) { int addr_start_idx = 0, af = strstr(ipstr, ":") ? AF_INET6 : AF_INET; if (af == AF_INET) { addr_start_idx = 12; memset(addr, 0, addr_start_idx); } if (!inet_pton(af, ipstr, addr + addr_start_idx)) return NULL; return addr; } int set_snd_timeout(int fd) { struct timeval timeout; timeout.tv_sec = POLL_TIMEOUT; timeout.tv_usec = 0; return setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout)); } int set_rcv_timeout(int fd) { struct timeval timeout; /* * We should wait longer for read than write because the target node might be * busy doing IO */ timeout.tv_sec = MAX_POLLTIME; timeout.tv_usec = 0; return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout)); } int set_nodelay(int fd) { int ret, opt; opt = 1; ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof(opt)); return ret; } /* * Timeout after request is issued after 5s. * * Heart-beat message will be sent periodically with 1s interval. 
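 *
 * Spelled out from the setsockopt() calls below: TCP_KEEPIDLE = 5
 * starts probing after 5 idle seconds, TCP_KEEPINTVL = 1 sends one
 * probe per second, and TCP_KEEPCNT = 3 gives up after three
 * unanswered probes. That is about 5 + 3 * 1 = 8 seconds of total
 * silence, of which the final 3 seconds are the probe phase mentioned
 * below.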
* If the node of the other end of fd fails, we'll detect it in 3s */ int set_keepalive(int fd) { int val = 1; if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val)) < 0) { sd_debug("%m"); return -1; } val = 5; if (setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &val, sizeof(val)) < 0) { sd_debug("%m"); return -1; } val = 1; if (setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &val, sizeof(val)) < 0) { sd_debug("%m"); return -1; } val = 3; if (setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &val, sizeof(val)) < 0) { sd_debug("%m"); return -1; } return 0; } int get_local_addr(uint8_t *bytes) { struct ifaddrs *ifaddr, *ifa; int ret = 0; if (getifaddrs(&ifaddr) == -1) { sd_err("getifaddrs failed: %m"); return -1; } for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { struct sockaddr_in *sin; struct sockaddr_in6 *sin6; if (ifa->ifa_flags & IFF_LOOPBACK) continue; if (!ifa->ifa_addr) continue; switch (ifa->ifa_addr->sa_family) { case AF_INET: sin = (struct sockaddr_in *)ifa->ifa_addr; memset(bytes, 0, 12); memcpy(bytes + 12, &sin->sin_addr, 4); sd_notice("found IPv4 address"); goto out; case AF_INET6: sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; memcpy(bytes, &sin6->sin6_addr, 16); sd_notice("found IPv6 address"); goto out; } } sd_err("no valid interface found"); ret = -1; out: freeifaddrs(ifaddr); return ret; } int create_unix_domain_socket(const char *unix_path, int (*callback)(int, void *), void *data) { int fd, ret; struct sockaddr_un addr; addr.sun_family = AF_UNIX; pstrcpy(addr.sun_path, sizeof(addr.sun_path), unix_path); fd = socket(addr.sun_family, SOCK_STREAM, 0); if (fd < 0) { sd_err("failed to create socket, %m"); return -1; } ret = bind(fd, &addr, sizeof(addr)); if (ret) { sd_err("failed to bind socket: %m"); goto err; } ret = listen(fd, SOMAXCONN); if (ret) { sd_err("failed to listen on socket: %m"); goto err; } ret = callback(fd, data); if (ret) goto err; return 0; err: close(fd); return -1; } bool inetaddr_is_valid(char *addr) { unsigned char buf[INET6_ADDRSTRLEN]; int af; af = strstr(addr, ":") ? AF_INET6 : AF_INET; if (!inet_pton(af, addr, buf)) { sd_err("Bad address '%s'", addr); return false; } return true; } int do_writev2(int fd, void *hdr, size_t hdr_len, void *body, size_t body_len) { struct iovec iov[2]; iov[0].iov_base = hdr; iov[0].iov_len = hdr_len; iov[1].iov_base = body; iov[1].iov_len = body_len; return writev(fd, iov, 2); } sheepdog-0.8.3/lib/option.c000066400000000000000000000047011237656255000155570ustar00rootroot00000000000000/* * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see .
*/ #include #include #include #include "option.h" #include "logger.h" char *build_short_options(const struct sd_option *sd_opts) { static char sopts[256], *p; const struct sd_option *opt; p = sopts; sd_for_each_option(opt, sd_opts) { *p++ = opt->ch; if (opt->has_arg) *p++ = ':'; } *p = '\0'; return sopts; } struct option *build_long_options(const struct sd_option *sd_opts) { static struct option lopts[256], *p; const struct sd_option *opt; p = lopts; sd_for_each_option(opt, sd_opts) { p->name = opt->name; p->has_arg = opt->has_arg; p->flag = NULL; p->val = opt->ch; p++; } memset(p, 0, sizeof(struct option)); return lopts; } const char *option_get_help(const struct sd_option *sd_opts, int ch) { const struct sd_option *opt; sd_for_each_option(opt, sd_opts) { if (opt->ch == ch) return opt->help; } return NULL; } int option_parse_size(const char *value, uint64_t *ret) { char *postfix; double sizef; sizef = strtod(value, &postfix); if (postfix[0] != '\0' && postfix[1] != '\0') goto err; switch (*postfix) { case 'P': case 'p': sizef *= 1024; case 'T': case 't': sizef *= 1024; case 'G': case 'g': sizef *= 1024; case 'M': case 'm': sizef *= 1024; case 'K': case 'k': sizef *= 1024; case 'b': case '\0': *ret = (uint64_t) sizef; break; default: err: sd_err("Invalid size '%s'", value); sd_err("You may use k, M, G, T or P suffixes for " "kilobytes, megabytes, gigabytes, terabytes and petabytes."); return -1; } return 0; } int option_parse(char *arg, const char *delim, struct option_parser *parsers) { char *savep, *opt; struct option_parser *iter = NULL; opt = strtok_r(arg, delim, &savep); do { for (iter = parsers; iter->option; iter++) { int len = strlen(iter->option); if (!strncmp(iter->option, opt, len)) { if (iter->parser(opt + len) < 0) return -1; break; } } if (!iter->option) { sd_err("invalid option %s", opt); return -1; } } while ((opt = strtok_r(NULL, delim, &savep))); return 0; } sheepdog-0.8.3/lib/rbtree.c000066400000000000000000000203051237656255000155300ustar00rootroot00000000000000/* Red Black Trees (C) 1999 Andrea Arcangeli (C) 2002 David Woodhouse This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include "rbtree.h" static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) { struct rb_node *right = node->rb_right; struct rb_node *parent = rb_parent(node); node->rb_right = right->rb_left; if (node->rb_right) rb_set_parent(right->rb_left, node); right->rb_left = node; rb_set_parent(right, parent); if (parent) { if (node == parent->rb_left) parent->rb_left = right; else parent->rb_right = right; } else root->rb_node = right; rb_set_parent(node, right); } static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) { struct rb_node *left = node->rb_left; struct rb_node *parent = rb_parent(node); node->rb_left = left->rb_right; if (node->rb_left) rb_set_parent(left->rb_right, node); left->rb_right = node; rb_set_parent(left, parent); if (parent) { if (node == parent->rb_right) parent->rb_right = left; else parent->rb_left = left; } else root->rb_node = left; rb_set_parent(node, left); } void rb_insert_color(struct rb_node *node, struct rb_root *root) { struct rb_node *parent, *gparent; while ((parent = rb_parent(node)) && rb_is_red(parent)) { gparent = rb_parent(parent); if (parent == gparent->rb_left) { register struct rb_node *uncle = gparent->rb_right; if (uncle && rb_is_red(uncle)) { rb_set_black(uncle); rb_set_black(parent); rb_set_red(gparent); node = gparent; continue; } if (parent->rb_right == node) { register struct rb_node *tmp; __rb_rotate_left(parent, root); tmp = parent; parent = node; node = tmp; } rb_set_black(parent); rb_set_red(gparent); __rb_rotate_right(gparent, root); } else { register struct rb_node *uncle = gparent->rb_left; if (uncle && rb_is_red(uncle)) { rb_set_black(uncle); rb_set_black(parent); rb_set_red(gparent); node = gparent; continue; } if (parent->rb_left == node) { register struct rb_node *tmp; __rb_rotate_right(parent, root); tmp = parent; parent = node; node = tmp; } rb_set_black(parent); rb_set_red(gparent); __rb_rotate_left(gparent, root); } } rb_set_black(root->rb_node); } static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, struct rb_root *root) { struct rb_node *other; while ((!node || rb_is_black(node)) && node != root->rb_node) { if (parent->rb_left == node) { other = parent->rb_right; if (rb_is_red(other)) { rb_set_black(other); rb_set_red(parent); __rb_rotate_left(parent, root); other = parent->rb_right; } if ((!other->rb_left || rb_is_black(other->rb_left)) && (!other->rb_right || rb_is_black(other->rb_right))) { rb_set_red(other); node = parent; parent = rb_parent(node); } else { if (!other->rb_right || rb_is_black(other->rb_right)) { rb_set_black(other->rb_left); rb_set_red(other); __rb_rotate_right(other, root); other = parent->rb_right; } rb_set_color(other, rb_color(parent)); rb_set_black(parent); rb_set_black(other->rb_right); __rb_rotate_left(parent, root); node = root->rb_node; break; } } else { other = parent->rb_left; if (rb_is_red(other)) { rb_set_black(other); rb_set_red(parent); __rb_rotate_right(parent, root); other = parent->rb_left; } if ((!other->rb_left || rb_is_black(other->rb_left)) && (!other->rb_right || rb_is_black(other->rb_right))) { rb_set_red(other); node = parent; parent = rb_parent(node); } else { if (!other->rb_left || rb_is_black(other->rb_left)) { rb_set_black(other->rb_right); rb_set_red(other); __rb_rotate_left(other, root); other = parent->rb_left; } 
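/*
 * Terminal case of this mirrored branch: the sibling is black and its
 * left (outer) child is red. Copy the parent's color onto the sibling,
 * blacken the parent and the red nephew, then rotate right at the
 * parent. This restores the missing black on node's side of the tree,
 * so the loop can terminate.
 */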
rb_set_color(other, rb_color(parent)); rb_set_black(parent); rb_set_black(other->rb_left); __rb_rotate_right(parent, root); node = root->rb_node; break; } } } if (node) rb_set_black(node); } void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *child, *parent; int color; if (!node->rb_left) child = node->rb_right; else if (!node->rb_right) child = node->rb_left; else { struct rb_node *old = node, *left; node = node->rb_right; while ((left = node->rb_left) != NULL) node = left; if (rb_parent(old)) { if (rb_parent(old)->rb_left == old) rb_parent(old)->rb_left = node; else rb_parent(old)->rb_right = node; } else root->rb_node = node; child = node->rb_right; parent = rb_parent(node); color = rb_color(node); if (parent == old) parent = node; else { if (child) rb_set_parent(child, parent); parent->rb_left = child; node->rb_right = old->rb_right; rb_set_parent(old->rb_right, node); } node->rb_parent_color = old->rb_parent_color; node->rb_left = old->rb_left; rb_set_parent(old->rb_left, node); goto color; } parent = rb_parent(node); color = rb_color(node); if (child) rb_set_parent(child, parent); if (parent) { if (parent->rb_left == node) parent->rb_left = child; else parent->rb_right = child; } else root->rb_node = child; color: if (color == RB_BLACK) __rb_erase_color(child, parent, root); } /* This function returns the first node (in sort order) of the tree. */ struct rb_node *rb_first(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_left) n = n->rb_left; return n; } struct rb_node *rb_last(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_right) n = n->rb_right; return n; } struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; if (rb_parent(node) == node) return NULL; /* * If we have a right-hand child, go down and then left as far * as we can. */ if (node->rb_right) { node = node->rb_right; while (node->rb_left) node = node->rb_left; return (struct rb_node *)node; } /* * No right-hand children. Everything down and left is * smaller than us, so any 'next' node must be in the general * direction of our parent. Go up the tree; any time the * ancestor is a right-hand child of its parent, keep going * up. First time it's a left-hand child of its parent, said * parent is our 'next' node. */ while ((parent = rb_parent(node)) && node == parent->rb_right) node = parent; return parent; } struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; if (rb_parent(node) == node) return NULL; /* * If we have a left-hand child, go down and then right as far * as we can. */ if (node->rb_left) { node = node->rb_left; while (node->rb_right) node = node->rb_right; return (struct rb_node *)node; } /* * No left-hand children. 
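 * (The logic mirrors rb_next() above with left and right swapped.)
 *
 * A usage sketch for the tree API as a whole; vdi_entry and its
 * ordering are illustrative only, and rb_link_node() is assumed to be
 * the usual inline helper from rbtree.h:
 *
 *	struct vdi_entry { uint32_t vid; struct rb_node rb; };
 *
 *	struct rb_node **p = &root->rb_node, *parent = NULL;
 *	while (*p) {
 *		struct vdi_entry *e =
 *			container_of(*p, struct vdi_entry, rb);
 *		parent = *p;
 *		p = new->vid < e->vid ? &(*p)->rb_left : &(*p)->rb_right;
 *	}
 *	rb_link_node(&new->rb, parent, p);
 *	rb_insert_color(&new->rb, root);
 *
 * Iterate with rb_first()/rb_next(), or rb_last()/rb_prev() for the
 * reverse order. Back to rb_prev(): with no left-hand child,
 * everything smaller than us sits above us, so: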
Go up till we find an ancestor which * is a right-hand child of its parent */ while ((parent = rb_parent(node)) && node == parent->rb_left) node = parent; return parent; } void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root) { struct rb_node *parent = rb_parent(victim); /* Set the surrounding nodes to point to the replacement */ if (parent) { if (victim == parent->rb_left) parent->rb_left = new; else parent->rb_right = new; } else { root->rb_node = new; } if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) rb_set_parent(victim->rb_right, new); /* Copy the pointers/colour from the victim to the replacement */ *new = *victim; } sheepdog-0.8.3/lib/sd_inode.c000066400000000000000000000554661237656255000160510ustar00rootroot00000000000000/* * B-tree is a tree data structure that keeps data sorted and allows searches, * sequential access, insertions, and deletions in logarithmic time. * The B-tree is a generalization of a binary search tree in that a node can * have more than two children. (Comer 1979, p. 123) Unlike self-balancing * binary search trees, the B-tree is optimized for systems that read and * write large blocks of data. (ref: http://en.wikipedia.org/wiki/B-tree) * * In sheepdog, we use space in inode->data_vdi_id[] to store leaf-node at * beginning and store root-node of B-tree when it reach depths of two. * * At beginning, the inode->data_vdi_id[] is storing leaf-node which point * to data-obj directly: * * +------------------+-----------+-----------+--------+ * | sd_extent_header | sd_extent | sd_extent | ...... | * +------------------+-----------+-----------+--------+ * | | * / \ * / \ * / \ * +------------+ <------ ----> +------------+ * | data-obj 1 | | data-obj 2 | * +------------+ +------------+ * * After adding more oid into it, the leaf-node will be full of struct sd_extent * and should be splited to two leaf-nodes, after it, the inode->data_vdi_id[] * should become root-node which store sd_extent_idx and point to the two * leaf-nodes: * * +------------------+-----------------+-----------------+ * | sd_extent_header | sd_extent_idx | sd_extent_idx | * +------------------+-----------------+-----------------+ * | | * / \ * / ------------- * / \ * / \ * / \ * +------------------+-----------+-----------+--------+ +------------------+-----------+-----------+--------+ * | sd_extent_header | sd_extent | sd_extent | ...... | | sd_extent_header | sd_extent | sd_extent | ...... | * +------------------+-----------+-----------+--------+ +------------------+-----------+-----------+--------+ * / \ / \ * +------------+ <------ ---> +------------+ +--------------+ <-- --> +--------------+ * | data-obj 1 | | data-obj 2 | | data-obj 511 | | data-obj 512 | * +------------+ +------------+ +--------------+ +--------------+ * * When a leaf-node is full, we could add a new leaf-node and add a * new sd_extent_idx in root-node to point to it: * * +------------------+-----------------+-----------------+---------------+ * | sd_extent_header | sd_extent_idx | sd_extent_idx | sd_extent_idx | * +------------------+-----------------+-----------------+---------------+ * | | \ * / \ \ (new leaf-node) * / --------- ------ +------------------+-----------+--------+ * / \ | sd_extent_header | sd_extent | ...... | * / \ +------------------+-----------+--------+ * / \ * +------------------+-----------+--------+ +------------------+-----------+--------+ * | sd_extent_header | sd_extent | ...... | | sd_extent_header | sd_extent | ...... 
| * +------------------+-----------+--------+ +------------------+-----------+--------+ * * * As above, the root-node point to leaf-node which point to data-obj * (the implemention of B-tree in sd_inode only support two depth), so it could * store: * * (number of sd_extent_idx in root-node) * (number of sd_extent in leaf-node) * * which is 349524 * 524287 = 183250889388 data-objects (about 680 PB with 4MB data-objs). * */ #include #include "util.h" #include "sheepdog_proto.h" #define EXT_MAX_SPACE (SD_INODE_DATA_INDEX_SIZE - \ sizeof(struct sd_extent_header)) #define EXT_MAX_ENTRIES (EXT_MAX_SPACE / sizeof(struct sd_extent)) #define EXT_IDX_MAX_ENTRIES (EXT_MAX_SPACE / sizeof(struct sd_extent_idx)) #define EXT_HEADER(data) ((struct sd_extent_header *)(data)) #define FIRST_EXT(data) ((struct sd_extent *)((char *)(data) + \ sizeof(struct sd_extent_header))) #define LAST_EXT(data) (FIRST_EXT(data) + EXT_HEADER(data)->entries) #define OFFSET_EXT(data, n) ((char *)(data) + sizeof(struct sd_extent_header) \ + n * sizeof(struct sd_extent)) #define EXT_MAX_IDXS (EXT_MAX_SPACE / sizeof(struct sd_extent_idx)) #define FIRST_IDX(data) ((struct sd_extent_idx *)((char *)(data) + \ sizeof(struct sd_extent_header))) #define LAST_IDX(data) (FIRST_IDX(data) + EXT_HEADER(data)->entries) #define OFFSET_IDX(data, n) ((char *)(data) + sizeof(struct sd_extent_header) \ + n * sizeof(struct sd_extent_idx)) struct find_path { struct sd_extent_idx *p_idx; struct sd_extent *p_ext; struct sd_extent_header *p_ext_header; int depth; }; typedef int (*comp)(void *a, void *b); /* compare function for sd_extent */ static int extent_comp(void *a, void *b) { struct sd_extent *ea = (struct sd_extent *)a; struct sd_extent *eb = (struct sd_extent *)b; if (ea->idx > eb->idx) return 1; else if (ea->idx < eb->idx) return -1; else return 0; } /* compare function for sd_extent_idx */ static int index_comp(void *a, void *b) { struct sd_extent_idx *ia = (struct sd_extent_idx *)a; struct sd_extent_idx *ib = (struct sd_extent_idx *)b; if (ia->idx > ib->idx) return 1; else if (ia->idx < ib->idx) return -1; else return 0; } /* * traverse the whole btree that include all the inode->data_vdi_id, bnode, * data objects and call btree_cb_fn() */ void traverse_btree(read_node_fn reader, const struct sd_inode *inode, btree_cb_fn fn, void *arg) { struct sd_extent_header *header = EXT_HEADER(inode->data_vdi_id); struct sd_extent_header *leaf_node = NULL; struct sd_extent *last, *iter; struct sd_extent_idx *last_idx, *iter_idx; void *tmp; fn(header, BTREE_HEAD, arg); if (header->depth == 1) { last = LAST_EXT(inode->data_vdi_id); iter = FIRST_EXT(inode->data_vdi_id); while (iter != last) { fn(iter, BTREE_EXT, arg); iter++; } } else if (header->depth == 2) { last_idx = LAST_IDX(inode->data_vdi_id); iter_idx = FIRST_IDX(inode->data_vdi_id); leaf_node = xvalloc(SD_INODE_DATA_INDEX_SIZE); tmp = (void *)leaf_node; while (iter_idx != last_idx) { reader(iter_idx->oid, &tmp, SD_INODE_DATA_INDEX_SIZE, 0); fn(iter_idx, BTREE_IDX, arg); fn(leaf_node, BTREE_HEAD, arg); last = LAST_EXT(leaf_node); iter = FIRST_EXT(leaf_node); while (iter != last) { fn(iter, BTREE_EXT, arg); iter++; } iter_idx++; } free(leaf_node); } else panic("This B-tree not support depth %u", header->depth); } #ifdef DEBUG static void dump_cb(void *data, enum btree_node_type type, void *arg) { struct sd_extent_header *header; struct sd_extent *ext; struct sd_extent_idx *idx; switch (type) { case BTREE_HEAD: header = (struct sd_extent_header *)data; sd_info("btree> HEAD: magic %u entries %u 
depth %u", header->magic, header->entries, header->depth); break; case BTREE_EXT: ext = (struct sd_extent *)data; sd_info("btree> EXT: idx %u vdi_id %u", ext->idx, ext->vdi_id); break; case BTREE_IDX: idx = (struct sd_extent_idx *)data; sd_info("btree> IDX: idx %u oid %lu", idx->idx, idx->oid); break; } } #endif /* dump the information of B-tree */ static void dump_btree(read_node_fn reader, struct sd_inode *inode) { #ifdef DEBUG sd_info("btree> BEGIN"); traverse_btree(reader, inode, dump_cb, NULL); sd_info("btree> END"); #endif } /* * Search for the key in a B-tree node. If can't find it, return the position * for insert operation. So we can't just use xbsearch(). */ static void *binary_search(void *first, void *last, void *key, size_t obj_size, comp cmp) { const char *l, *r, *m; int ret; l = (const char *)first; r = (const char *)last - obj_size; while (l <= r) { m = l + ((r - l) / obj_size / 2) * obj_size; ret = cmp((void *)key, (void *)m); if (ret < 0) r = m - obj_size; else if (ret > 0) l = m + obj_size; else return (void *)m; } return (void *)l; } void sd_inode_init(void *data, int depth) { struct sd_extent_header *header = EXT_HEADER(data); header->magic = INODE_BTREE_MAGIC; header->depth = depth; header->entries = 0; } /* check whether ext is in this node */ static bool ext_in_range(struct sd_extent_header *header, struct sd_extent *ext) { struct sd_extent *last = LAST_EXT(header); if (last - ext > 0) return true; return false; } /* check whether idx is in this node */ static bool idx_in_range(struct sd_extent_header *header, struct sd_extent_idx *idx) { struct sd_extent_idx *last = LAST_IDX(header); if (last - idx > 0) return true; return false; } /* search idx in leaf-node */ static struct sd_extent *search_ext_entry(struct sd_extent_header *header, uint32_t idx) { struct sd_extent tmp; tmp.idx = idx; return binary_search(FIRST_EXT(header), LAST_EXT(header), &tmp, sizeof(struct sd_extent), extent_comp); } /* search idx in middle-node */ static struct sd_extent_idx *search_idx_entry(struct sd_extent_header *header, uint32_t idx) { struct sd_extent_idx tmp; tmp.idx = idx; return binary_search(FIRST_IDX(header), LAST_IDX(header), &tmp, sizeof(struct sd_extent_idx), index_comp); } static void insert_ext_entry_nosearch(struct sd_extent_header *header, struct sd_extent *ext, uint32_t idx, uint32_t vdi_id) { struct sd_extent *last = LAST_EXT(header); memmove(ext + 1, ext, (last - ext) * sizeof(struct sd_extent)); ext->idx = idx; ext->vdi_id = vdi_id; header->entries++; } static void insert_idx_entry_nosearch(struct sd_extent_header *header, struct sd_extent_idx *idx_ext, uint32_t idx, uint64_t oid) { struct sd_extent_idx *last = LAST_IDX(header); memmove(idx_ext + 1, idx_ext, (last - idx_ext) * sizeof(struct sd_extent_idx)); idx_ext->idx = idx; idx_ext->oid = oid; header->entries++; } static void insert_idx_entry(struct sd_extent_header *header, uint32_t idx, uint64_t oid) { struct sd_extent_idx *found; if (header->entries >= EXT_MAX_IDXS) goto out; if (!header->entries) { FIRST_IDX(header)->idx = idx; FIRST_IDX(header)->oid = oid; header->entries++; goto out; } found = search_idx_entry(header, idx); insert_idx_entry_nosearch(header, found, idx, oid); out: return; } static void split_to_nodes(struct sd_extent_header *src, struct sd_extent_header *left, struct sd_extent_header *right, int num) { memcpy(left, src, sizeof(struct sd_extent_header) + num * sizeof(struct sd_extent)); left->entries = num; mempcpy(right, src, sizeof(struct sd_extent_header)); mempcpy(FIRST_EXT(right), 
OFFSET_EXT(src, num), (src->entries - num) * sizeof(struct sd_extent)); right->entries = src->entries - num; } /* * The meta-data in inode is leaf-node at beginning, but after inserting too * much sd_extent it will be full. When sd_extents is full, we need to create * two new nodes, move sd_extents from inode to them and finally, let inode * point to them. */ static void transfer_to_idx_root(write_node_fn writer, struct sd_inode *inode) { struct sd_extent_header *left; struct sd_extent_header *right; struct sd_extent_header *root = EXT_HEADER(inode->data_vdi_id); uint64_t left_oid, right_oid; uint32_t num = root->entries / 2; /* create two leaf-node and copy the entries from root-node */ left = xvalloc(SD_INODE_DATA_INDEX_SIZE); right = xvalloc(SD_INODE_DATA_INDEX_SIZE); split_to_nodes(root, left, right, num); /* write two nodes back */ left_oid = vid_to_btree_oid(inode->vdi_id, inode->btree_counter++); right_oid = vid_to_btree_oid(inode->vdi_id, inode->btree_counter++); writer(left_oid, left, SD_INODE_DATA_INDEX_SIZE, 0, 0, inode->nr_copies, inode->copy_policy, true, false); writer(right_oid, right, SD_INODE_DATA_INDEX_SIZE, 0, 0, inode->nr_copies, inode->copy_policy, true, false); /* change root from ext-node to idx-node */ root->entries = 0; root->depth = 2; insert_idx_entry(root, (LAST_EXT(left) - 1)->idx, left_oid); insert_idx_entry(root, (LAST_EXT(right) - 1)->idx, right_oid); free(left); free(right); } /* * Search whole btree for 'idx'. * Return available position (could insert new sd_extent) if can't find 'idx'. */ static int search_whole_btree(read_node_fn reader, const struct sd_inode *inode, uint32_t idx, struct find_path *path) { struct sd_extent_header *header, *leaf_node; void *tmp; uint64_t oid; int ret = SD_RES_BTREE_NOT_FOUND; header = EXT_HEADER(inode->data_vdi_id); /* root is idx-node */ if (header->depth == 2) { path->depth = 2; path->p_idx = search_idx_entry(header, idx); leaf_node = xvalloc(SD_INODE_DATA_INDEX_SIZE); tmp = (void *)leaf_node; if (idx_in_range(header, path->p_idx)) { oid = path->p_idx->oid; ret = reader(oid, &tmp, SD_INODE_DATA_INDEX_SIZE, 0); if (ret != SD_RES_SUCCESS) goto out; path->p_ext = search_ext_entry(leaf_node, idx); path->p_ext_header = leaf_node; if (ext_in_range(leaf_node, path->p_ext) && path->p_ext->idx == idx) ret = SD_RES_BTREE_FOUND; } else { /* check if last idx-node has space */ oid = (path->p_idx - 1)->oid; ret = reader(oid, &tmp, SD_INODE_DATA_INDEX_SIZE, 0); if (ret != SD_RES_SUCCESS) goto out; if (leaf_node->entries < EXT_MAX_ENTRIES) { path->p_ext = search_ext_entry(leaf_node, idx); path->p_ext_header = leaf_node; } } } else if (header->depth == 1) { path->depth = 1; path->p_ext = search_ext_entry(header, idx); if (ext_in_range(header, path->p_ext) && path->p_ext->idx == idx) ret = SD_RES_BTREE_FOUND; else ret = SD_RES_BTREE_NOT_FOUND; } out: return ret; } uint32_t sd_inode_get_vid(read_node_fn reader, const struct sd_inode *inode, uint32_t idx) { struct find_path path; int ret; if (inode->store_policy == 0) return inode->data_vdi_id[idx]; else { /* btree is not init, so vdi is 0 */ if (inode->data_vdi_id[0] == 0) return 0; memset(&path, 0, sizeof(path)); ret = search_whole_btree(reader, inode, idx, &path); if (ret == SD_RES_BTREE_FOUND) return path.p_ext->vdi_id; if (path.p_ext_header) free(path.p_ext_header); } return 0; } /* * When the leaf-node is full, we need to create a new node and * move half of the data into new one. 
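 *
 * Concretely: the node is split at num = entries / 2, the first half
 * moves into a freshly allocated btree object, the old node keeps the
 * remaining entries and is rewritten in place under its existing oid,
 * and a new sd_extent_idx pointing at the new node is inserted into
 * the root. The caller then sees SD_RES_BTREE_REPEAT and retries the
 * insert.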
*/ static void split_ext_node(write_node_fn writer, struct sd_inode *inode, struct find_path *path) { struct sd_extent_header *old = path->p_ext_header, *new_ext; uint32_t num = old->entries / 2; uint64_t new_oid; new_ext = xvalloc(SD_INODE_DATA_INDEX_SIZE); split_to_nodes(old, new_ext, old, num); new_oid = vid_to_btree_oid(inode->vdi_id, inode->btree_counter++); writer(new_oid, new_ext, SD_INODE_DATA_INDEX_SIZE, 0, 0, inode->nr_copies, inode->copy_policy, true, false); writer(path->p_idx->oid, old, SD_INODE_DATA_INDEX_SIZE, 0, 0, inode->nr_copies, inode->copy_policy, false, false); /* write new index */ insert_idx_entry(EXT_HEADER(inode->data_vdi_id), LAST_EXT(new_ext)->idx, new_oid); free(new_ext); } /* * Add new 'idx' and 'vdi_id' pair into leaf-node if depth equal 1 and * add new leaf-node if there is no room for new 'idx' and 'vdi_id' pair. */ static int insert_new_node(write_node_fn writer, read_node_fn reader, struct sd_inode *inode, struct find_path *path, uint32_t idx, uint32_t vdi_id) { struct sd_extent_header *header = EXT_HEADER(inode->data_vdi_id); struct sd_extent_header *leaf_node = NULL; uint64_t oid; int ret = SD_RES_SUCCESS; if (path->depth == 1) { if (header->entries >= EXT_MAX_ENTRIES) { transfer_to_idx_root(writer, inode); ret = SD_RES_BTREE_REPEAT; goto out; } insert_ext_entry_nosearch(header, path->p_ext, idx, vdi_id); } else if (path->depth == 2) { if (idx_in_range(header, path->p_idx)) { if (!path->p_ext_header) { ret = SD_RES_BTREE_NOT_FOUND; goto out; } if (path->p_ext_header->entries >= EXT_MAX_ENTRIES) { split_ext_node(writer, inode, path); ret = SD_RES_BTREE_REPEAT; goto out; } insert_ext_entry_nosearch(path->p_ext_header, path->p_ext, idx, vdi_id); writer(path->p_idx->oid, path->p_ext_header, SD_INODE_DATA_INDEX_SIZE, 0, 0, inode->nr_copies, inode->copy_policy, false, false); } else if (path->p_ext_header) { /* the last idx-node */ insert_ext_entry_nosearch(path->p_ext_header, path->p_ext, idx, vdi_id); path->p_idx--; path->p_idx->idx = (LAST_EXT(path->p_ext_header) - 1)->idx; writer(path->p_idx->oid, path->p_ext_header, SD_INODE_DATA_INDEX_SIZE, 0, 0, inode->nr_copies, inode->copy_policy, false, false); } else { /* if btree is full, then panic */ if (header->entries >= EXT_IDX_MAX_ENTRIES) panic("%s() B-tree is full!", __func__); /* create a new ext-node */ leaf_node = xvalloc(SD_INODE_DATA_INDEX_SIZE); sd_inode_init(leaf_node, 2); oid = vid_to_btree_oid(inode->vdi_id, inode->btree_counter++); insert_ext_entry_nosearch(leaf_node, FIRST_EXT(leaf_node), idx, vdi_id); writer(oid, leaf_node, SD_INODE_DATA_INDEX_SIZE, 0, 0, inode->nr_copies, inode->copy_policy, true, false); insert_idx_entry_nosearch(header, path->p_idx, idx, oid); } } out: if (leaf_node) free(leaf_node); return ret; } void sd_inode_set_vid(write_node_fn writer, read_node_fn reader, struct sd_inode *inode, uint32_t idx, uint32_t vdi_id) { struct sd_extent_header *header; struct find_path path; uint64_t offset; int ret; path.p_ext_header = NULL; if (inode->store_policy == 0) inode->data_vdi_id[idx] = vdi_id; else { if (inode->data_vdi_id[0] == 0) sd_inode_init(inode->data_vdi_id, 1); header = EXT_HEADER(inode->data_vdi_id); if (header->magic != INODE_BTREE_MAGIC) panic("%s() B-tree in inode is corrupt!", __func__); while (1) { memset(&path, 0, sizeof(path)); ret = search_whole_btree(reader, inode, idx, &path); if (ret == SD_RES_BTREE_FOUND) { path.p_ext->vdi_id = vdi_id; /* * Only write the vdi_id in sd_extent for * second level leaf-node. 
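 *
 * That is, instead of rewriting the whole SD_INODE_DATA_INDEX_SIZE
 * leaf object, compute the byte offset of this extent's vdi_id field
 * within the node and issue a sizeof(vdi_id) == 4 byte write at that
 * offset; that is what the offsetof() arithmetic below does.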
*/ if (!path.p_ext_header) goto out; offset = (unsigned char *)(path.p_ext) - (unsigned char *)(path.p_ext_header) + offsetof(struct sd_extent, vdi_id); writer(path.p_idx->oid, &vdi_id, sizeof(vdi_id), offset, 0, inode->nr_copies, inode->copy_policy, false, false); goto out; } else { ret = insert_new_node(writer, reader, inode, &path, idx, vdi_id); if (SD_RES_BTREE_REPEAT == ret) { if (path.p_ext_header) free(path.p_ext_header); continue; } else goto out; } } } out: if (path.p_ext_header) free(path.p_ext_header); if (inode->store_policy != 0) dump_btree(reader, inode); } /* * Return the size of meta-data in inode->data_vdi_id. When leaf-node of B-tree * is not full, we don't need to read out all sizeof(sd_inode). * The argument of 'size' is just for compatibility of parse_vdi(). */ uint32_t sd_inode_get_meta_size(struct sd_inode *inode, size_t size) { struct sd_extent_header *header; uint32_t len; if (inode->store_policy == 0) { len = count_data_objs(inode) * sizeof(inode->data_vdi_id[0]); if (len > size - SD_INODE_HEADER_SIZE - sizeof(uint32_t)) len = size - SD_INODE_HEADER_SIZE - sizeof(uint32_t); } else { header = EXT_HEADER(inode->data_vdi_id); len = sizeof(struct sd_extent_header); if (header->depth == 1) len += sizeof(struct sd_extent) * header->entries; else if (header->depth == 2) len += sizeof(struct sd_extent_idx) * header->entries; else panic("Depth of B-tree is out of range(depth: %u)", header->depth); } return len; } /* Write the whole meta-data of inode out */ int sd_inode_write(write_node_fn writer, struct sd_inode *inode, int flags, bool create, bool direct) { uint32_t len; int ret; if (inode->store_policy == 0) ret = writer(vid_to_vdi_oid(inode->vdi_id), inode, SD_INODE_HEADER_SIZE, 0, flags, inode->nr_copies, inode->copy_policy, create, direct); else { len = SD_INODE_HEADER_SIZE + sd_inode_get_meta_size(inode, 0); ret = writer(vid_to_vdi_oid(inode->vdi_id), inode, len, 0, flags, inode->nr_copies, inode->copy_policy, create, false); if (ret != SD_RES_SUCCESS) goto out; ret = writer(vid_to_vdi_oid(inode->vdi_id), inode, sizeof(uint32_t), offsetof(struct sd_inode, btree_counter), flags, inode->nr_copies, inode->copy_policy, create, false); } out: return ret; } /* Write the meta-data of inode out */ int sd_inode_write_vid(write_node_fn writer, struct sd_inode *inode, uint32_t idx, uint32_t vid, uint32_t value, int flags, bool create, bool direct) { int ret = SD_RES_SUCCESS; if (inode->store_policy == 0) ret = writer(vid_to_vdi_oid(vid), &value, sizeof(value), SD_INODE_HEADER_SIZE + sizeof(value) * idx, flags, inode->nr_copies, inode->copy_policy, create, direct); else { /* * For btree type sd_inode, we only have to write all * meta-data of sd_inode out. 
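 *
 * (A targeted single-entry write is not enough here, because setting a
 * vid may have reshaped the tree: new btree nodes allocated and
 * btree_counter bumped. Flushing the whole header plus index area via
 * sd_inode_write() keeps everything consistent.)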
*/ ret = sd_inode_write(writer, inode, flags, create, direct); } return ret; } void sd_inode_copy_vdis(write_node_fn writer, read_node_fn reader, uint32_t *data_vdi_id, uint8_t store_policy, uint8_t nr_copies, uint8_t copy_policy, struct sd_inode *newi) { struct sd_extent_header *header = EXT_HEADER(data_vdi_id); struct sd_extent_header *leaf_node; struct sd_extent_idx *last_idx, *old_iter_idx, *new_iter_idx; uint64_t oid; void *tmp; memcpy(newi->data_vdi_id, data_vdi_id, sizeof(newi->data_vdi_id)); if (store_policy == 1 && header->depth > 1) { /* for B-tree (> 1 level), it needs to copy all leaf-node */ last_idx = LAST_IDX(data_vdi_id); old_iter_idx = FIRST_IDX(data_vdi_id); new_iter_idx = FIRST_IDX(newi->data_vdi_id); leaf_node = xvalloc(SD_INODE_DATA_INDEX_SIZE); tmp = (void *)leaf_node; while (old_iter_idx != last_idx) { reader(old_iter_idx->oid, &tmp, SD_INODE_DATA_INDEX_SIZE, 0); oid = vid_to_btree_oid(newi->vdi_id, newi->btree_counter++); writer(oid, leaf_node, SD_INODE_DATA_INDEX_SIZE, 0, 0, nr_copies, copy_policy, true, false); new_iter_idx->oid = oid; old_iter_idx++; new_iter_idx++; } free(leaf_node); } } sheepdog-0.8.3/lib/sha1.c000066400000000000000000000233601237656255000151050ustar00rootroot00000000000000/* * Cryptographic API. * * SHA1 Secure Hash Algorithm. * * Derived from cryptoapi implementation, adapted for in-place * scatterlist interface. Originally based on the public domain * implementation written by Steve Reid. * * Copyright (c) Alan Smithee. * Copyright (c) Andrew McDonald * Copyright (c) Jean-Francois Dive * * Add x86 hardware acceleration by Liu Yuan * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #include #include "sha1.h" #include "util.h" #define SHA1_H0 0x67452301UL #define SHA1_H1 0xefcdab89UL #define SHA1_H2 0x98badcfeUL #define SHA1_H3 0x10325476UL #define SHA1_H4 0xc3d2e1f0UL sha1_init_func_t sha1_init; sha1_update_func_t sha1_update; sha1_final_func_t sha1_final; static __always_inline uint32_t rol(uint32_t value, uint32_t bits) { return (value << bits) | (value >> (32 - bits)); } /* blk0() and blk() perform the initial expand. */ /* I got the idea of expanding during the round function from SSLeay */ # define blk0(i) block32[i] #define blk(i) \ (block32[i & 15] = rol(block32[(i + 13) & 15] ^ block32[(i + 8) & 15] \ ^ block32[(i + 2) & 15] ^ block32[i & 15], 1)) /* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ #define R0(v, w, x, y, z, i) \ z += ((w & (x ^ y)) ^ y) + blk0(i) + 0x5A827999 + rol(v, 5); \ w = rol(w, 30); #define R1(v, w, x, y, z, i) \ z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + rol(v, 5); \ w = rol(w, 30); #define R2(v, w, x, y, z, i) \ z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + rol(v, 5); \ w = rol(w, 30); #define R3(v, w, x, y, z, i) \ z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + rol(v, 5); \ w = rol(w, 30); #define R4(v, w, x, y, z, i) \ z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + rol(v, 5); \ w = rol(w, 30); /* Hash a single 512-bit block. This is the core of the algorithm. 
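 *
 * The 80 rounds below are unrolled in four groups of 20, one per round
 * constant: R0/R1 use 0x5A827999 (R0 reads the message words directly
 * via blk0(), R1 switches to the in-place schedule expansion blk()),
 * R2 uses 0x6ED9EBA1, R3 uses 0x8F1BBCDC and R4 uses 0xCA62C1D6.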
*/ static void sha1_transform(uint32_t *state, const uint8_t *in) { uint32_t a, b, c, d, e; uint32_t block32[16]; /* convert/copy data to workspace */ for (a = 0; a < sizeof(block32)/sizeof(uint32_t); a++) block32[a] = ntohl(((const uint32_t *)in)[a]); /* Copy context->state[] to working vars */ a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4]; /* 4 rounds of 20 operations each. Loop unrolled. */ R0(a, b, c, d, e, 0); R0(e, a, b, c, d, 1); R0(d, e, a, b, c, 2); R0(c, d, e, a, b, 3); R0(b, c, d, e, a, 4); R0(a, b, c, d, e, 5); R0(e, a, b, c, d, 6); R0(d, e, a, b, c, 7); R0(c, d, e, a, b, 8); R0(b, c, d, e, a, 9); R0(a, b, c, d, e, 10); R0(e, a, b, c, d, 11); R0(d, e, a, b, c, 12); R0(c, d, e, a, b, 13); R0(b, c, d, e, a, 14); R0(a, b, c, d, e, 15); R1(e, a, b, c, d, 16); R1(d, e, a, b, c, 17); R1(c, d, e, a, b, 18); R1(b, c, d, e, a, 19); R2(a, b, c, d, e, 20); R2(e, a, b, c, d, 21); R2(d, e, a, b, c, 22); R2(c, d, e, a, b, 23); R2(b, c, d, e, a, 24); R2(a, b, c, d, e, 25); R2(e, a, b, c, d, 26); R2(d, e, a, b, c, 27); R2(c, d, e, a, b, 28); R2(b, c, d, e, a, 29); R2(a, b, c, d, e, 30); R2(e, a, b, c, d, 31); R2(d, e, a, b, c, 32); R2(c, d, e, a, b, 33); R2(b, c, d, e, a, 34); R2(a, b, c, d, e, 35); R2(e, a, b, c, d, 36); R2(d, e, a, b, c, 37); R2(c, d, e, a, b, 38); R2(b, c, d, e, a, 39); R3(a, b, c, d, e, 40); R3(e, a, b, c, d, 41); R3(d, e, a, b, c, 42); R3(c, d, e, a, b, 43); R3(b, c, d, e, a, 44); R3(a, b, c, d, e, 45); R3(e, a, b, c, d, 46); R3(d, e, a, b, c, 47); R3(c, d, e, a, b, 48); R3(b, c, d, e, a, 49); R3(a, b, c, d, e, 50); R3(e, a, b, c, d, 51); R3(d, e, a, b, c, 52); R3(c, d, e, a, b, 53); R3(b, c, d, e, a, 54); R3(a, b, c, d, e, 55); R3(e, a, b, c, d, 56); R3(d, e, a, b, c, 57); R3(c, d, e, a, b, 58); R3(b, c, d, e, a, 59); R4(a, b, c, d, e, 60); R4(e, a, b, c, d, 61); R4(d, e, a, b, c, 62); R4(c, d, e, a, b, 63); R4(b, c, d, e, a, 64); R4(a, b, c, d, e, 65); R4(e, a, b, c, d, 66); R4(d, e, a, b, c, 67); R4(c, d, e, a, b, 68); R4(b, c, d, e, a, 69); R4(a, b, c, d, e, 70); R4(e, a, b, c, d, 71); R4(d, e, a, b, c, 72); R4(c, d, e, a, b, 73); R4(b, c, d, e, a, 74); R4(a, b, c, d, e, 75); R4(e, a, b, c, d, 76); R4(d, e, a, b, c, 77); R4(c, d, e, a, b, 78); R4(b, c, d, e, a, 79); /* Add the working vars back into context.state[] */ state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; /* Wipe variables */ a = b = c = d = e = 0; memset(block32, 0x00, sizeof block32); } static void generic_sha1_init(void *ctx) { struct sha1_ctx *sctx = ctx; *sctx = (struct sha1_ctx){ .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, }; } static void generic_sha1_update(void *ctx, const uint8_t *data, unsigned int len) { struct sha1_ctx *sctx = ctx; unsigned int i, j; j = (sctx->count >> 3) & 0x3f; sctx->count += len << 3; if ((j + len) > 63) { memcpy(&sctx->buffer[j], data, (i = 64-j)); sha1_transform(sctx->state, sctx->buffer); for ( ; i + 63 < len; i += 64) sha1_transform(sctx->state, &data[i]); j = 0; } else i = 0; memcpy(&sctx->buffer[j], &data[i], len - i); } /* Add padding and return the message digest. 
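 *
 * The padding scheme in short: append the 0x80 byte, zero-fill until
 * the length is congruent to 56 mod 64, then append the original bit
 * count as a 64-bit big-endian value. For example, a 3-byte message
 * ends at offset 3, so padlen = 56 - 3 = 53 and the 8 length bytes
 * complete a single 64-byte block.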
*/ static void generic_sha1_final(void *ctx, uint8_t *out) { struct sha1_ctx *sctx = ctx; uint32_t i, j, idx, padlen; uint64_t t; uint8_t bits[8] = { 0, }; static const uint8_t padding[64] = { 0x80, }; t = sctx->count; bits[7] = 0xff & t; t >>= 8; bits[6] = 0xff & t; t >>= 8; bits[5] = 0xff & t; t >>= 8; bits[4] = 0xff & t; t >>= 8; bits[3] = 0xff & t; t >>= 8; bits[2] = 0xff & t; t >>= 8; bits[1] = 0xff & t; t >>= 8; bits[0] = 0xff & t; /* Pad out to 56 mod 64 */ idx = (sctx->count >> 3) & 0x3f; padlen = (idx < 56) ? (56 - idx) : ((64+56) - idx); generic_sha1_update(sctx, padding, padlen); /* Append length */ generic_sha1_update(sctx, bits, sizeof bits); /* Store state in digest */ for (i = j = 0; i < 5; i++, j += 4) { uint32_t t2 = sctx->state[i]; out[j+3] = t2 & 0xff; t2 >>= 8; out[j+2] = t2 & 0xff; t2 >>= 8; out[j+1] = t2 & 0xff; t2 >>= 8; out[j] = t2 & 0xff; } /* Wipe context */ memset(sctx, 0, sizeof *sctx); } #ifdef __x86_64__ static asmlinkage void (*sha1_transform_asm)(uint32_t *, const uint8_t *, unsigned int); asmlinkage void sha1_transform_ssse3(uint32_t *, const uint8_t *, unsigned int); asmlinkage void sha1_transform_avx(uint32_t *, const uint8_t *, unsigned int); static void do_ssse3_sha1_update(struct sha1_ctx *ctx, const uint8_t *data, unsigned int len, unsigned int partial) { struct sha1_ctx *sctx = ctx; unsigned int done = 0; sctx->count += len; if (partial) { done = SHA1_BLOCK_SIZE - partial; memcpy(sctx->buffer + partial, data, done); sha1_transform_asm(sctx->state, sctx->buffer, 1); } if (len - done >= SHA1_BLOCK_SIZE) { const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; sha1_transform_asm(sctx->state, data + done, rounds); done += rounds * SHA1_BLOCK_SIZE; } memcpy(sctx->buffer, data + done, len - done); return; } static void ssse3_sha1_update(void *ctx, const uint8_t *data, unsigned int len) { struct sha1_ctx *sctx = ctx; unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; /* Handle the fast case right here */ if (partial + len < SHA1_BLOCK_SIZE) { sctx->count += len; memcpy(sctx->buffer + partial, data, len); return; } do_ssse3_sha1_update(ctx, data, len, partial); } /* Add padding and return the message digest. */ static void ssse3_sha1_final(void *ctx, uint8_t *out) { struct sha1_ctx *sctx = ctx; unsigned int i, j, idx, padlen; uint64_t t; uint8_t bits[8] = { 0, }; static const uint8_t padding[SHA1_BLOCK_SIZE] = { 0x80, }; t = sctx->count << 3; bits[7] = 0xff & t; t >>= 8; bits[6] = 0xff & t; t >>= 8; bits[5] = 0xff & t; t >>= 8; bits[4] = 0xff & t; t >>= 8; bits[3] = 0xff & t; t >>= 8; bits[2] = 0xff & t; t >>= 8; bits[1] = 0xff & t; t >>= 8; bits[0] = 0xff & t; /* Pad out to 56 mod 64 and append length */ idx = sctx->count % SHA1_BLOCK_SIZE; padlen = (idx < 56) ? 
(56 - idx) : ((SHA1_BLOCK_SIZE+56) - idx); /* We need to fill a whole block for do_ssse3_sha1_update() */ if (padlen <= 56) { sctx->count += padlen; memcpy(sctx->buffer + idx, padding, padlen); } else { do_ssse3_sha1_update(ctx, padding, padlen, idx); } do_ssse3_sha1_update(ctx, (const uint8_t *)&bits, sizeof(bits), 56); /* Store state in digest */ for (i = j = 0; i < 5; i++, j += 4) { uint32_t t2 = sctx->state[i]; out[j+3] = t2 & 0xff; t2 >>= 8; out[j+2] = t2 & 0xff; t2 >>= 8; out[j+1] = t2 & 0xff; t2 >>= 8; out[j] = t2 & 0xff; } /* Wipe context */ memset(sctx, 0, sizeof(*sctx)); return; } static bool avx_usable(void) { uint64_t xcr0; if (!cpu_has_avx || !cpu_has_osxsave) return false; xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) return false; return true; } #endif const char *sha1_to_hex(const unsigned char *sha1) { static __thread char buffer[50]; static const char hex[] = "0123456789abcdef"; char *buf = buffer; int i; for (i = 0; i < SHA1_DIGEST_SIZE; i++) { unsigned int val = *sha1++; *buf++ = hex[val >> 4]; *buf++ = hex[val & 0xf]; } return buffer; } void get_buffer_sha1(unsigned char *buf, unsigned len, unsigned char *sha1) { struct sha1_ctx c; sha1_init(&c); sha1_update(&c, buf, len); sha1_final(&c, sha1); } static void __attribute__((constructor)) __sha1_init(void) { sha1_init = generic_sha1_init; sha1_update = generic_sha1_update; sha1_final = generic_sha1_final; #ifdef __x86_64__ if (cpu_has_ssse3) sha1_transform_asm = sha1_transform_ssse3; else return; if (avx_usable()) sha1_transform_asm = sha1_transform_avx; sha1_update = ssse3_sha1_update; sha1_final = ssse3_sha1_final; #endif } sheepdog-0.8.3/lib/sha1_ssse3.S000066400000000000000000000271151237656255000162070ustar00rootroot00000000000000/* * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental * SSE3 instruction set extensions introduced in Intel Core Microarchitecture * processors. CPUs supporting Intel(R) AVX extensions will get an additional * boost. * * This work was inspired by the vectorized implementation of Dean Gaudet. * Additional information on it can be found at: * http://www.arctic.org/~dean/crypto/sha1.html * * It was improved upon with more efficient vectorization of the message * scheduling. This implementation has also been optimized for all current and * several future generations of Intel CPUs. * * See this article for more information about the implementation details: * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ * * Copyright (C) 2010, Intel Corp. * Authors: Maxim Locktyukhin * Ronen Zohar * * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: * Author: Mathias Krause * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
*/ #define _ALIGN_TEXT .align 16, 0x90 #define ENTRY(x) \ .text; _ALIGN_TEXT; .globl x; .type x,@function; x: #define END(name) \ .size name, .-name #define ENDPROC(name) \ .type name, @function; \ END(name) #define CTX %rdi // arg1 #define BUF %rsi // arg2 #define CNT %rdx // arg3 #define REG_A %ecx #define REG_B %esi #define REG_C %edi #define REG_D %ebp #define REG_E %edx #define REG_T1 %eax #define REG_T2 %ebx #define K_BASE %r8 #define HASH_PTR %r9 #define BUFFER_PTR %r10 #define BUFFER_END %r11 #define W_TMP1 %xmm0 #define W_TMP2 %xmm9 #define W0 %xmm1 #define W4 %xmm2 #define W8 %xmm3 #define W12 %xmm4 #define W16 %xmm5 #define W20 %xmm6 #define W24 %xmm7 #define W28 %xmm8 #define XMM_SHUFB_BSWAP %xmm10 /* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ #define WK(t) (((t) & 15) * 4)(%rsp) #define W_PRECALC_AHEAD 16 /* * This macro implements the SHA-1 function's body for single 64-byte block * param: function's name */ .macro SHA1_VECTOR_ASM name ENTRY(\name) push %rbx push %rbp push %r12 mov %rsp, %r12 sub $64, %rsp # allocate workspace and $~15, %rsp # align stack mov CTX, HASH_PTR mov BUF, BUFFER_PTR shl $6, CNT # multiply by 64 add BUF, CNT mov CNT, BUFFER_END lea K_XMM_AR(%rip), K_BASE xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP SHA1_PIPELINED_MAIN_BODY # cleanup workspace mov $8, %ecx mov %rsp, %rdi xor %rax, %rax rep stosq mov %r12, %rsp # deallocate workspace pop %r12 pop %rbp pop %rbx ret ENDPROC(\name) .endm /* * This macro implements 80 rounds of SHA-1 for one 64-byte block */ .macro SHA1_PIPELINED_MAIN_BODY INIT_REGALLOC mov (HASH_PTR), A mov 4(HASH_PTR), B mov 8(HASH_PTR), C mov 12(HASH_PTR), D mov 16(HASH_PTR), E .set i, 0 .rept W_PRECALC_AHEAD W_PRECALC i .set i, (i+1) .endr .align 4 1: RR F1,A,B,C,D,E,0 RR F1,D,E,A,B,C,2 RR F1,B,C,D,E,A,4 RR F1,E,A,B,C,D,6 RR F1,C,D,E,A,B,8 RR F1,A,B,C,D,E,10 RR F1,D,E,A,B,C,12 RR F1,B,C,D,E,A,14 RR F1,E,A,B,C,D,16 RR F1,C,D,E,A,B,18 RR F2,A,B,C,D,E,20 RR F2,D,E,A,B,C,22 RR F2,B,C,D,E,A,24 RR F2,E,A,B,C,D,26 RR F2,C,D,E,A,B,28 RR F2,A,B,C,D,E,30 RR F2,D,E,A,B,C,32 RR F2,B,C,D,E,A,34 RR F2,E,A,B,C,D,36 RR F2,C,D,E,A,B,38 RR F3,A,B,C,D,E,40 RR F3,D,E,A,B,C,42 RR F3,B,C,D,E,A,44 RR F3,E,A,B,C,D,46 RR F3,C,D,E,A,B,48 RR F3,A,B,C,D,E,50 RR F3,D,E,A,B,C,52 RR F3,B,C,D,E,A,54 RR F3,E,A,B,C,D,56 RR F3,C,D,E,A,B,58 add $64, BUFFER_PTR # move to the next 64-byte block cmp BUFFER_END, BUFFER_PTR # if the current is the last one use cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun RR F4,A,B,C,D,E,60 RR F4,D,E,A,B,C,62 RR F4,B,C,D,E,A,64 RR F4,E,A,B,C,D,66 RR F4,C,D,E,A,B,68 RR F4,A,B,C,D,E,70 RR F4,D,E,A,B,C,72 RR F4,B,C,D,E,A,74 RR F4,E,A,B,C,D,76 RR F4,C,D,E,A,B,78 UPDATE_HASH (HASH_PTR), A UPDATE_HASH 4(HASH_PTR), B UPDATE_HASH 8(HASH_PTR), C UPDATE_HASH 12(HASH_PTR), D UPDATE_HASH 16(HASH_PTR), E RESTORE_RENAMED_REGS cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end jne 1b .endm .macro INIT_REGALLOC .set A, REG_A .set B, REG_B .set C, REG_C .set D, REG_D .set E, REG_E .set T1, REG_T1 .set T2, REG_T2 .endm .macro RESTORE_RENAMED_REGS # order is important (REG_C is where it should be) mov B, REG_B mov D, REG_D mov A, REG_A mov E, REG_E .endm .macro SWAP_REG_NAMES a, b .set _T, \a .set \a, \b .set \b, _T .endm .macro F1 b, c, d mov \c, T1 SWAP_REG_NAMES \c, T1 xor \d, T1 and \b, T1 xor \d, T1 .endm .macro F2 b, c, d mov \d, T1 SWAP_REG_NAMES \d, T1 xor \c, T1 xor \b, T1 .endm .macro F3 b, c ,d mov \c, T1 SWAP_REG_NAMES \c, T1 mov \b, T2 or \b, T1 and \c, T2 and \d, T1 or T2, T1 .endm .macro 
F4 b, c, d F2 \b, \c, \d .endm .macro UPDATE_HASH hash, val add \hash, \val mov \val, \hash .endm /* * RR does two rounds of SHA-1 back to back with W[] pre-calc * t1 = F(b, c, d); e += w(i) * e += t1; b <<= 30; d += w(i+1); * t1 = F(a, b, c); * d += t1; a <<= 5; * e += a; * t1 = e; a >>= 7; * t1 <<= 5; * d += t1; */ .macro RR F, a, b, c, d, e, round add WK(\round), \e \F \b, \c, \d # t1 = F(b, c, d); W_PRECALC (\round + W_PRECALC_AHEAD) rol $30, \b add T1, \e add WK(\round + 1), \d \F \a, \b, \c W_PRECALC (\round + W_PRECALC_AHEAD + 1) rol $5, \a add \a, \e add T1, \d ror $7, \a # (a <>r 7) => a <= 80) && (i < (80 + W_PRECALC_AHEAD)))) .set i, ((\r) % 80) # pre-compute for the next iteration .if (i == 0) W_PRECALC_RESET .endif W_PRECALC_00_15 .elseif (i<32) W_PRECALC_16_31 .elseif (i < 80) // rounds 32-79 W_PRECALC_32_79 .endif .endm .macro W_PRECALC_RESET .set W, W0 .set W_minus_04, W4 .set W_minus_08, W8 .set W_minus_12, W12 .set W_minus_16, W16 .set W_minus_20, W20 .set W_minus_24, W24 .set W_minus_28, W28 .set W_minus_32, W .endm .macro W_PRECALC_ROTATE .set W_minus_32, W_minus_28 .set W_minus_28, W_minus_24 .set W_minus_24, W_minus_20 .set W_minus_20, W_minus_16 .set W_minus_16, W_minus_12 .set W_minus_12, W_minus_08 .set W_minus_08, W_minus_04 .set W_minus_04, W .set W, W_minus_32 .endm .macro W_PRECALC_SSSE3 .macro W_PRECALC_00_15 W_PRECALC_00_15_SSSE3 .endm .macro W_PRECALC_16_31 W_PRECALC_16_31_SSSE3 .endm .macro W_PRECALC_32_79 W_PRECALC_32_79_SSSE3 .endm /* message scheduling pre-compute for rounds 0-15 */ .macro W_PRECALC_00_15_SSSE3 .if ((i & 3) == 0) movdqu (i*4)(BUFFER_PTR), W_TMP1 .elseif ((i & 3) == 1) pshufb XMM_SHUFB_BSWAP, W_TMP1 movdqa W_TMP1, W .elseif ((i & 3) == 2) paddd (K_BASE), W_TMP1 .elseif ((i & 3) == 3) movdqa W_TMP1, WK(i&~3) W_PRECALC_ROTATE .endif .endm /* message scheduling pre-compute for rounds 16-31 * * - calculating last 32 w[i] values in 8 XMM registers * - pre-calculate K+w[i] values and store to mem, for later load by ALU add * instruction * * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] * dependency, but improves for 32-79 */ .macro W_PRECALC_16_31_SSSE3 # blended scheduling of vector and scalar instruction streams, one 4-wide # vector iteration / 4 scalar rounds .if ((i & 3) == 0) movdqa W_minus_12, W palignr $8, W_minus_16, W # w[i-14] movdqa W_minus_04, W_TMP1 psrldq $4, W_TMP1 # w[i-3] pxor W_minus_08, W .elseif ((i & 3) == 1) pxor W_minus_16, W_TMP1 pxor W_TMP1, W movdqa W, W_TMP2 movdqa W, W_TMP1 pslldq $12, W_TMP2 .elseif ((i & 3) == 2) psrld $31, W pslld $1, W_TMP1 por W, W_TMP1 movdqa W_TMP2, W psrld $30, W_TMP2 pslld $2, W .elseif ((i & 3) == 3) pxor W, W_TMP1 pxor W_TMP2, W_TMP1 movdqa W_TMP1, W paddd K_XMM(K_BASE), W_TMP1 movdqa W_TMP1, WK(i&~3) W_PRECALC_ROTATE .endif .endm /* message scheduling pre-compute for rounds 32-79 * * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken */ .macro W_PRECALC_32_79_SSSE3 .if ((i & 3) == 0) movdqa W_minus_04, W_TMP1 pxor W_minus_28, W # W is W_minus_32 before xor palignr $8, W_minus_08, W_TMP1 .elseif ((i & 3) == 1) pxor W_minus_16, W pxor W_TMP1, W movdqa W, W_TMP1 .elseif ((i & 3) == 2) psrld $30, W pslld $2, W_TMP1 por W, W_TMP1 .elseif ((i & 3) == 3) movdqa W_TMP1, W paddd K_XMM(K_BASE), W_TMP1 movdqa W_TMP1, WK(i&~3) W_PRECALC_ROTATE .endif .endm .endm // W_PRECALC_SSSE3 #define K1 
0x5a827999 #define K2 0x6ed9eba1 #define K3 0x8f1bbcdc #define K4 0xca62c1d6 .section .rodata .align 16 K_XMM_AR: .long K1, K1, K1, K1 .long K2, K2, K2, K2 .long K3, K3, K3, K3 .long K4, K4, K4, K4 BSWAP_SHUFB_CTL: .long 0x00010203 .long 0x04050607 .long 0x08090a0b .long 0x0c0d0e0f .section .text W_PRECALC_SSSE3 .macro xmm_mov a, b movdqu \a,\b .endm /* * SSSE3 optimized implementation: * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws, * unsigned int rounds); */ SHA1_VECTOR_ASM sha1_transform_ssse3 .macro W_PRECALC_AVX .purgem W_PRECALC_00_15 .macro W_PRECALC_00_15 W_PRECALC_00_15_AVX .endm .purgem W_PRECALC_16_31 .macro W_PRECALC_16_31 W_PRECALC_16_31_AVX .endm .purgem W_PRECALC_32_79 .macro W_PRECALC_32_79 W_PRECALC_32_79_AVX .endm .macro W_PRECALC_00_15_AVX .if ((i & 3) == 0) vmovdqu (i*4)(BUFFER_PTR), W_TMP1 .elseif ((i & 3) == 1) vpshufb XMM_SHUFB_BSWAP, W_TMP1, W .elseif ((i & 3) == 2) vpaddd (K_BASE), W, W_TMP1 .elseif ((i & 3) == 3) vmovdqa W_TMP1, WK(i&~3) W_PRECALC_ROTATE .endif .endm .macro W_PRECALC_16_31_AVX .if ((i & 3) == 0) vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] vpxor W_minus_08, W, W vpxor W_minus_16, W_TMP1, W_TMP1 .elseif ((i & 3) == 1) vpxor W_TMP1, W, W vpslldq $12, W, W_TMP2 vpslld $1, W, W_TMP1 .elseif ((i & 3) == 2) vpsrld $31, W, W vpor W, W_TMP1, W_TMP1 vpslld $2, W_TMP2, W vpsrld $30, W_TMP2, W_TMP2 .elseif ((i & 3) == 3) vpxor W, W_TMP1, W_TMP1 vpxor W_TMP2, W_TMP1, W vpaddd K_XMM(K_BASE), W, W_TMP1 vmovdqu W_TMP1, WK(i&~3) W_PRECALC_ROTATE .endif .endm .macro W_PRECALC_32_79_AVX .if ((i & 3) == 0) vpalignr $8, W_minus_08, W_minus_04, W_TMP1 vpxor W_minus_28, W, W # W is W_minus_32 before xor .elseif ((i & 3) == 1) vpxor W_minus_16, W_TMP1, W_TMP1 vpxor W_TMP1, W, W .elseif ((i & 3) == 2) vpslld $2, W, W_TMP1 vpsrld $30, W, W vpor W, W_TMP1, W .elseif ((i & 3) == 3) vpaddd K_XMM(K_BASE), W, W_TMP1 vmovdqu W_TMP1, WK(i&~3) W_PRECALC_ROTATE .endif .endm .endm // W_PRECALC_AVX W_PRECALC_AVX .purgem xmm_mov .macro xmm_mov a, b vmovdqu \a,\b .endm /* * AVX optimized implementation: * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws, * unsigned int rounds); */ SHA1_VECTOR_ASM sha1_transform_avx #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif sheepdog-0.8.3/lib/sockfd_cache.c000066400000000000000000000300021237656255000166340ustar00rootroot00000000000000/* * Copyright (C) 2012-2013 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* * The sockfd cache provides us long TCP connections connected to the nodes * in the cluster to accerlater the data transfer, which has the following * characteristics: * 0 dynamically allocated/deallocated at node granularity. * 1 cached fds are multiplexed by all threads. * 2 each session (for e.g, forward_write_obj_req) can grab one fd at a time. * 3 if there isn't any FD available from cache, use normal connect_to() and * close() internally. * 4 FD are named by IP:PORT uniquely, hence no need of resetting at * membership change. * 5 the total number of FDs is scalable to massive nodes. * 6 total 3 APIs: sheep_{get,put,del}_sockfd(). * 7 support dual connections to a single node. 
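 *
 * A minimal caller sketch (illustrative only, not part of the original
 * source; request_fn is a made-up placeholder), using the
 * sockfd_cache_{get,put,del}() entry points defined below in this file:
 *
 *	struct sockfd *sfd = sockfd_cache_get(nid);
 *	if (!sfd)
 *		return -1;			// node unreachable
 *	if (request_fn(sfd->fd) < 0)
 *		sockfd_cache_del(nid, sfd);	// peer looks dead: drop it
 *	else
 *		sockfd_cache_put(nid, sfd);	// release the slot for reuse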
 */ #include #include "sockfd_cache.h" #include "work.h" #include "rbtree.h" #include "util.h" #include "sheep.h" struct sockfd_cache { struct rb_root root; struct sd_rw_lock lock; int count; }; static struct sockfd_cache sockfd_cache = { .root = RB_ROOT, .lock = SD_RW_LOCK_INITIALIZER, }; /* * Suppose request size from Guest is 512k, then 4M / 512k = 8, so at * most 8 requests can be issued to the same sheep object. Based on this * assumption, '8' would be efficient for servers that only host 2~4 * Guests. * * This fd count will be dynamically grown when the idx reaches the watermark, * which is calculated by FDS_WATERMARK (3/4 of the count: 6 for the default * count of 8, so handing out slot 7 triggers doubling to 16). */ #define FDS_WATERMARK(x) ((x) * 3 / 4) #define DEFAULT_FDS_COUNT 8 /* How many FDs we cache for one node */ static int fds_count = DEFAULT_FDS_COUNT; struct sockfd_cache_fd { int fd; uatomic_bool in_use; }; struct sockfd_cache_entry { struct rb_node rb; struct node_id nid; struct sockfd_cache_fd *fds; }; static int sockfd_cache_cmp(const struct sockfd_cache_entry *a, const struct sockfd_cache_entry *b) { return node_id_cmp(&a->nid, &b->nid); } static struct sockfd_cache_entry * sockfd_cache_insert(struct sockfd_cache_entry *new) { return rb_insert(&sockfd_cache.root, new, rb, sockfd_cache_cmp); } static struct sockfd_cache_entry *sockfd_cache_search(const struct node_id *nid) { struct sockfd_cache_entry key = { .nid = *nid }; return rb_search(&sockfd_cache.root, &key, rb, sockfd_cache_cmp); } static inline int get_free_slot(struct sockfd_cache_entry *entry) { int idx = -1, i; for (i = 0; i < fds_count; i++) { if (!uatomic_set_true(&entry->fds[i].in_use)) continue; idx = i; break; } return idx; } /* * Grab a free slot of the node and inc the refcount of the slot * * If no free slot is available, this typically means we should use a short FD. */ static struct sockfd_cache_entry *sockfd_cache_grab(const struct node_id *nid, int *ret_idx) { struct sockfd_cache_entry *entry; sd_read_lock(&sockfd_cache.lock); entry = sockfd_cache_search(nid); if (!entry) { sd_debug("failed node %s", addr_to_str(nid->addr, nid->port)); goto out; } *ret_idx = get_free_slot(entry); if (*ret_idx == -1) entry = NULL; out: sd_rw_unlock(&sockfd_cache.lock); return entry; } static inline bool slots_all_free(struct sockfd_cache_entry *entry) { int i; for (i = 0; i < fds_count; i++) if (uatomic_is_true(&entry->fds[i].in_use)) return false; return true; } static inline void destroy_all_slots(struct sockfd_cache_entry *entry) { int i; for (i = 0; i < fds_count; i++) if (entry->fds[i].fd != -1) close(entry->fds[i].fd); } static void free_cache_entry(struct sockfd_cache_entry *entry) { free(entry->fds); free(entry); } /* * Destroy all the Cached FDs of the node * * We don't proceed if some other node grabs one FD of the node. In this case, * the victim node will finally find itself talking to a dead node and call * sockfd_cache_del() to delete this node from the cache.
*/ static bool sockfd_cache_destroy(const struct node_id *nid) { struct sockfd_cache_entry *entry; sd_write_lock(&sockfd_cache.lock); entry = sockfd_cache_search(nid); if (!entry) { sd_debug("It is already destroyed"); goto false_out; } if (!slots_all_free(entry)) { sd_debug("Some victim still holds it"); goto false_out; } rb_erase(&entry->rb, &sockfd_cache.root); sd_rw_unlock(&sockfd_cache.lock); destroy_all_slots(entry); free_cache_entry(entry); return true; false_out: sd_rw_unlock(&sockfd_cache.lock); return false; } static void sockfd_cache_add_nolock(const struct node_id *nid) { struct sockfd_cache_entry *new = xmalloc(sizeof(*new)); int i; new->fds = xzalloc(sizeof(struct sockfd_cache_fd) * fds_count); for (i = 0; i < fds_count; i++) new->fds[i].fd = -1; memcpy(&new->nid, nid, sizeof(struct node_id)); if (sockfd_cache_insert(new)) { free_cache_entry(new); return; } sockfd_cache.count++; } /* Add group of nodes to the cache */ void sockfd_cache_add_group(const struct rb_root *nroot) { struct sd_node *n; sd_write_lock(&sockfd_cache.lock); rb_for_each_entry(n, nroot, rb) { sockfd_cache_add_nolock(&n->nid); } sd_rw_unlock(&sockfd_cache.lock); } /* Add one node to the cache means we can do caching tricks on this node */ void sockfd_cache_add(const struct node_id *nid) { struct sockfd_cache_entry *new; int n, i; sd_write_lock(&sockfd_cache.lock); new = xmalloc(sizeof(*new)); new->fds = xzalloc(sizeof(struct sockfd_cache_fd) * fds_count); for (i = 0; i < fds_count; i++) new->fds[i].fd = -1; memcpy(&new->nid, nid, sizeof(struct node_id)); if (sockfd_cache_insert(new)) { free_cache_entry(new); sd_rw_unlock(&sockfd_cache.lock); return; } sd_rw_unlock(&sockfd_cache.lock); n = uatomic_add_return(&sockfd_cache.count, 1); sd_debug("%s, count %d", addr_to_str(nid->addr, nid->port), n); } static uatomic_bool fds_in_grow; static int fds_high_watermark = FDS_WATERMARK(DEFAULT_FDS_COUNT); static struct work_queue *grow_wq; static void do_grow_fds(struct work *work) { struct sockfd_cache_entry *entry; int old_fds_count, new_fds_count, new_size, i; sd_debug("%d", fds_count); sd_write_lock(&sockfd_cache.lock); old_fds_count = fds_count; new_fds_count = fds_count * 2; new_size = sizeof(struct sockfd_cache_fd) * fds_count * 2; rb_for_each_entry(entry, &sockfd_cache.root, rb) { entry->fds = xrealloc(entry->fds, new_size); for (i = old_fds_count; i < new_fds_count; i++) { entry->fds[i].fd = -1; uatomic_set_false(&entry->fds[i].in_use); } } fds_count *= 2; fds_high_watermark = FDS_WATERMARK(fds_count); sd_rw_unlock(&sockfd_cache.lock); } static void grow_fds_done(struct work *work) { sd_debug("fd count has been grown into %d", fds_count); uatomic_set_false(&fds_in_grow); free(work); } static inline void check_idx(int idx) { struct work *w; if (idx <= fds_high_watermark) return; if (!uatomic_set_true(&fds_in_grow)) return; w = xmalloc(sizeof(*w)); w->fn = do_grow_fds; w->done = grow_fds_done; queue_work(grow_wq, w); } /* Add the node back if it is still alive */ static inline int revalidate_node(const struct node_id *nid) { bool use_io = nid->io_port ? true : false; int fd; if (use_io) { fd = connect_to_addr(nid->io_addr, nid->io_port); if (fd >= 0) goto alive; } fd = connect_to_addr(nid->addr, nid->port); if (fd < 0) return false; alive: close(fd); sockfd_cache_add(nid); return true; } /* Try to create/get cached IO connection. 
If that fails, fall back to a non-IO one */ static struct sockfd *sockfd_cache_get_long(const struct node_id *nid) { struct sockfd_cache_entry *entry; struct sockfd *sfd; bool use_io = nid->io_port ? true : false; const uint8_t *addr = use_io ? nid->io_addr : nid->addr; int fd, idx = -1, port = use_io ? nid->io_port : nid->port; grab: entry = sockfd_cache_grab(nid, &idx); if (!entry) { /* * The node is deleted, but someone asks us to grab it. * The nid is not in the sockfd cache but it might still be * alive due to a broken network connection, or it was just too * busy to serve any request, which made other nodes delete it * from the sockfd cache. In such cases, we need to add it back. */ if (!revalidate_node(nid)) return NULL; goto grab; } check_idx(idx); if (entry->fds[idx].fd != -1) { sd_debug("%s, idx %d", addr_to_str(addr, port), idx); goto out; } /* Create a new cached connection for this node */ sd_debug("create cache connection %s idx %d", addr_to_str(addr, port), idx); fd = connect_to_addr(addr, port); if (fd < 0) { if (use_io) { sd_err("fallback to non-io connection"); fd = connect_to_addr(nid->addr, nid->port); if (fd >= 0) goto new; } uatomic_set_false(&entry->fds[idx].in_use); return NULL; } new: entry->fds[idx].fd = fd; out: sfd = xmalloc(sizeof(*sfd)); sfd->fd = entry->fds[idx].fd; sfd->idx = idx; return sfd; } static void sockfd_cache_put_long(const struct node_id *nid, int idx) { bool use_io = nid->io_port ? true : false; const uint8_t *addr = use_io ? nid->io_addr : nid->addr; int port = use_io ? nid->io_port : nid->port; struct sockfd_cache_entry *entry; sd_debug("%s idx %d", addr_to_str(addr, port), idx); sd_read_lock(&sockfd_cache.lock); entry = sockfd_cache_search(nid); if (entry) uatomic_set_false(&entry->fds[idx].in_use); sd_rw_unlock(&sockfd_cache.lock); } static void sockfd_cache_close(const struct node_id *nid, int idx) { bool use_io = nid->io_port ? true : false; const uint8_t *addr = use_io ? nid->io_addr : nid->addr; int port = use_io ? nid->io_port : nid->port; struct sockfd_cache_entry *entry; sd_debug("%s idx %d", addr_to_str(addr, port), idx); sd_write_lock(&sockfd_cache.lock); entry = sockfd_cache_search(nid); if (entry) { close(entry->fds[idx].fd); entry->fds[idx].fd = -1; uatomic_set_false(&entry->fds[idx].in_use); } sd_rw_unlock(&sockfd_cache.lock); } /* * Create the work queue for growing fds. * Before this function is called, growing cannot be done. */ int sockfd_init(void) { grow_wq = create_ordered_work_queue("sockfd_grow"); if (!grow_wq) { sd_err("error at creating workqueue for sockfd growth"); return -1; } return 0; } /* * Return a sockfd connected to the node to the caller * * Try to get a 'long' FD first, which is cached and never closed. If no long * FD is available, we return a 'short' FD which is supposed to be closed by * sockfd_cache_put(). * * ret_idx is opaque to the caller, -1 indicates it is a short FD. */ struct sockfd *sockfd_cache_get(const struct node_id *nid) { struct sockfd *sfd; int fd; sfd = sockfd_cache_get_long(nid); if (sfd) return sfd; /* Fall back on a non-io connection that is to be closed shortly */ fd = connect_to_addr(nid->addr, nid->port); if (fd < 0) return NULL; sfd = xmalloc(sizeof(*sfd)); sfd->idx = -1; sfd->fd = fd; sd_debug("%d", fd); return sfd; } /* * Release a sockfd connected to the node, which is acquired from * sockfd_cache_get() * * If it is a long FD, just decrease the refcount to make it available again. * If it is a short FD, close it.
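 *
 * A sketch of the release path (illustrative, not from the original
 * source); whether sfd came back 'long' (sfd->idx >= 0) or 'short'
 * (sfd->idx == -1), the same call returns it:
 *
 *	// after the request on sfd->fd has completed:
 *	sockfd_cache_put(nid, sfd);	// long: clear in_use; short: close()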
 */ void sockfd_cache_put(const struct node_id *nid, struct sockfd *sfd) { if (sfd->idx == -1) { sd_debug("%d", sfd->fd); close(sfd->fd); free(sfd); return; } sockfd_cache_put_long(nid, sfd->idx); free(sfd); } /* Delete all sockfds connected to the node, when the node has crashed. */ void sockfd_cache_del_node(const struct node_id *nid) { int n; if (!sockfd_cache_destroy(nid)) return; n = uatomic_sub_return(&sockfd_cache.count, 1); sd_debug("%s, count %d", addr_to_str(nid->addr, nid->port), n); } /* * Delete a sockfd connected to the node. * * If it is a long FD, de-refcount it and try to destroy all the cached FDs of * this node in the cache. * If it is a short FD, just close it. */ void sockfd_cache_del(const struct node_id *nid, struct sockfd *sfd) { if (sfd->idx == -1) { sd_debug("%d", sfd->fd); close(sfd->fd); free(sfd); return; } sockfd_cache_close(nid, sfd->idx); sockfd_cache_del_node(nid); free(sfd); } sheepdog-0.8.3/lib/strbuf.c000066400000000000000000000107351237656255000155600ustar00rootroot00000000000000/* * Taken from git by Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "strbuf.h" #include "logger.h" #include "util.h" void strbuf_init(struct strbuf *sb, size_t hint) { memset(sb, 0, sizeof(*sb)); if (hint) strbuf_grow(sb, hint); } void strbuf_release(struct strbuf *sb) { free(sb->buf); memset(sb, 0, sizeof(*sb)); } void strbuf_reset(struct strbuf *sb) { if (sb->len) strbuf_setlen(sb, 0); sb->eof = 0; } char *strbuf_detach(struct strbuf *sb) { char *res = sb->buf; strbuf_init(sb, 0); return res; } void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc) { strbuf_release(sb); sb->buf = buf; sb->len = len; sb->alloc = alloc; strbuf_grow(sb, 0); sb->buf[sb->len] = '\0'; } void strbuf_grow(struct strbuf *sb, size_t extra) { if (unlikely(sb->len + extra + 1 <= sb->len)) panic("you want to use way too much memory"); ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc); } void strbuf_rtrim(struct strbuf *sb) { while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1])) sb->len--; sb->buf[sb->len] = '\0'; } void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len) { strbuf_grow(sb, len); if (unlikely(pos > sb->len)) panic("`pos' is too far after the end of the buffer"); memmove(sb->buf + pos + len, sb->buf + pos, sb->len - pos); memcpy(sb->buf + pos, data, len); strbuf_setlen(sb, sb->len + len); } void strbuf_splice(struct strbuf *sb, size_t pos, size_t len, const void *data, size_t dlen) { if (unlikely(pos + len < pos)) panic("you want to use way too much memory"); if (unlikely(pos > sb->len)) panic("`pos' is too far after the end of the buffer"); if (unlikely(pos + len > sb->len)) panic("`pos + len' is too far after the end of the buffer"); if (dlen >= len) strbuf_grow(sb, dlen - len); memmove(sb->buf + pos + dlen, sb->buf + pos + len, sb->len - pos - len); memcpy(sb->buf + pos, data, dlen); strbuf_setlen(sb, sb->len + dlen - len); } void strbuf_remove(struct strbuf *sb, size_t pos, size_t len) { strbuf_splice(sb, pos, len, NULL, 0); } void strbuf_add(struct strbuf *sb, const void *data, size_t len) { strbuf_grow(sb, len); memcpy(sb->buf + sb->len, data, len); strbuf_setlen(sb, sb->len + len); } void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
{ int len; va_list ap; va_start(ap, fmt); len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); va_end(ap); if (len < 0) len = 0; if (len > strbuf_avail(sb)) { strbuf_grow(sb, len); va_start(ap, fmt); len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); va_end(ap); if (unlikely(len > strbuf_avail(sb))) panic("this should not happen, your snprintf is broken"); } strbuf_setlen(sb, sb->len + len); } size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f) { size_t res; strbuf_grow(sb, size); res = fread(sb->buf + sb->len, 1, size, f); if (res > 0) strbuf_setlen(sb, sb->len + res); return res; } ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint) { size_t oldlen = sb->len; strbuf_grow(sb, hint ? hint : 8192); for (;;) { ssize_t cnt; cnt = xread(fd, sb->buf + sb->len, sb->alloc - sb->len - 1); if (cnt < 0) { strbuf_setlen(sb, oldlen); return -1; } if (!cnt) break; sb->len += cnt; strbuf_grow(sb, 8192); } sb->buf[sb->len] = '\0'; return sb->len - oldlen; } static int strbuf_getwholeline(struct strbuf *sb, FILE *fp, int term) { int ch; if (feof(fp)) return EOF; strbuf_reset(sb); while ((ch = fgetc(fp)) != EOF) { strbuf_grow(sb, 1); sb->buf[sb->len++] = ch; if (ch == term) break; } if (ch == EOF && sb->len == 0) return EOF; sb->buf[sb->len] = '\0'; return 0; } int strbuf_getline(struct strbuf *sb, FILE *fp, int term) { if (strbuf_getwholeline(sb, fp, term)) return EOF; if (sb->buf[sb->len-1] == term) strbuf_setlen(sb, sb->len-1); return 0; } int strbuf_copyout(struct strbuf *sb, void *buf, size_t len) { len = min(len, sb->len + 1); memcpy(buf, sb->buf, len); return len; } int strbuf_stripout(struct strbuf *sb, void *buf, size_t len) { len = min(len, sb->len); if (len == 0) goto out; memcpy(buf, sb->buf, len); strbuf_remove(sb, 0, len); out: return len; } sheepdog-0.8.3/lib/util.c000066400000000000000000000364151237656255000152330ustar00rootroot00000000000000/* * Taken and modfied from git by Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include #include #include #include #include #include #include #include #include "util.h" mode_t sd_def_dmode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP; mode_t sd_def_fmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; static void do_nothing(size_t size) { } static void (*try_to_free_routine)(size_t size) = do_nothing; try_to_free_t set_try_to_free_routine(try_to_free_t routine) { try_to_free_t old = try_to_free_routine; if (!routine) routine = do_nothing; try_to_free_routine = routine; return old; } void *xmalloc(size_t size) { void *ret = malloc(size); if (unlikely(!ret) && unlikely(!size)) ret = malloc(1); if (unlikely(!ret)) { try_to_free_routine(size); ret = malloc(size); if (!ret && !size) ret = malloc(1); if (!ret) panic("Out of memory"); } return ret; } void *xzalloc(size_t size) { return xcalloc(1, size); } void *xrealloc(void *ptr, size_t size) { void *ret = realloc(ptr, size); if (unlikely(!ret) && unlikely(!size)) ret = realloc(ptr, 1); if (unlikely(!ret)) { try_to_free_routine(size); ret = realloc(ptr, size); if (!ret && !size) ret = realloc(ptr, 1); if (!ret) panic("Out of memory"); } return ret; } void *xcalloc(size_t nmemb, size_t size) { void *ret = calloc(nmemb, size); if (unlikely(!ret) && unlikely(!nmemb || !size)) ret = calloc(1, 1); if (unlikely(!ret)) { try_to_free_routine(nmemb * size); ret = calloc(nmemb, size); if (!ret && (!nmemb || !size)) ret = calloc(1, 1); if (!ret) panic("Out of memory"); } return ret; } /* zeroed memory version of valloc() */ void *xvalloc(size_t size) { void *ret = valloc(size); if (unlikely(!ret)) panic("Out of memory"); memset(ret, 0, size); return ret; } static ssize_t _read(int fd, void *buf, size_t len) { ssize_t nr; while (true) { nr = read(fd, buf, len); if (unlikely(nr < 0) && (errno == EAGAIN || errno == EINTR)) continue; return nr; } } static ssize_t _write(int fd, const void *buf, size_t len) { ssize_t nr; while (true) { nr = write(fd, buf, len); if (unlikely(nr < 0) && (errno == EAGAIN || errno == EINTR)) continue; return nr; } } ssize_t xread(int fd, void *buf, size_t count) { char *p = buf; ssize_t total = 0; while (count > 0) { ssize_t loaded = _read(fd, p, count); if (unlikely(loaded < 0)) return -1; if (unlikely(loaded == 0)) return total; count -= loaded; p += loaded; total += loaded; } return total; } ssize_t xwrite(int fd, const void *buf, size_t count) { const char *p = buf; ssize_t total = 0; while (count > 0) { ssize_t written = _write(fd, p, count); if (unlikely(written < 0)) return -1; if (unlikely(!written)) { errno = ENOSPC; return -1; } count -= written; p += written; total += written; } return total; } static ssize_t _pread(int fd, void *buf, size_t len, off_t offset) { ssize_t nr; while (true) { nr = pread(fd, buf, len, offset); if (unlikely(nr < 0) && (errno == EAGAIN || errno == EINTR)) continue; return nr; } } static ssize_t _pwrite(int fd, const void *buf, size_t len, off_t offset) { ssize_t nr; while (true) { nr = pwrite(fd, buf, len, offset); if (unlikely(nr < 0) && (errno == EAGAIN || errno == EINTR)) continue; return nr; } } ssize_t xpread(int fd, void *buf, size_t count, off_t offset) { char *p = buf; ssize_t total = 0; while (count > 0) { ssize_t loaded = _pread(fd, p, count, offset); if (unlikely(loaded < 0)) return -1; if (unlikely(loaded == 0)) return total; count -= loaded; p += loaded; total += loaded; offset += loaded; } return total; } ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset) { const char *p = buf; ssize_t total = 0; while (count > 
0) { ssize_t written = _pwrite(fd, p, count, offset); if (unlikely(written < 0)) return -1; if (unlikely(!written)) { errno = ENOSPC; return -1; } count -= written; p += written; total += written; offset += written; } return total; } /* Return EEXIST when path exists but not a directory */ int xmkdir(const char *pathname, mode_t mode) { if (mkdir(pathname, mode) < 0) { struct stat st; if (errno != EEXIST) return -1; if (stat(pathname, &st) < 0) return -1; if (!S_ISDIR(st.st_mode)) { errno = EEXIST; return -1; } } return 0; } int xfallocate(int fd, int mode, off_t offset, off_t len) { int ret; do { ret = fallocate(fd, mode, offset, len); } while (unlikely(ret < 0) && (errno == EAGAIN || errno == EINTR)); return ret; } int xftruncate(int fd, off_t length) { int ret; do { ret = ftruncate(fd, length); } while (unlikely(ret < 0) && (errno == EAGAIN || errno == EINTR)); return ret; } /* * Return the read value on success, or -1 if efd has been made nonblocking and * errno is EAGAIN. If efd has been marked blocking or the eventfd counter is * not zero, this function doesn't return error. */ int eventfd_xread(int efd) { int ret; eventfd_t value = 0; do { ret = eventfd_read(efd, &value); } while (unlikely(ret < 0) && errno == EINTR); if (ret == 0) ret = value; else if (unlikely(errno != EAGAIN)) panic("eventfd_read() failed, %m"); return ret; } void eventfd_xwrite(int efd, int value) { int ret; do { ret = eventfd_write(efd, (eventfd_t)value); } while (unlikely(ret < 0) && (errno == EINTR || errno == EAGAIN)); if (unlikely(ret < 0)) panic("eventfd_write() failed, %m"); } /* * Copy the string str to buf. If str length is bigger than buf_size - * 1 then it is clamped to buf_size - 1. * NOTE: this function does what strncpy should have done to be * useful. NEVER use strncpy. * * @param buf destination buffer * @param buf_size size of destination buffer * @param str source string */ void pstrcpy(char *buf, int buf_size, const char *str) { int c; char *q = buf; if (buf_size <= 0) return; while (true) { c = *str++; if (c == 0 || q >= buf + buf_size - 1) break; *q++ = c; } *q = '\0'; } /* remove a newline character from the end of a string */ char *chomp(char *str) { char *p = strchr(str, '\n'); if (p != NULL) *p = '\0'; return str; } /* Purge directory recursively */ int purge_directory(const char *dir_path) { int ret = 0; struct stat s; DIR *dir; struct dirent *d; char path[PATH_MAX]; dir = opendir(dir_path); if (!dir) { if (errno != ENOENT) sd_err("failed to open %s: %m", dir_path); return -errno; } while ((d = readdir(dir))) { if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, "..")) continue; snprintf(path, sizeof(path), "%s/%s", dir_path, d->d_name); ret = stat(path, &s); if (ret) { sd_err("failed to stat %s: %m", path); goto out; } if (S_ISDIR(s.st_mode)) ret = rmdir_r(path); else ret = unlink(path); if (ret != 0) { sd_err("failed to remove %s %s: %m", S_ISDIR(s.st_mode) ? "directory" : "file", path); goto out; } } out: closedir(dir); return ret; } /* remove directory recursively */ int rmdir_r(const char *dir_path) { int ret; ret = purge_directory(dir_path); if (ret == 0) ret = rmdir(dir_path); return ret; } bool is_numeric(const char *s) { const char *p = s; if (*p) { char c; while ((c = *p++)) if (!isdigit(c)) return false; return true; } return false; } /* * We regard 'data' as string when it contains '\0' in the first 256 characters. 
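 *
 * Behaviour sketch (illustrative, derived from the code below; blob_4k
 * stands for some made-up binary buffer):
 *
 *	data_to_str(NULL, 0);		// -> "(null)"
 *	data_to_str("abc", 4);		// -> "abc" (NUL within 256 bytes)
 *	data_to_str(blob_4k, 4096);	// -> "(not string)" when no NUL is
 *					//    found in the first 256 bytes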
 */ const char *data_to_str(void *data, size_t data_length) { data_length = MIN(data_length, 256); if (data == NULL) return "(null)"; if (memchr(data, '\0', data_length) != NULL) return data; return "(not string)"; } /* * If 'once' is true, the signal will be restored to the default state * after 'handler' is called. */ int install_sighandler(int signum, void (*handler)(int), bool once) { struct sigaction sa = {}; sa.sa_handler = handler; if (once) sa.sa_flags = SA_RESETHAND | SA_NODEFER; sigemptyset(&sa.sa_mask); return sigaction(signum, &sa, NULL); } int install_crash_handler(void (*handler)(int)) { return install_sighandler(SIGSEGV, handler, true) || install_sighandler(SIGABRT, handler, true) || install_sighandler(SIGBUS, handler, true) || install_sighandler(SIGILL, handler, true) || install_sighandler(SIGFPE, handler, true); } /* * Re-raise the signal 'signo' for the default signal handler to dump * a core file, and exit with 'status' if the default handler cannot * terminate the process. This function is expected to be called in * the installed signal handlers with install_crash_handler(). */ void reraise_crash_signal(int signo, int status) { int ret = raise(signo); /* We won't get here normally. */ if (ret != 0) sd_emerg("failed to re-raise signal %d (%s).", signo, strsignal(signo)); else sd_emerg("default handler for the re-raised " "signal %d (%s) didn't work as expected", signo, strsignal(signo)); exit(status); } pid_t gettid(void) { return syscall(SYS_gettid); } int tkill(int tid, int sig) { return syscall(SYS_tgkill, getpid(), tid, sig); } bool is_xattr_enabled(const char *path) { int ret, dummy; ret = getxattr(path, "user.dummy", &dummy, sizeof(dummy)); return !(ret == -1 && errno == ENOTSUP); } const char *my_exe_path(void) { static __thread char path[PATH_MAX]; int ret; if (path[0] == '\0') { ret = readlink("/proc/self/exe", path, sizeof(path)); if (ret < 0) /* readlink() returns -1 on error */ panic("%m"); } return path; } /* * Split the given path and set the split parts in 'segs'. * * This returns the number of split segments. * * For example: * split_path("/a/b/c", 3, segs); * -> Returns 3 and segs will be { "a", "b", "c" }. * split_path("/a//b//c", 3, segs); * -> Returns 3 and segs will be { "a", "b", "c" }. * split_path("/a/b/c", 2, segs); * -> Returns 2 and segs will be { "a", "b/c" }. * split_path("/a/b/c", 4, segs); * -> Returns 3 and segs will be { "a", "b", "c", undefined }. */ int split_path(const char *path, size_t nr_segs, char **segs) { for (int i = 0; i < nr_segs; i++) { while (*path == '/') path++; if (*path == '\0') return i; if (i == nr_segs - 1) { segs[i] = strdup(path); if (segs[i] == NULL) panic("OOM"); } else { char *p = strchrnul(path, '/'); int len = p - path; segs[i] = xmalloc(len + 1); memcpy(segs[i], path, len); segs[i][len] = '\0'; path = p; } } return nr_segs; } /* Concatenate 'segs' with '/' separators. */ void make_path(char *path, size_t size, size_t nr_segs, const char **segs) { for (int i = 0; i < nr_segs; i++) { int len = snprintf(path, size, "/%s", segs[i]); path += len; size -= len; } } /* * If force_create is true, this function creates the file even when the * temporary file exists.
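 *
 * The write is made atomic by creating "path.tmp" with O_EXCL first and
 * rename(2)-ing it over "path". A call sketch (illustrative only; the
 * path and payload are made up):
 *
 *	const char buf[] = "copies=3\n";
 *	if (atomic_create_and_write("/var/lib/sheepdog/config", buf,
 *				    sizeof(buf) - 1, false) < 0)
 *		sd_err("failed to update config");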
 */ int atomic_create_and_write(const char *path, const char *buf, size_t len, bool force_create) { int fd, ret; char tmp_path[PATH_MAX]; snprintf(tmp_path, PATH_MAX, "%s.tmp", path); again: fd = open(tmp_path, O_WRONLY | O_CREAT | O_SYNC | O_EXCL, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { if (force_create) { sd_debug("clean up a temporary file %s", tmp_path); unlink(tmp_path); goto again; } else sd_debug("someone else is dealing with %s", tmp_path); } else sd_err("failed to open temporary file %s, %m", tmp_path); ret = -1; goto end; } ret = xwrite(fd, buf, len); if (unlikely(ret != len)) { sd_err("failed to write %s, %m", path); ret = -1; goto close_fd; } ret = rename(tmp_path, path); if (unlikely(ret < 0)) { sd_err("failed to rename %s, %m", path); ret = -1; } close_fd: close(fd); end: return ret; } /* * Returns a list organized in an intermediate format suited * to chaining of merge() calls: null-terminated, no reserved or * sentinel head node, "prev" links not maintained. */ static struct list_node *merge(void *priv, int (*cmp)(void *priv, struct list_node *a, struct list_node *b), struct list_node *a, struct list_node *b) { struct list_node head, *tail = &head; while (a && b) { /* if equal, take 'a' -- important for sort stability */ if ((*cmp)(priv, a, b) <= 0) { tail->next = a; a = a->next; } else { tail->next = b; b = b->next; } tail = tail->next; } tail->next = a?:b; return head.next; } /* * Combine final list merge with restoration of standard doubly-linked * list structure. This approach duplicates code from merge(), but * runs faster than the tidier alternatives of either a separate final * prev-link restoration pass, or maintaining the prev links * throughout. */ static void merge_and_restore_back_links(void *priv, int (*cmp)(void *priv, struct list_node *a, struct list_node *b), struct list_head *head, struct list_node *a, struct list_node *b) { struct list_node *tail = &head->n; while (a && b) { /* if equal, take 'a' -- important for sort stability */ if ((*cmp)(priv, a, b) <= 0) { tail->next = a; a->prev = tail; a = a->next; } else { tail->next = b; b->prev = tail; b = b->next; } tail = tail->next; } tail->next = a ? : b; do { /* * In worst cases this loop may run many iterations. * Continue callbacks to the client even though no * element comparison is needed, so the client's cmp() * routine can invoke cond_resched() periodically. */ (*cmp)(priv, tail->next, tail->next); tail->next->prev = tail; tail = tail->next; } while (tail->next); tail->next = &head->n; head->n.prev = tail; } /* * list_sort - sort a list * @priv: private data, opaque to list_sort(), passed to @cmp * @head: the list to sort * @cmp: the elements comparison function * * This function implements "merge sort", which has O(n log n) * complexity. * * The comparison function @cmp must return a negative value if @a * should sort before @b, and a positive value if @a should sort after * @b. If @a and @b are equivalent, and their original relative * ordering is to be preserved, @cmp must return 0.
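 *
 * Example @cmp (an illustrative sketch, not part of the original
 * source; "struct foo" and "foo_list" are made up, with a struct
 * list_node embedded as usual):
 *
 *	struct foo { int key; struct list_node node; };
 *
 *	static int foo_cmp(void *priv, struct list_node *a,
 *			   struct list_node *b)
 *	{
 *		int ka = container_of(a, struct foo, node)->key;
 *		int kb = container_of(b, struct foo, node)->key;
 *		return ka < kb ? -1 : ka > kb ? 1 : 0;
 *	}
 *
 *	list_sort(NULL, &foo_list, foo_cmp);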
 */ void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_node *a, struct list_node *b)) { /* sorted partial lists -- last slot is a sentinel */ #define MAX_LIST_LENGTH_BITS 20 struct list_node *part[MAX_LIST_LENGTH_BITS+1]; int lev; /* index into part[] */ int max_lev = 0; struct list_node *list; if (list_empty(head)) return; memset(part, 0, sizeof(part)); head->n.prev->next = NULL; list = head->n.next; while (list) { struct list_node *cur = list; list = list->next; cur->next = NULL; for (lev = 0; part[lev]; lev++) { cur = merge(priv, cmp, part[lev], cur); part[lev] = NULL; } if (lev > max_lev) { if (unlikely(lev >= ARRAY_SIZE(part)-1)) { /* * list passed to list_sort() too long for * efficiency */ lev--; } max_lev = lev; } part[lev] = cur; } for (lev = 0; lev < max_lev; lev++) if (part[lev]) list = merge(priv, cmp, part[lev], list); merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); } sheepdog-0.8.3/lib/work.c000066400000000000000000000236241237656255000152360ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * * This code is based on bs.c from Linux target framework (tgt): * Copyright (C) 2007 FUJITA Tomonori * Copyright (C) 2007 Mike Christie */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "list.h" #include "util.h" #include "bitops.h" #include "work.h" #include "event.h" /* * The protection period from shrinking the work queue. This is necessary * to avoid many calls of pthread_create. Without it, threads are * frequently created and deleted and it leads to poor performance. */ #define WQ_PROTECTION_PERIOD 1000 /* ms */ struct wq_info { const char *name; struct list_head finished_list; struct list_node list; struct sd_mutex finished_lock; struct sd_mutex startup_lock; /* workers sleep on this and are signaled by the work producer */ struct sd_cond pending_cond; /* locked by work producer and workers */ struct sd_mutex pending_lock; /* protected by pending_lock */ struct work_queue q; size_t nr_threads; /* protected by uatomic primitives */ size_t nr_queued_work; /* we cannot shrink work queue till this time */ uint64_t tm_end_of_protection; enum wq_thread_control tc; }; static int efd; static LIST_HEAD(wq_info_list); static size_t nr_nodes = 1; static size_t (*wq_get_nr_nodes)(void); static void *worker_routine(void *arg); #ifdef HAVE_TRACE #define TID_MAX_DEFAULT 0x8000 /* default maximum tid for most systems */ static size_t tid_max; static unsigned long *tid_map; static struct sd_mutex tid_map_lock = SD_MUTEX_INITIALIZER; static int resume_efd; static int ack_efd; void suspend_worker_threads(void) { struct wq_info *wi; int tid; list_for_each_entry(wi, &wq_info_list, list) { sd_mutex_lock(&wi->pending_lock); } FOR_EACH_BIT(tid, tid_map, tid_max) { if (unlikely(tkill(tid, SIGUSR2) < 0)) panic("%m"); } /* * Wait for all the worker threads to suspend. We cannot use * wi->nr_threads here because some threads may not have called set_bit() * yet (then, those threads don't receive SIGUSR2).
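 *
 * For orientation, the suspend/resume handshake implemented below
 * (a sketch of the code's own flow, one column per party):
 *
 *	suspend_worker_threads()           worker's SIGUSR2 handler
 *	------------------------           ------------------------
 *	lock all pending_locks
 *	tkill(tid, SIGUSR2) per tid  --->  eventfd_xwrite(ack_efd, 1)
 *	eventfd_xread(ack_efd) per tid     eventfd_xread(resume_efd) [blocks
 *	                                   until resume_worker_threads() runs]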
 */ FOR_EACH_BIT(tid, tid_map, tid_max) { eventfd_xread(ack_efd); } } void resume_worker_threads(void) { struct wq_info *wi; int nr_threads = 0, tid; FOR_EACH_BIT(tid, tid_map, tid_max) { nr_threads++; } eventfd_xwrite(resume_efd, nr_threads); for (int i = 0; i < nr_threads; i++) eventfd_xread(ack_efd); list_for_each_entry(wi, &wq_info_list, list) { sd_mutex_unlock(&wi->pending_lock); } } static void suspend(int num) { int uninitialized_var(value); eventfd_xwrite(ack_efd, 1); /* ack of suspend */ value = eventfd_xread(resume_efd); assert(value == 1); eventfd_xwrite(ack_efd, 1); /* ack of resume */ } static int wq_trace_init(void) { tid_max = TID_MAX_DEFAULT; tid_map = alloc_bitmap(NULL, 0, tid_max); resume_efd = eventfd(0, EFD_SEMAPHORE); ack_efd = eventfd(0, EFD_SEMAPHORE); if (resume_efd < 0 || ack_efd < 0) { sd_err("failed to create event fds: %m"); return -1; } /* trace uses this signal to suspend the worker threads */ if (install_sighandler(SIGUSR2, suspend, false) < 0) { sd_debug("%m"); return -1; } return 0; } static void trace_set_tid_map(int tid) { sd_mutex_lock(&tid_map_lock); if (tid > tid_max) { size_t old_tid_max = tid_max; /* enlarge bitmap size */ while (tid > tid_max) tid_max *= 2; tid_map = alloc_bitmap(tid_map, old_tid_max, tid_max); } set_bit(tid, tid_map); sd_mutex_unlock(&tid_map_lock); } static void trace_clear_tid_map(int tid) { sd_mutex_lock(&tid_map_lock); clear_bit(tid, tid_map); sd_mutex_unlock(&tid_map_lock); } #else static inline int wq_trace_init(void) { return 0; } static inline void trace_set_tid_map(int tid) {} static inline void trace_clear_tid_map(int tid) {} #endif /* HAVE_TRACE */ static uint64_t get_msec_time(void) { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec * 1000 + tv.tv_usec / 1000; } static inline uint64_t wq_get_roof(struct wq_info *wi) { uint64_t nr = 1; switch (wi->tc) { case WQ_ORDERED: break; case WQ_DYNAMIC: /* FIXME: 2 * nr_nodes threads. No rationale yet. */ nr = nr_nodes * 2; break; case WQ_UNLIMITED: nr = SIZE_MAX; break; default: panic("Invalid threads control %d", wi->tc); } return nr; } static bool wq_need_grow(struct wq_info *wi) { if (wi->nr_threads < uatomic_read(&wi->nr_queued_work) && wi->nr_threads * 2 <= wq_get_roof(wi)) { wi->tm_end_of_protection = get_msec_time() + WQ_PROTECTION_PERIOD; return true; } return false; } /* * Return true if more than half of the threads have been unused for more than * WQ_PROTECTION_PERIOD milliseconds */ static bool wq_need_shrink(struct wq_info *wi) { if (uatomic_read(&wi->nr_queued_work) < wi->nr_threads / 2) /* we cannot shrink work queue during protection period.
*/ return wi->tm_end_of_protection <= get_msec_time(); /* update the end of protection time */ wi->tm_end_of_protection = get_msec_time() + WQ_PROTECTION_PERIOD; return false; } static int create_worker_threads(struct wq_info *wi, size_t nr_threads) { pthread_t thread; int ret; sd_mutex_lock(&wi->startup_lock); while (wi->nr_threads < nr_threads) { ret = pthread_create(&thread, NULL, worker_routine, wi); if (ret != 0) { sd_err("failed to create worker thread: %m"); sd_mutex_unlock(&wi->startup_lock); return -1; } wi->nr_threads++; sd_debug("create thread %s %zu", wi->name, wi->nr_threads); } sd_mutex_unlock(&wi->startup_lock); return 0; } void queue_work(struct work_queue *q, struct work *work) { struct wq_info *wi = container_of(q, struct wq_info, q); uatomic_inc(&wi->nr_queued_work); sd_mutex_lock(&wi->pending_lock); if (wq_need_grow(wi)) /* double the thread pool size */ create_worker_threads(wi, wi->nr_threads * 2); list_add_tail(&work->w_list, &wi->q.pending_list); sd_mutex_unlock(&wi->pending_lock); sd_cond_signal(&wi->pending_cond); } static void worker_thread_request_done(int fd, int events, void *data) { struct wq_info *wi; struct work *work; LIST_HEAD(list); if (wq_get_nr_nodes) nr_nodes = wq_get_nr_nodes(); eventfd_xread(fd); list_for_each_entry(wi, &wq_info_list, list) { sd_mutex_lock(&wi->finished_lock); list_splice_init(&wi->finished_list, &list); sd_mutex_unlock(&wi->finished_lock); while (!list_empty(&list)) { work = list_first_entry(&list, struct work, w_list); list_del(&work->w_list); work->done(work); uatomic_dec(&wi->nr_queued_work); } } } static void *worker_routine(void *arg) { struct wq_info *wi = arg; struct work *work; int tid = gettid(); set_thread_name(wi->name, (wi->tc != WQ_ORDERED)); sd_mutex_lock(&wi->startup_lock); /* started this thread */ sd_mutex_unlock(&wi->startup_lock); trace_set_tid_map(tid); while (true) { sd_mutex_lock(&wi->pending_lock); if (wq_need_shrink(wi)) { wi->nr_threads--; trace_clear_tid_map(tid); sd_mutex_unlock(&wi->pending_lock); pthread_detach(pthread_self()); sd_debug("destroy thread %s %d, %zu", wi->name, tid, wi->nr_threads); break; } retest: if (list_empty(&wi->q.pending_list)) { sd_cond_wait(&wi->pending_cond, &wi->pending_lock); goto retest; } work = list_first_entry(&wi->q.pending_list, struct work, w_list); list_del(&work->w_list); sd_mutex_unlock(&wi->pending_lock); if (work->fn) work->fn(work); sd_mutex_lock(&wi->finished_lock); list_add_tail(&work->w_list, &wi->finished_list); sd_mutex_unlock(&wi->finished_lock); eventfd_xwrite(efd, 1); } pthread_exit(NULL); } int init_work_queue(size_t (*get_nr_nodes)(void)) { int ret; wq_get_nr_nodes = get_nr_nodes; if (wq_get_nr_nodes) nr_nodes = wq_get_nr_nodes(); efd = eventfd(0, EFD_NONBLOCK); if (efd < 0) { sd_err("failed to create event fd: %m"); return -1; } ret = wq_trace_init(); if (ret < 0) return ret; ret = register_event(efd, worker_thread_request_done, NULL); if (ret) { sd_err("failed to register event fd %m"); close(efd); return -1; } return 0; } /* * Allowing unlimited threads to be created is necessary to solve the following * problems: * * 1. timeout of IO requests from guests. With on-demand short threads, we * guarantee that there is always one thread available to execute the * request as soon as possible. * 2. sheep halt for corner case that all gateway and io threads are executing * local requests that ask for creation of another thread to execute the * requests and sleep-wait for responses. 
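 *
 * A minimal user sketch of the API defined below (illustrative only;
 * do_io/io_done are made-up callbacks):
 *
 *	static void do_io(struct work *w) { ... }  // runs in a worker thread
 *	static void io_done(struct work *w) { free(w); }
 *
 *	struct work_queue *wq = create_work_queue("io", WQ_UNLIMITED);
 *	struct work *w = xzalloc(sizeof(*w));
 *	w->fn = do_io;
 *	w->done = io_done;	// runs later in the main (event) loop
 *	queue_work(wq, w);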
*/ struct work_queue *create_work_queue(const char *name, enum wq_thread_control tc) { int ret; struct wq_info *wi; wi = xzalloc(sizeof(*wi)); wi->name = name; wi->tc = tc; INIT_LIST_HEAD(&wi->q.pending_list); INIT_LIST_HEAD(&wi->finished_list); sd_cond_init(&wi->pending_cond); sd_init_mutex(&wi->finished_lock); sd_init_mutex(&wi->pending_lock); sd_init_mutex(&wi->startup_lock); ret = create_worker_threads(wi, 1); if (ret < 0) goto destroy_threads; list_add(&wi->list, &wq_info_list); return &wi->q; destroy_threads: sd_mutex_unlock(&wi->startup_lock); sd_destroy_cond(&wi->pending_cond); sd_destroy_mutex(&wi->pending_lock); sd_destroy_mutex(&wi->finished_lock); free(wi); return NULL; } struct work_queue *create_ordered_work_queue(const char *name) { return create_work_queue(name, WQ_ORDERED); } bool work_queue_empty(struct work_queue *q) { struct wq_info *wi = container_of(q, struct wq_info, q); return uatomic_read(&wi->nr_queued_work) == 0; } sheepdog-0.8.3/man/000077500000000000000000000000001237656255000141065ustar00rootroot00000000000000sheepdog-0.8.3/man/Makefile.am000066400000000000000000000007461237656255000161510ustar00rootroot00000000000000MAINTAINERCLEANFILES = Makefile.in dist_man_MANS = sheep.8 dog.8 if BUILD_SHEEPFS dist_man_MANS += sheepfs.8 endif EXTRA_DIST = sheep.8.in dog.8.in sheepfs.8.in %.8: %.8.in Makefile $(top_srcdir)/script/gen_man.pl $(top_builddir)/%/$* rm -f $@-t $@ @sed \ -e "s#@DATE@#`date '+%Y-%m-%d'`#g" \ -e "s#@OPTIONS@#$(shell $(top_srcdir)/script/gen_man.pl $(top_builddir)/$*/$*)#g" \ $< > $@-t mv $@-t $@ all-local: $(dist_man_MANS) clean-local: rm -rf $(dist_man_MANS) sheepdog-0.8.3/man/dog.8.in000066400000000000000000000021061237656255000153540ustar00rootroot00000000000000.TH SHEEPDOG 8 @DATE@ .SH NAME dog \- Command line utility for the sheep daemon .SH SYNOPSIS .B "dog [options]" .SH DESCRIPTION .B dog - Sheepdog is a distributed storage system for QEMU. It provides highly available block level storage volumes to virtual machines. Sheepdog supports advanced volume management features such as snapshot, cloning, and thin provisioning. The architecture of Sheepdog is fully symmetric; there is no central node such as a meta-data server. The server daemon is called sheep(8). A command line utility is available via dog(8). QEMU virtual machines use the sheep daemon via a block driver available in qemu(1). For more information, run 'dog --help'. .SH COMMAND & SUBCOMMAND @OPTIONS@ .SH DEPENDENCIES \fBSheepdog\fP requires QEMU 0.13.z or later and Corosync 1.y.z or 2.y.z. .SH FILES none .SH SEE ALSO .BR sheep(8), .BR qemu(1), .BR sheepfs(8), .BR corosync_overview(8) .SH AUTHORS This software is developed by the Sheepdog community which may be reached via mailing list at . .PP sheepdog-0.8.3/man/sheep.8.in000066400000000000000000000025251237656255000157140ustar00rootroot00000000000000.TH SHEEPDOG 8 @DATE@ .SH NAME sheep \- Distributed Block Storage System for QEMU .SH SYNOPSIS .B "sheep [options] [PATH]" .SH DESCRIPTION .B sheep - Sheepdog is a distributed storage system for QEMU. It provides highly available block level storage volumes to virtual machines. Sheepdog supports advanced volume management features such as snapshot, cloning, and thin provisioning. The architecture of Sheepdog is fully symmetric; there is no central node such as a meta-data server. The server daemon is called sheep(8). A command line utility is available via dog(8). QEMU virtual machines use the sheep daemon via a block driver available in qemu(1). 
.SH OPTIONS @OPTIONS@ .SH PATH Proper LSB systems will store sheepdog files in /var/lib/sheepdog. The init script uses this directory by default. The directory must be on a filesystem with xattr support. In the case of ext3, user_xattr should be added to the mount options. mount \-o remount,user_xattr /var/lib/sheepdog .SH DEPENDENCIES \fBsheepdog\fP requires QEMU 0.13.z or later and Corosync 1.y.z. .SH FILES .B /var/lib/sheepdog - Directory containing block storage information .SH SEE ALSO .BR dog(8), .BR qemu(1), .BR sheepfs(8), .BR corosync_overview(8) .SH AUTHORS This software is developed by the sheepdog community which may be reached via mailing list at . .PP sheepdog-0.8.3/man/sheepfs.8.in000066400000000000000000000037751237656255000162510ustar00rootroot00000000000000.TH SHEEPDOG 8 @DATE@ .SH NAME sheepfs \- A pseudo file system that exports both Sheepdog's internal state and Sheepdog's storage .SH SYNOPSIS .B "sheepfs [OPTION]... MOUNTPOINT" .SH DESCRIPTION .B sheepfs - Sheepdog is a distributed storage system for QEMU. It provides highly available block level storage volumes to virtual machines. Sheepdog supports advanced volume management features such as snapshot, cloning, and thin provisioning. The architecture of Sheepdog is fully symmetric; there is no central node such as a meta-data server. The server daemon is called sheep(8). A command line utility is available via dog(8). A pseudo file system is available via sheepfs(8). QEMU virtual machines use the sheep daemon via a block driver available in qemu(1). Sheepfs is a FUSE-based pseudo file system in userland that gives access both to Sheepdog's internal state (e.g. cluster info, vdi list) and to Sheepdog's highly reliable storage. The idea here is that it is sometimes useful to envision our interaction with a Sheepdog object in terms of a directory structure and filesystem operations. People will mostly be interested in sheepfs's volume directory, which exports a VM's volume as a pseudo block file in your local file system hierarchy, which can be used as 1. a big file abstraction, which is actually backed by Sheepdog's storage, distributed in the cluster. 2. a loop device file, which you can mount wherever you want to use it as a file system backed by Sheepdog. 3. a loop device file for some VM's image, whose internal data you want to access (read-write). 4. storage media for other hypervisors, such as Xen. This file abstraction integrates well into the kernel's page cache. .SH OPTIONS @OPTIONS@ .SH DEPENDENCIES \fBSheepdog\fP requires QEMU 0.13.z or later and Corosync 1.y.z. .SH FILES none .SH SEE ALSO .BR sheep(8), .BR dog(8), .BR qemu(1), .BR corosync_overview(8) .SH AUTHORS This software is developed by the Sheepdog community which may be reached via mailing list at .
.PP sheepdog-0.8.3/script/000077500000000000000000000000001237656255000146375ustar00rootroot00000000000000sheepdog-0.8.3/script/Makefile.am000066400000000000000000000014061237656255000166740ustar00rootroot00000000000000MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = sheepdog.in noinst_HEADERS = checkarch.sh vditest gen_man.pl gen_bash_completion.pl initscript_SCRIPTS = sheepdog initscriptdir = $(INITDDIR) completion_DATA = dog completiondir = $(sysconfdir)/bash_completion.d dog: gen_bash_completion.pl Makefile rm -f $@-t $@ $(top_srcdir)/script/gen_bash_completion.pl $(top_builddir)/dog/dog > $@-t mv $@-t $@ %: %.in Makefile rm -f $@-t $@ sed \ -e 's#@''SBINDIR@#$(sbindir)#g' \ -e 's#@''SYSCONFDIR@#$(sysconfdir)#g' \ -e 's#@''INITDDIR@#$(INITDDIR)#g' \ -e 's#@''LOCALSTATEDIR@#$(localstatedir)#g' \ $< > $@-t chmod 0755 $@-t mv $@-t $@ all-local: $(initscript_SCRIPTS) $(completion_DATA) clean-local: rm -rf $(initscript_SCRIPTS) $(completion_DATA) sheepdog-0.8.3/script/checkarch.sh000066400000000000000000000004721237656255000171110ustar00rootroot00000000000000#!/bin/sh arch=`gcc -dumpmachine` case $arch in `echo $arch | grep x86_64`) echo -D__SIZEOF_POINTER__=8 -m64 ;; `echo $arch | grep "i[3-6]86"`) echo -D__SIZEOF_POINTER__=4 -m32 ;; *) echo ' Failed to parse your architecture. Please run $ make check32 or $ make check64 manually. ' exit 1 ;; esac sheepdog-0.8.3/script/checkpatch.pl000077500000000000000000002311651237656255000173040ustar00rootroot00000000000000#!/usr/bin/perl -w # (c) 2001, Dave Jones. (the file handling bit) # (c) 2005, Joel Schopp (the ugly bit) # (c) 2007,2008, Andy Whitcroft (new conditions, test suite) # (c) 2008-2010 Andy Whitcroft # Licensed under the terms of the GNU GPL License version 2 use strict; my $P = $0; $P =~ s@.*/@@g; my $V = '0.32'; use Getopt::Long qw(:config no_auto_abbrev); my $quiet = 0; my $tree = 0; my $chk_signoff = 1; my $chk_patch = 1; my $tst_only; my $emacs = 0; my $terse = 0; my $file = 0; my $check = 0; my $summary = 1; my $mailback = 0; my $summary_file = 0; my $show_types = 0; my $root; my %debug; my %ignore_type = (); my @ignore = (); my $help = 0; my $configuration_file = ".checkpatch.conf"; sub help { my ($exitcode) = @_; print << "EOM"; Usage: $P [OPTION]... [FILE]... Version: $V Options: -q, --quiet quiet --no-tree run without a kernel tree --no-signoff do not check for 'Signed-off-by' line --patch treat FILE as patchfile (default) --emacs emacs compile window format --terse one line per report -f, --file treat FILE as regular source file --subjective, --strict enable more subjective tests --ignore TYPE(,TYPE2...) ignore various comma separated message types --show-types show the message "types" in the output --root=PATH PATH to the kernel tree root --no-summary suppress the per-file summary --mailback only produce a report in case of warnings/errors --summary-file include the filename in summary --debug KEY=[0|1] turn on/off debugging of KEY, where KEY is one of 'values', 'possible', 'type', and 'attr' (default is all off) --test-only=WORD report only warnings/errors containing WORD literally -h, --help, --version display this help and exit When FILE is - read standard input. 
EOM exit($exitcode); } my $conf = which_conf($configuration_file); if (-f $conf) { my @conf_args; open(my $conffile, '<', "$conf") or warn "$P: Can't find a readable $configuration_file file $!\n"; while (<$conffile>) { my $line = $_; $line =~ s/\s*\n?$//g; $line =~ s/^\s*//g; $line =~ s/\s+/ /g; next if ($line =~ m/^\s*#/); next if ($line =~ m/^\s*$/); my @words = split(" ", $line); foreach my $word (@words) { last if ($word =~ m/^#/); push (@conf_args, $word); } } close($conffile); unshift(@ARGV, @conf_args) if @conf_args; } GetOptions( 'q|quiet+' => \$quiet, 'tree!' => \$tree, 'signoff!' => \$chk_signoff, 'patch!' => \$chk_patch, 'emacs!' => \$emacs, 'terse!' => \$terse, 'f|file!' => \$file, 'subjective!' => \$check, 'strict!' => \$check, 'ignore=s' => \@ignore, 'show-types!' => \$show_types, 'root=s' => \$root, 'summary!' => \$summary, 'mailback!' => \$mailback, 'summary-file!' => \$summary_file, 'debug=s' => \%debug, 'test-only=s' => \$tst_only, 'h|help' => \$help, 'version' => \$help ) or help(1); help(0) if ($help); my $exit = 0; if ($#ARGV < 0) { print "$P: no input files\n"; exit(1); } @ignore = split(/,/, join(',',@ignore)); foreach my $word (@ignore) { $word =~ s/\s*\n?$//g; $word =~ s/^\s*//g; $word =~ s/\s+/ /g; $word =~ tr/[a-z]/[A-Z]/; next if ($word =~ m/^\s*#/); next if ($word =~ m/^\s*$/); $ignore_type{$word}++; } my $dbg_values = 0; my $dbg_possible = 0; my $dbg_type = 0; my $dbg_attr = 0; for my $key (keys %debug) { ## no critic eval "\${dbg_$key} = '$debug{$key}';"; die "$@" if ($@); } my $rpt_cleaners = 0; if ($terse) { $emacs = 1; $quiet++; } if ($tree) { if (defined $root) { if (!top_of_kernel_tree($root)) { die "$P: $root: --root does not point at a valid tree\n"; } } else { if (top_of_kernel_tree('.')) { $root = '.'; } elsif ($0 =~ m@(.*)/scripts/[^/]*$@ && top_of_kernel_tree($1)) { $root = $1; } } if (!defined $root) { print "Must be run from the top-level dir. 
of a kernel tree\n"; exit(2); } } my $emitted_corrupt = 0; our $Ident = qr{ [A-Za-z_][A-Za-z\d_]* (?:\s*\#\#\s*[A-Za-z_][A-Za-z\d_]*)* }x; our $Storage = qr{extern|static|asmlinkage}; our $Sparse = qr{ __user| __kernel| __force| __iomem| __must_check| __init_refok| __kprobes| __ref| __rcu }x; # Notes to $Attribute: # We need \b after 'init' otherwise 'initconst' will cause a false positive in a check our $Attribute = qr{ const| __percpu| __nocast| __safe| __bitwise__| __packed__| __packed2__| __naked| __maybe_unused| __always_unused| __noreturn| __used| __cold| __noclone| __deprecated| __read_mostly| __kprobes| __(?:mem|cpu|dev|)(?:initdata|initconst|init\b)| ____cacheline_aligned| ____cacheline_aligned_in_smp| ____cacheline_internodealigned_in_smp| __weak }x; our $Modifier; our $Inline = qr{inline|__always_inline|noinline}; our $Member = qr{->$Ident|\.$Ident|\[[^]]*\]}; our $Lval = qr{$Ident(?:$Member)*}; our $Constant = qr{(?i:(?:[0-9]+|0x[0-9a-f]+)[ul]*)}; our $Assignment = qr{(?:\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=)}; our $Compare = qr{<=|>=|==|!=|<|>}; our $Operators = qr{ <=|>=|==|!=| =>|->|<<|>>|<|>|!|~| &&|\|\||,|\^|\+\+|--|&|\||\+|-|\*|\/|% }x; our $NonptrType; our $Type; our $Declare; our $NON_ASCII_UTF8 = qr{ [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 }x; our $UTF8 = qr{ [\x09\x0A\x0D\x20-\x7E] # ASCII | $NON_ASCII_UTF8 }x; our $typeTypedefs = qr{(?x: (?:__)?(?:u|s|be|le)(?:8|16|32|64)| atomic_t )}; our $logFunctions = qr{(?x: printk(?:_ratelimited|_once|)| [a-z0-9]+_(?:printk|emerg|alert|crit|err|warning|warn|notice|info|debug|dbg|vdbg|devel|cont|WARN)(?:_ratelimited|_once|)| WARN(?:_RATELIMIT|_ONCE|)| panic| MODULE_[A-Z_]+ )}; our $signature_tags = qr{(?xi: Signed-off-by:| Acked-by:| Tested-by:| Reviewed-by:| Reported-by:| To:| Cc: )}; our @typeList = ( qr{void}, qr{(?:unsigned\s+)?char}, qr{(?:unsigned\s+)?short}, qr{(?:unsigned\s+)?int}, qr{(?:unsigned\s+)?long}, qr{(?:unsigned\s+)?long\s+int}, qr{(?:unsigned\s+)?long\s+long}, qr{(?:unsigned\s+)?long\s+long\s+int}, qr{unsigned}, qr{float}, qr{double}, qr{bool}, qr{struct\s+$Ident}, qr{union\s+$Ident}, qr{enum\s+$Ident}, qr{${Ident}_t}, qr{${Ident}_handler}, qr{${Ident}_handler_fn}, ); our @modifierList = ( qr{fastcall}, ); our $allowed_asm_includes = qr{(?x: irq| memory )}; # memory.h: ARM has a custom one sub build_types { my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)"; $Modifier = qr{(?:$Attribute|$Sparse|$mods)}; $NonptrType = qr{ (?:$Modifier\s+|const\s+)* (?: (?:typeof|__typeof__)\s*\([^\)]*\)| (?:$typeTypedefs\b)| (?:${all}\b) ) (?:\s+$Modifier|\s+const)* }x; $Type = qr{ $NonptrType (?:[\s\*]+\s*const|[\s\*]+|(?:\s*\[\s*\])+)? 
(?:\s+$Inline|\s+$Modifier)*
		  }x;

	$Declare	= qr{(?:$Storage\s+)?$Type};
}
build_types();

our $match_balanced_parentheses = qr/(\((?:[^\(\)]+|(?-1))*\))/;

our $Typecast	= qr{\s*(\(\s*$NonptrType\s*\)){0,1}\s*};
our $LvalOrFunc	= qr{($Lval)\s*($match_balanced_parentheses{0,1})\s*};
our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant)};

sub deparenthesize {
	my ($string) = @_;
	return "" if (!defined($string));
	$string =~ s@^\s*\(\s*@@g;
	$string =~ s@\s*\)\s*$@@g;
	$string =~ s@\s+@ @g;
	return $string;
}

$chk_signoff = 0 if ($file);

my @rawlines = ();
my @lines = ();
my $vname;
for my $filename (@ARGV) {
	my $FILE;
	if ($file) {
		open($FILE, '-|', "diff -u /dev/null $filename") ||
			die "$P: $filename: diff failed - $!\n";
	} elsif ($filename eq '-') {
		open($FILE, '<&STDIN');
	} else {
		open($FILE, '<', "$filename") ||
			die "$P: $filename: open failed - $!\n";
	}
	if ($filename eq '-') {
		$vname = 'Your patch';
	} else {
		$vname = $filename;
	}
	while (<$FILE>) {
		chomp;
		push(@rawlines, $_);
	}
	close($FILE);
	if (!process($filename)) {
		$exit = 1;
	}
	@rawlines = ();
	@lines = ();
}

exit($exit);

sub top_of_kernel_tree {
	my ($root) = @_;

	my @tree_check = (
		"COPYING", "CREDITS", "Kbuild", "MAINTAINERS", "Makefile",
		"README", "Documentation", "arch", "include", "drivers",
		"fs", "init", "ipc", "kernel", "lib", "scripts",
	);

	foreach my $check (@tree_check) {
		if (! -e $root . '/' . $check) {
			return 0;
		}
	}
	return 1;
}

sub parse_email {
	my ($formatted_email) = @_;

	my $name = "";
	my $address = "";
	my $comment = "";

	if ($formatted_email =~ /^(.*)<(\S+\@\S+)>(.*)$/) {
		$name = $1;
		$address = $2;
		$comment = $3 if defined $3;
	} elsif ($formatted_email =~ /^\s*<(\S+\@\S+)>(.*)$/) {
		$address = $1;
		$comment = $2 if defined $2;
	} elsif ($formatted_email =~ /(\S+\@\S+)(.*)$/) {
		$address = $1;
		$comment = $2 if defined $2;
		$formatted_email =~ s/$address.*$//;
		$name = $formatted_email;
		$name =~ s/^\s+|\s+$//g;
		$name =~ s/^\"|\"$//g;
		# If there's a name left after stripping spaces and
		# leading quotes, and the address doesn't have both
		# leading and trailing angle brackets, the address
		# is invalid. ie:
		#   "joe smith joe@smith.com" bad
		#   "joe smith <joe@smith.com" bad
		if ($name ne "" && $address !~ /^<[^>]+>$/) {
			$name = "";
			$address = "";
			$comment = "";
		}
	}

	$name =~ s/^\s+|\s+$//g;
	$name =~ s/^\"|\"$//g;
	$address =~ s/^\s+|\s+$//g;
	$address =~ s/^\<|\>$//g;

	if ($name =~ /[^\w \-]/i) { ##has "must quote" chars
		$name =~ s/(?<!\\)"/\\"/g;
		$name = "\"$name\"";
	}

	return ($name, $address, $comment);
}

sub format_email {
	my ($name, $address) = @_;

	my $formatted_email;

	$name =~ s/^\s+|\s+$//g;
	$name =~ s/^\"|\"$//g;
	$address =~ s/^\s+|\s+$//g;

	if ($name =~ /[^\w \-]/i) { ##has "must quote" chars
		$name =~ s/(?<!\\)"/\\"/g;
		$name = "\"$name\"";
	}

	if ("$name" eq "") {
		$formatted_email = "$address";
	} else {
		$formatted_email = "$name <$address>";
	}

	return $formatted_email;
}

sub which_conf {
	my ($conf) = @_;

	foreach my $path (split(/:/, ".:$ENV{HOME}:.scripts")) {
		if (-e "$path/$conf") {
			return "$path/$conf";
		}
	}

	return "";
}

sub expand_tabs {
	my ($str) = @_;

	my $res = '';
	my $n = 0;
	for my $c (split(//, $str)) {
		if ($c eq "\t") {
			$res .= ' ';
			$n++;
			for (; ($n % 8) != 0; $n++) {
				$res .= ' ';
			}
			next;
		}
		$res .= $c;
		$n++;
	}

	return $res;
}

sub copy_spacing {
	(my $res = shift) =~ tr/\t/ /c;
	return $res;
}

sub line_stats {
	my ($line) = @_;

	# Drop the diff line leader and expand tabs
	$line =~ s/^.//;
	$line = expand_tabs($line);

	# Pick the indent from the front of the line.
	my ($white) = ($line =~ /^(\s*)/);

	return (length($line), length($white));
}

my $sanitise_quote = '';

sub sanitise_line_reset {
	my ($in_comment) = @_;

	if ($in_comment) {
		$sanitise_quote = '*/';
	} else {
		$sanitise_quote = '';
	}
}

sub sanitise_line {
	my ($line) = @_;

	my $res = '';
	my $l = '';

	my $qlen = 0;
	my $off = 0;
	my $c;

	# Always copy over the diff marker.
	$res = substr($line, 0, 1);

	for ($off = 1; $off < length($line); $off++) {
		$c = substr($line, $off, 1);

		# Comments we are whacking completely including the begin
		# and end, all to $;.
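		# Illustrative sketch of the masking (example only, not from
		# the original script): a raw patch line such as
		#     +	foo(); /* set up */ bar("a b");
		# comes out of this loop roughly as
		#     +	foo(); $;$;$;$;$;$;$;$;$;$;$; bar("XXX");
		# so the later per-line regex checks can never match text
		# that lives inside comments or string literals.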
if ($sanitise_quote eq '' && substr($line, $off, 2) eq '/*') { $sanitise_quote = '*/'; substr($res, $off, 2, "$;$;"); $off++; next; } if ($sanitise_quote eq '*/' && substr($line, $off, 2) eq '*/') { $sanitise_quote = ''; substr($res, $off, 2, "$;$;"); $off++; next; } if ($sanitise_quote eq '' && substr($line, $off, 2) eq '//') { $sanitise_quote = '//'; substr($res, $off, 2, $sanitise_quote); $off++; next; } # A \ in a string means ignore the next character. if (($sanitise_quote eq "'" || $sanitise_quote eq '"') && $c eq "\\") { substr($res, $off, 2, 'XX'); $off++; next; } # Regular quotes. if ($c eq "'" || $c eq '"') { if ($sanitise_quote eq '') { $sanitise_quote = $c; substr($res, $off, 1, $c); next; } elsif ($sanitise_quote eq $c) { $sanitise_quote = ''; } } #print "c<$c> SQ<$sanitise_quote>\n"; if ($off != 0 && $sanitise_quote eq '*/' && $c ne "\t") { substr($res, $off, 1, $;); } elsif ($off != 0 && $sanitise_quote eq '//' && $c ne "\t") { substr($res, $off, 1, $;); } elsif ($off != 0 && $sanitise_quote && $c ne "\t") { substr($res, $off, 1, 'X'); } else { substr($res, $off, 1, $c); } } if ($sanitise_quote eq '//') { $sanitise_quote = ''; } # The pathname on a #include may be surrounded by '<' and '>'. if ($res =~ /^.\s*\#\s*include\s+\<(.*)\>/) { my $clean = 'X' x length($1); $res =~ s@\<.*\>@<$clean>@; # The whole of a #error is a string. } elsif ($res =~ /^.\s*\#\s*(?:error|warning)\s+(.*)\b/) { my $clean = 'X' x length($1); $res =~ s@(\#\s*(?:error|warning)\s+).*@$1$clean@; } return $res; } sub ctx_statement_block { my ($linenr, $remain, $off) = @_; my $line = $linenr - 1; my $blk = ''; my $soff = $off; my $coff = $off - 1; my $coff_set = 0; my $loff = 0; my $type = ''; my $level = 0; my @stack = (); my $p; my $c; my $len = 0; my $remainder; while (1) { @stack = (['', 0]) if ($#stack == -1); #warn "CSB: blk<$blk> remain<$remain>\n"; # If we are about to drop off the end, pull in more # context. if ($off >= $len) { for (; $remain > 0; $line++) { last if (!defined $lines[$line]); next if ($lines[$line] =~ /^-/); $remain--; $loff = $len; $blk .= $lines[$line] . "\n"; $len = length($blk); $line++; last; } # Bail if there is no further context. #warn "CSB: blk<$blk> off<$off> len<$len>\n"; if ($off >= $len) { last; } if ($level == 0 && substr($blk, $off) =~ /^.\s*#\s*define/) { $level++; $type = '#'; } } $p = $c; $c = substr($blk, $off, 1); $remainder = substr($blk, $off); #warn "CSB: c<$c> type<$type> level<$level> remainder<$remainder> coff_set<$coff_set>\n"; # Handle nested #if/#else. if ($remainder =~ /^#\s*(?:ifndef|ifdef|if)\s/) { push(@stack, [ $type, $level ]); } elsif ($remainder =~ /^#\s*(?:else|elif)\b/) { ($type, $level) = @{$stack[$#stack - 1]}; } elsif ($remainder =~ /^#\s*endif\b/) { ($type, $level) = @{pop(@stack)}; } # Statement ends at the ';' or a close '}' at the # outermost level. if ($level == 0 && $c eq ';') { last; } # An else is really a conditional as long as its not else if if ($level == 0 && $coff_set == 0 && (!defined($p) || $p =~ /(?:\s|\}|\+)/) && $remainder =~ /^(else)(?:\s|{)/ && $remainder !~ /^else\s+if\b/) { $coff = $off + length($1) - 1; $coff_set = 1; #warn "CSB: mark coff<$coff> soff<$soff> 1<$1>\n"; #warn "[" . substr($blk, $soff, $coff - $soff + 1) . "]\n"; } if (($type eq '' || $type eq '(') && $c eq '(') { $level++; $type = '('; } if ($type eq '(' && $c eq ')') { $level--; $type = ($level != 0)? 
'(' : ''; if ($level == 0 && $coff < $soff) { $coff = $off; $coff_set = 1; #warn "CSB: mark coff<$coff>\n"; } } if (($type eq '' || $type eq '{') && $c eq '{') { $level++; $type = '{'; } if ($type eq '{' && $c eq '}') { $level--; $type = ($level != 0)? '{' : ''; if ($level == 0) { if (substr($blk, $off + 1, 1) eq ';') { $off++; } last; } } # Preprocessor commands end at the newline unless escaped. if ($type eq '#' && $c eq "\n" && $p ne "\\") { $level--; $type = ''; $off++; last; } $off++; } # We are truly at the end, so shuffle to the next line. if ($off == $len) { $loff = $len + 1; $line++; $remain--; } my $statement = substr($blk, $soff, $off - $soff + 1); my $condition = substr($blk, $soff, $coff - $soff + 1); #warn "STATEMENT<$statement>\n"; #warn "CONDITION<$condition>\n"; #print "coff<$coff> soff<$off> loff<$loff>\n"; return ($statement, $condition, $line, $remain + 1, $off - $loff + 1, $level); } sub statement_lines { my ($stmt) = @_; # Strip the diff line prefixes and rip blank lines at start and end. $stmt =~ s/(^|\n)./$1/g; $stmt =~ s/^\s*//; $stmt =~ s/\s*$//; my @stmt_lines = ($stmt =~ /\n/g); return $#stmt_lines + 2; } sub statement_rawlines { my ($stmt) = @_; my @stmt_lines = ($stmt =~ /\n/g); return $#stmt_lines + 2; } sub statement_block_size { my ($stmt) = @_; $stmt =~ s/(^|\n)./$1/g; $stmt =~ s/^\s*{//; $stmt =~ s/}\s*$//; $stmt =~ s/^\s*//; $stmt =~ s/\s*$//; my @stmt_lines = ($stmt =~ /\n/g); my @stmt_statements = ($stmt =~ /;/g); my $stmt_lines = $#stmt_lines + 2; my $stmt_statements = $#stmt_statements + 1; if ($stmt_lines > $stmt_statements) { return $stmt_lines; } else { return $stmt_statements; } } sub ctx_statement_full { my ($linenr, $remain, $off) = @_; my ($statement, $condition, $level); my (@chunks); # Grab the first conditional/block pair. ($statement, $condition, $linenr, $remain, $off, $level) = ctx_statement_block($linenr, $remain, $off); #print "F: c<$condition> s<$statement> remain<$remain>\n"; push(@chunks, [ $condition, $statement ]); if (!($remain > 0 && $condition =~ /^\s*(?:\n[+-])?\s*(?:if|else|do)\b/s)) { return ($level, $linenr, @chunks); } # Pull in the following conditional/block pairs and see if they # could continue the statement. for (;;) { ($statement, $condition, $linenr, $remain, $off, $level) = ctx_statement_block($linenr, $remain, $off); #print "C: c<$condition> s<$statement> remain<$remain>\n"; last if (!($remain > 0 && $condition =~ /^(?:\s*\n[+-])*\s*(?:else|do)\b/s)); #print "C: push\n"; push(@chunks, [ $condition, $statement ]); } return ($level, $linenr, @chunks); } sub ctx_block_get { my ($linenr, $remain, $outer, $open, $close, $off) = @_; my $line; my $start = $linenr - 1; my $blk = ''; my @o; my @c; my @res = (); my $level = 0; my @stack = ($level); for ($line = $start; $remain > 0; $line++) { next if ($rawlines[$line] =~ /^-/); $remain--; $blk .= $rawlines[$line]; # Handle nested #if/#else. 
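		# Sketch of the intent of the preprocessor bookkeeping below
		# (illustration, not from the original source): for input like
		#     #ifdef FOO
		#             if (x) {
		#     #else
		#             if (y) {
		#     #endif
		# the brace level saved at the #if is reused for the #else arm
		# and dropped at the #endif, so the '{' of each alternative
		# arm is not counted twice when balancing '{' against '}'.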
if ($lines[$line] =~ /^.\s*#\s*(?:ifndef|ifdef|if)\s/) { push(@stack, $level); } elsif ($lines[$line] =~ /^.\s*#\s*(?:else|elif)\b/) { $level = $stack[$#stack - 1]; } elsif ($lines[$line] =~ /^.\s*#\s*endif\b/) { $level = pop(@stack); } foreach my $c (split(//, $lines[$line])) { ##print "C<$c>L<$level><$open$close>O<$off>\n"; if ($off > 0) { $off--; next; } if ($c eq $close && $level > 0) { $level--; last if ($level == 0); } elsif ($c eq $open) { $level++; } } if (!$outer || $level <= 1) { push(@res, $rawlines[$line]); } last if ($level == 0); } return ($level, @res); } sub ctx_block_outer { my ($linenr, $remain) = @_; my ($level, @r) = ctx_block_get($linenr, $remain, 1, '{', '}', 0); return @r; } sub ctx_block { my ($linenr, $remain) = @_; my ($level, @r) = ctx_block_get($linenr, $remain, 0, '{', '}', 0); return @r; } sub ctx_statement { my ($linenr, $remain, $off) = @_; my ($level, @r) = ctx_block_get($linenr, $remain, 0, '(', ')', $off); return @r; } sub ctx_block_level { my ($linenr, $remain) = @_; return ctx_block_get($linenr, $remain, 0, '{', '}', 0); } sub ctx_statement_level { my ($linenr, $remain, $off) = @_; return ctx_block_get($linenr, $remain, 0, '(', ')', $off); } sub ctx_locate_comment { my ($first_line, $end_line) = @_; # Catch a comment on the end of the line itself. my ($current_comment) = ($rawlines[$end_line - 1] =~ m@.*(/\*.*\*/)\s*(?:\\\s*)?$@); return $current_comment if (defined $current_comment); # Look through the context and try and figure out if there is a # comment. my $in_comment = 0; $current_comment = ''; for (my $linenr = $first_line; $linenr < $end_line; $linenr++) { my $line = $rawlines[$linenr - 1]; #warn " $line\n"; if ($linenr == $first_line and $line =~ m@^.\s*\*@) { $in_comment = 1; } if ($line =~ m@/\*@) { $in_comment = 1; } if (!$in_comment && $current_comment ne '') { $current_comment = ''; } $current_comment .= $line . "\n" if ($in_comment); if ($line =~ m@\*/@) { $in_comment = 0; } } chomp($current_comment); return($current_comment); } sub ctx_has_comment { my ($first_line, $end_line) = @_; my $cmt = ctx_locate_comment($first_line, $end_line); ##print "LINE: $rawlines[$end_line - 1 ]\n"; ##print "CMMT: $cmt\n"; return ($cmt ne ''); } sub raw_line { my ($linenr, $cnt) = @_; my $offset = $linenr - 1; $cnt++; my $line; while ($cnt) { $line = $rawlines[$offset++]; next if (defined($line) && $line =~ /^-/); $cnt--; } return $line; } sub cat_vet { my ($vet) = @_; my ($res, $coded); $res = ''; while ($vet =~ /([^[:cntrl:]]*)([[:cntrl:]]|$)/g) { $res .= $1; if ($2 ne '') { $coded = sprintf("^%c", unpack('C', $2) + 64); $res .= $coded; } } $res =~ s/$/\$/; return $res; } my $av_preprocessor = 0; my $av_pending; my @av_paren_type; my $av_pend_colon; sub annotate_reset { $av_preprocessor = 0; $av_pending = '_'; @av_paren_type = ('E'); $av_pend_colon = 'O'; } sub annotate_values { my ($stream, $type) = @_; my $res; my $var = '_' x length($stream); my $cur = $stream; print "$stream\n" if ($dbg_values > 1); while (length($cur)) { @av_paren_type = ('E') if ($#av_paren_type < 0); print " <" . join('', @av_paren_type) . 
"> <$type> <$av_pending>" if ($dbg_values > 1); if ($cur =~ /^(\s+)/o) { print "WS($1)\n" if ($dbg_values > 1); if ($1 =~ /\n/ && $av_preprocessor) { $type = pop(@av_paren_type); $av_preprocessor = 0; } } elsif ($cur =~ /^(\(\s*$Type\s*)\)/ && $av_pending eq '_') { print "CAST($1)\n" if ($dbg_values > 1); push(@av_paren_type, $type); $type = 'c'; } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\(|\s*$)/) { print "DECLARE($1)\n" if ($dbg_values > 1); $type = 'T'; } elsif ($cur =~ /^($Modifier)\s*/) { print "MODIFIER($1)\n" if ($dbg_values > 1); $type = 'T'; } elsif ($cur =~ /^(\#\s*define\s*$Ident)(\(?)/o) { print "DEFINE($1,$2)\n" if ($dbg_values > 1); $av_preprocessor = 1; push(@av_paren_type, $type); if ($2 ne '') { $av_pending = 'N'; } $type = 'E'; } elsif ($cur =~ /^(\#\s*(?:undef\s*$Ident|include\b))/o) { print "UNDEF($1)\n" if ($dbg_values > 1); $av_preprocessor = 1; push(@av_paren_type, $type); } elsif ($cur =~ /^(\#\s*(?:ifdef|ifndef|if))/o) { print "PRE_START($1)\n" if ($dbg_values > 1); $av_preprocessor = 1; push(@av_paren_type, $type); push(@av_paren_type, $type); $type = 'E'; } elsif ($cur =~ /^(\#\s*(?:else|elif))/o) { print "PRE_RESTART($1)\n" if ($dbg_values > 1); $av_preprocessor = 1; push(@av_paren_type, $av_paren_type[$#av_paren_type]); $type = 'E'; } elsif ($cur =~ /^(\#\s*(?:endif))/o) { print "PRE_END($1)\n" if ($dbg_values > 1); $av_preprocessor = 1; # Assume all arms of the conditional end as this # one does, and continue as if the #endif was not here. pop(@av_paren_type); push(@av_paren_type, $type); $type = 'E'; } elsif ($cur =~ /^(\\\n)/o) { print "PRECONT($1)\n" if ($dbg_values > 1); } elsif ($cur =~ /^(__attribute__)\s*\(?/o) { print "ATTR($1)\n" if ($dbg_values > 1); $av_pending = $type; $type = 'N'; } elsif ($cur =~ /^(sizeof)\s*(\()?/o) { print "SIZEOF($1)\n" if ($dbg_values > 1); if (defined $2) { $av_pending = 'V'; } $type = 'N'; } elsif ($cur =~ /^(if|while|for)\b/o) { print "COND($1)\n" if ($dbg_values > 1); $av_pending = 'E'; $type = 'N'; } elsif ($cur =~/^(case)/o) { print "CASE($1)\n" if ($dbg_values > 1); $av_pend_colon = 'C'; $type = 'N'; } elsif ($cur =~/^(return|else|goto|typeof|__typeof__)\b/o) { print "KEYWORD($1)\n" if ($dbg_values > 1); $type = 'N'; } elsif ($cur =~ /^(\()/o) { print "PAREN('$1')\n" if ($dbg_values > 1); push(@av_paren_type, $av_pending); $av_pending = '_'; $type = 'N'; } elsif ($cur =~ /^(\))/o) { my $new_type = pop(@av_paren_type); if ($new_type ne '_') { $type = $new_type; print "PAREN('$1') -> $type\n" if ($dbg_values > 1); } else { print "PAREN('$1')\n" if ($dbg_values > 1); } } elsif ($cur =~ /^($Ident)\s*\(/o) { print "FUNC($1)\n" if ($dbg_values > 1); $type = 'V'; $av_pending = 'V'; } elsif ($cur =~ /^($Ident\s*):(?:\s*\d+\s*(,|=|;))?/) { if (defined $2 && $type eq 'C' || $type eq 'T') { $av_pend_colon = 'B'; } elsif ($type eq 'E') { $av_pend_colon = 'L'; } print "IDENT_COLON($1,$type>$av_pend_colon)\n" if ($dbg_values > 1); $type = 'V'; } elsif ($cur =~ /^($Ident|$Constant)/o) { print "IDENT($1)\n" if ($dbg_values > 1); $type = 'V'; } elsif ($cur =~ /^($Assignment)/o) { print "ASSIGN($1)\n" if ($dbg_values > 1); $type = 'N'; } elsif ($cur =~/^(;|{|})/) { print "END($1)\n" if ($dbg_values > 1); $type = 'E'; $av_pend_colon = 'O'; } elsif ($cur =~/^(,)/) { print "COMMA($1)\n" if ($dbg_values > 1); $type = 'C'; } elsif ($cur =~ /^(\?)/o) { print "QUESTION($1)\n" if ($dbg_values > 1); $type = 'N'; } elsif ($cur =~ /^(:)/o) { print "COLON($1,$av_pend_colon)\n" if ($dbg_values > 1); substr($var, length($res), 1, $av_pend_colon); 
if ($av_pend_colon eq 'C' || $av_pend_colon eq 'L') { $type = 'E'; } else { $type = 'N'; } $av_pend_colon = 'O'; } elsif ($cur =~ /^(\[)/o) { print "CLOSE($1)\n" if ($dbg_values > 1); $type = 'N'; } elsif ($cur =~ /^(-(?![->])|\+(?!\+)|\*|\&\&|\&)/o) { my $variant; print "OPV($1)\n" if ($dbg_values > 1); if ($type eq 'V') { $variant = 'B'; } else { $variant = 'U'; } substr($var, length($res), 1, $variant); $type = 'N'; } elsif ($cur =~ /^($Operators)/o) { print "OP($1)\n" if ($dbg_values > 1); if ($1 ne '++' && $1 ne '--') { $type = 'N'; } } elsif ($cur =~ /(^.)/o) { print "C($1)\n" if ($dbg_values > 1); } if (defined $1) { $cur = substr($cur, length($1)); $res .= $type x length($1); } } return ($res, $var); } sub possible { my ($possible, $line) = @_; my $notPermitted = qr{(?: ^(?: $Modifier| $Storage| $Type| DEFINE_\S+ )$| ^(?: goto| return| case| else| asm|__asm__| do| \#| \#\#| )(?:\s|$)| ^(?:typedef|struct|enum)\b )}x; warn "CHECK<$possible> ($line)\n" if ($dbg_possible > 2); if ($possible !~ $notPermitted) { # Check for modifiers. $possible =~ s/\s*$Storage\s*//g; $possible =~ s/\s*$Sparse\s*//g; if ($possible =~ /^\s*$/) { } elsif ($possible =~ /\s/) { $possible =~ s/\s*$Type\s*//g; for my $modifier (split(' ', $possible)) { if ($modifier !~ $notPermitted) { warn "MODIFIER: $modifier ($possible) ($line)\n" if ($dbg_possible); push(@modifierList, $modifier); } } } else { warn "POSSIBLE: $possible ($line)\n" if ($dbg_possible); push(@typeList, $possible); } build_types(); } else { warn "NOTPOSS: $possible ($line)\n" if ($dbg_possible > 1); } } my $prefix = ''; sub show_type { return !defined $ignore_type{$_[0]}; } sub report { if (!show_type($_[1]) || (defined $tst_only && $_[2] !~ /\Q$tst_only\E/)) { return 0; } my $line; if ($show_types) { $line = "$prefix$_[0]:$_[1]: $_[2]\n"; } else { $line = "$prefix$_[0]: $_[2]\n"; } $line = (split('\n', $line))[0] . "\n" if ($terse); push(our @report, $line); return 1; } sub report_dump { our @report; } sub ERROR { if (report("ERROR", $_[0], $_[1])) { our $clean = 0; our $cnt_error++; } } sub WARN { if (report("WARNING", $_[0], $_[1])) { our $clean = 0; our $cnt_warn++; } } sub CHK { if ($check && report("CHECK", $_[0], $_[1])) { our $clean = 0; our $cnt_chk++; } } sub check_absolute_file { my ($absolute, $herecurr) = @_; my $file = $absolute; ##print "absolute<$absolute>\n"; # See if any suffix of this path is a path within the tree. while ($file =~ s@^[^/]*/@@) { if (-f "$root/$file") { ##print "file<$file>\n"; last; } } if (! -f _) { return 0; } # It is, so see if the prefix is acceptable. my $prefix = $absolute; substr($prefix, -length($file)) = ''; ##print "prefix<$prefix>\n"; if ($prefix ne ".../") { WARN("USE_RELATIVE_PATH", "use relative pathname instead of absolute in changelog text\n" . $herecurr); } } sub process { my $filename = shift; my $linenr=0; my $prevline=""; my $prevrawline=""; my $stashline=""; my $stashrawline=""; my $length; my $indent; my $previndent=0; my $stashindent=0; our $clean = 1; my $signoff = 0; my $is_patch = 0; my $in_header_lines = 1; my $in_commit_log = 0; #Scanning lines before patch our @report = (); our $cnt_lines = 0; our $cnt_error = 0; our $cnt_warn = 0; our $cnt_chk = 0; # Trace the real file/line as we go. 
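	# Example of the bookkeeping that follows (illustration only):
	# for a hunk header of
	#     @@ -10,6 +12,8 @@
	# $realline is primed to 11 and bumped once per context/added line,
	# so a report prefixed "FILE: foo.c:14:" names the line as it will
	# exist after the patch is applied (foo.c is a made-up name here).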
my $realfile = ''; my $realline = 0; my $realcnt = 0; my $here = ''; my $in_comment = 0; my $comment_edge = 0; my $first_line = 0; my $p1_prefix = ''; my $prev_values = 'E'; # suppression flags my %suppress_ifbraces; my %suppress_whiletrailers; my %suppress_export; my $suppress_statement = 0; # Pre-scan the patch sanitizing the lines. # Pre-scan the patch looking for any __setup documentation. # my @setup_docs = (); my $setup_docs = 0; sanitise_line_reset(); my $line; foreach my $rawline (@rawlines) { $linenr++; $line = $rawline; if ($rawline=~/^\+\+\+\s+(\S+)/) { $setup_docs = 0; if ($1 =~ m@Documentation/kernel-parameters.txt$@) { $setup_docs = 1; } #next; } if ($rawline=~/^\@\@ -\d+(?:,\d+)? \+(\d+)(,(\d+))? \@\@/) { $realline=$1-1; if (defined $2) { $realcnt=$3+1; } else { $realcnt=1+1; } $in_comment = 0; # Guestimate if this is a continuing comment. Run # the context looking for a comment "edge". If this # edge is a close comment then we must be in a comment # at context start. my $edge; my $cnt = $realcnt; for (my $ln = $linenr + 1; $cnt > 0; $ln++) { next if (defined $rawlines[$ln - 1] && $rawlines[$ln - 1] =~ /^-/); $cnt--; #print "RAW<$rawlines[$ln - 1]>\n"; last if (!defined $rawlines[$ln - 1]); if ($rawlines[$ln - 1] =~ m@(/\*|\*/)@ && $rawlines[$ln - 1] !~ m@"[^"]*(?:/\*|\*/)[^"]*"@) { ($edge) = $1; last; } } if (defined $edge && $edge eq '*/') { $in_comment = 1; } # Guestimate if this is a continuing comment. If this # is the start of a diff block and this line starts # ' *' then it is very likely a comment. if (!defined $edge && $rawlines[$linenr] =~ m@^.\s*(?:\*\*+| \*)(?:\s|$)@) { $in_comment = 1; } ##print "COMMENT:$in_comment edge<$edge> $rawline\n"; sanitise_line_reset($in_comment); } elsif ($realcnt && $rawline =~ /^(?:\+| |$)/) { # Standardise the strings and chars within the input to # simplify matching -- only bother with positive lines. $line = sanitise_line($rawline); } push(@lines, $line); if ($realcnt > 1) { $realcnt-- if ($line =~ /^(?:\+| |$)/); } else { $realcnt = 0; } #print "==>$rawline\n"; #print "-->$line\n"; if ($setup_docs && $line =~ /^\+/) { push(@setup_docs, $line); } } $prefix = ''; $realcnt = 0; $linenr = 0; foreach my $line (@lines) { $linenr++; my $rawline = $rawlines[$linenr - 1]; #extract the line range in the file after the patch is applied if ($line=~/^\@\@ -\d+(?:,\d+)? \+(\d+)(,(\d+))? \@\@/) { $is_patch = 1; $first_line = $linenr + 1; $realline=$1-1; if (defined $2) { $realcnt=$3+1; } else { $realcnt=1+1; } annotate_reset(); $prev_values = 'E'; %suppress_ifbraces = (); %suppress_whiletrailers = (); %suppress_export = (); $suppress_statement = 0; next; # track the line number as we move through the hunk, note that # new versions of GNU diff omit the leading space on completely # blank context lines so we need to count that too. } elsif ($line =~ /^( |\+|$)/) { $realline++; $realcnt-- if ($realcnt != 0); # Measure the line length and indent. ($length, $indent) = line_stats($rawline); # Track the previous line. 
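		# The swaps below form a one-line delay buffer (illustration):
		# the line stashed on iteration N becomes $prevline and
		# $previndent on iteration N+1, which is what the two-line
		# checks such as ELSE_AFTER_BRACE and WHILE_AFTER_BRACE
		# compare against further down.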
		($prevline, $stashline) = ($stashline, $line);
		($previndent, $stashindent) = ($stashindent, $indent);
		($prevrawline, $stashrawline) = ($stashrawline, $rawline);

		#warn "line<$line>\n";

	} elsif ($realcnt == 1) {
		$realcnt--;
	}

	my $hunk_line = ($realcnt != 0);

	#make up the handle for any error we report on this line
	$prefix = "$filename:$realline: " if ($emacs && $file);
	$prefix = "$filename:$linenr: " if ($emacs && !$file);

	$here = "#$linenr: " if (!$file);
	$here = "#$realline: " if ($file);

	# extract the filename as it passes
	if ($line =~ /^diff --git.*?(\S+)$/) {
		$realfile = $1;
		$realfile =~ s@^([^/]*)/@@;
		$in_commit_log = 0;
	} elsif ($line =~ /^\+\+\+\s+(\S+)/) {
		$realfile = $1;
		$realfile =~ s@^([^/]*)/@@;
		$in_commit_log = 0;

		$p1_prefix = $1;
		if (!$file && $tree && $p1_prefix ne '' &&
		    -e "$root/$p1_prefix") {
			WARN("PATCH_PREFIX",
			     "patch prefix '$p1_prefix' exists, appears to be a -p0 patch\n");
		}

		if ($realfile =~ m@^include/asm/@) {
			ERROR("MODIFIED_INCLUDE_ASM",
			      "do not modify files in include/asm, change architecture specific files in include/asm-\n" . "$here$rawline\n");
		}
		next;
	}

	$here .= "FILE: $realfile:$realline:" if ($realcnt != 0);

	my $hereline = "$here\n$rawline\n";
	my $herecurr = "$here\n$rawline\n";
	my $hereprev = "$here\n$prevrawline\n$rawline\n";

	$cnt_lines++ if ($realcnt != 0);

	# Check for incorrect file permissions
	if ($line =~ /^new (file )?mode.*[7531]\d{0,2}$/) {
		my $permhere = $here . "FILE: $realfile\n";
		if ($realfile =~ /(Makefile|Kconfig|\.c|\.h|\.S|\.tmpl)$/) {
			ERROR("EXECUTE_PERMISSIONS",
			      "do not set execute permissions for source files\n" . $permhere);
		}
	}

	# Check the patch for a signoff:
	if ($line =~ /^\s*signed-off-by:/i) {
		$signoff++;
		$in_commit_log = 0;
	}

	# Check signature styles
	if (!$in_header_lines &&
	    $line =~ /^(\s*)($signature_tags)(\s*)(.*)/) {
		my $space_before = $1;
		my $sign_off = $2;
		my $space_after = $3;
		my $email = $4;
		my $ucfirst_sign_off = ucfirst(lc($sign_off));

		if (defined $space_before && $space_before ne "") {
			WARN("BAD_SIGN_OFF",
			     "Do not use whitespace before $ucfirst_sign_off\n" . $herecurr);
		}
		if ($sign_off =~ /-by:$/i && $sign_off ne $ucfirst_sign_off) {
			WARN("BAD_SIGN_OFF",
			     "'$ucfirst_sign_off' is the preferred signature form\n" . $herecurr);
		}
		if (!defined $space_after || $space_after ne " ") {
			WARN("BAD_SIGN_OFF",
			     "Use a single space after $ucfirst_sign_off\n" . $herecurr);
		}

		my ($email_name, $email_address, $comment) = parse_email($email);
		my $suggested_email = format_email(($email_name, $email_address));
		if ($suggested_email eq "") {
			ERROR("BAD_SIGN_OFF",
			      "Unrecognized email address: '$email'\n" . $herecurr);
		} else {
			my $dequoted = $suggested_email;
			$dequoted =~ s/^"//;
			$dequoted =~ s/" </ </;
			# Don't force email to have quotes
			# Allow just an angle bracketed address
			if ("$dequoted$comment" ne $email &&
			    "<$email_address>$comment" ne $email &&
			    "$suggested_email$comment" ne $email) {
				WARN("BAD_SIGN_OFF",
				     "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr);
			}
		}
	}

	# Check for wrappage within a valid hunk of the file
	if ($realcnt != 0 && $line !~ m{^(?:\+|-| |\\ No newline|$)}) {
		ERROR("CORRUPTED_PATCH",
		      "patch seems to be corrupt (line wrapped?)\n" . $herecurr) if (!$emitted_corrupt++);
	}

	# Check for absolute kernel paths.
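	# Sketch (made-up path for illustration): a changelog line that
	# mentions /usr/src/linux/fs/inode.c has the suffix fs/inode.c
	# present under $root, so check_absolute_file() below warns and
	# suggests the relative spelling instead.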
if ($tree) { while ($line =~ m{(?:^|\s)(/\S*)}g) { my $file = $1; if ($file =~ m{^(.*?)(?::\d+)+:?$} && check_absolute_file($1, $herecurr)) { # } else { check_absolute_file($file, $herecurr); } } } # UTF-8 regex found at http://www.w3.org/International/questions/qa-forms-utf-8.en.php if (($realfile =~ /^$/ || $line =~ /^\+/) && $rawline !~ m/^$UTF8*$/) { my ($utf8_prefix) = ($rawline =~ /^($UTF8*)/); my $blank = copy_spacing($rawline); my $ptr = substr($blank, 0, length($utf8_prefix)) . "^"; my $hereptr = "$hereline$ptr\n"; CHK("INVALID_UTF8", "Invalid UTF-8, patch and commit message should be encoded in UTF-8\n" . $hereptr); } # Check if it's the start of a commit log # (not a header line and we haven't seen the patch filename) if ($in_header_lines && $realfile =~ /^$/ && $rawline !~ /^(commit\b|from\b|[\w-]+:).+$/i) { $in_header_lines = 0; $in_commit_log = 1; } # Still not yet in a patch, check for any UTF-8 if ($in_commit_log && $realfile =~ /^$/ && $rawline =~ /$NON_ASCII_UTF8/) { CHK("UTF8_BEFORE_PATCH", "8-bit UTF-8 used in possible commit log\n" . $herecurr); } # ignore non-hunk lines and lines being removed next if (!$hunk_line || $line =~ /^-/); #trailing whitespace if ($line =~ /^\+.*\015/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; ERROR("DOS_LINE_ENDINGS", "DOS line endings\n" . $herevet); } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; ERROR("TRAILING_WHITESPACE", "trailing whitespace\n" . $herevet); $rpt_cleaners = 1; } # check we are in a valid source file if not then ignore this hunk next if ($realfile !~ /\.(h|c|s|S|pl|sh)$/); #80 column limit if ($line =~ /^\+/ && $prevrawline !~ /\/\*\*/ && $rawline !~ /^.\s*\*\s*\@$Ident\s/ && !($line =~ /^\+\s*$logFunctions\s*\(\s*(?:(KERN_\S+\s*|[^"]*))?"[X\t]*"\s*(?:|,|\)\s*;)\s*$/ || $line =~ /^\+\s*"[^"]*"\s*(?:\s*|,|\)\s*;)\s*$/) && $length > 80) { WARN("LONG_LINE", "line over 80 characters\n" . $herecurr); } # check for spaces before a quoted newline if ($rawline =~ /^.*\".*\s\\n/) { WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE", "unnecessary whitespace before a quoted newline\n" . $herecurr); } # check for adding lines without a newline. if ($line =~ /^\+/ && defined $lines[$linenr] && $lines[$linenr] =~ /^\\ No newline at end of file/) { WARN("MISSING_EOF_NEWLINE", "adding a line without newline at end of file\n" . $herecurr); } # check we are in a valid source file C or perl if not then ignore this hunk next if ($realfile !~ /\.(h|c|pl)$/); # at the beginning of a line any tabs must come first and anything # more than 8 must use tabs. if ($rawline =~ /^\+\s* \t\s*\S/ || $rawline =~ /^\+\s* \s*/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; ERROR("CODE_INDENT", "code indent should use tabs where possible\n" . $herevet); $rpt_cleaners = 1; } # check for space before tabs. if ($rawline =~ /^\+/ && $rawline =~ / \t/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; WARN("SPACE_BEFORE_TAB", "please, no space before tabs\n" . $herevet); } # check for block comment. # # A: # /* foo # * bar */ # B: # /* # * foo # * bar */ # C: # /* # * one-liner # */ # D: # /* one-liner # */ # E: # /* foo # * bar # * baz # */ # above is not preferred # # /* # * This block comments style # * is preferred # */ if ($line =~ /^\+/ && $rawline =~ /\*\/$/ && $rawline !~ /\/\*/) { if ($rawline !~ /^\+\s*\*\/$/) { # case A and B WARN("BLOCK_COMMENT_STYLE", "[BCS] put the trailing */ on a separate line\n" . 
$hereprev); } elsif ($prevrawline =~ /^\+\s*\/\*/ || $rawlines[$linenr - 3] =~ /^\+\s*\/\*/) { # case C and D WARN("BLOCK_COMMENT_STYLE", "[BCS] don't use block comments for one liner comment\n" . $hereprev); } else { # case E my $ln = $linenr - 1; while ($rawlines[$ln] =~ /^\+/ && $rawlines[$ln] !~ /^\+\s*\/\*/ && $ln >= 0) { $ln--; } if ($rawlines[$ln] =~ /^\+\s*\/\*./) { WARN("BLOCK_COMMENT_STYLE", "[BCS] don't comment at first line in block comments\n" . $hereprev); } } } # check for spaces at the beginning of a line. # Exceptions: # 1) within comments # 2) indented preprocessor commands # 3) hanging labels if ($rawline =~ /^\+ / && $line !~ /\+ *(?:$;|#|$Ident:)/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; WARN("LEADING_SPACE", "please, no spaces at the start of a line\n" . $herevet); } # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); # check for RCS/CVS revision markers if ($rawline =~ /^\+.*\$(Revision|Log|Id)(?:\$|)/) { WARN("CVS_KEYWORD", "CVS style keyword markers, these will _not_ be updated\n". $herecurr); } # Check for potential 'bare' types my ($stat, $cond, $line_nr_next, $remain_next, $off_next, $realline_next); #print "LINE<$line>\n"; if ($linenr >= $suppress_statement && $realcnt && $line =~ /.\s*\S/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0); $stat =~ s/\n./\n /g; $cond =~ s/\n./\n /g; #print "linenr<$linenr> <$stat>\n"; # If this statement has no statement boundaries within # it there is no point in retrying a statement scan # until we hit end of it. my $frag = $stat; $frag =~ s/;+\s*$//; if ($frag !~ /(?:{|;)/) { #print "skip<$line_nr_next>\n"; $suppress_statement = $line_nr_next; } # Find the real next line. $realline_next = $line_nr_next; if (defined $realline_next && (!defined $lines[$realline_next - 1] || substr($lines[$realline_next - 1], $off_next) =~ /^\s*$/)) { $realline_next++; } my $s = $stat; $s =~ s/{.*$//s; # Ignore goto labels. if ($s =~ /$Ident:\*$/s) { # Ignore functions being called } elsif ($s =~ /^.\s*$Ident\s*\(/s) { } elsif ($s =~ /^.\s*else\b/s) { # declarations always start with types } elsif ($prev_values eq 'E' && $s =~ /^.\s*(?:$Storage\s+)?(?:$Inline\s+)?(?:const\s+)?((?:\s*$Ident)+?)\b(?:\s+$Sparse)?\s*\**\s*(?:$Ident|\(\*[^\)]*\))(?:\s*$Modifier)?\s*(?:;|=|,|\()/s) { my $type = $1; $type =~ s/\s+/ /g; possible($type, "A:" . $s); # definitions in global scope can only start with types } elsif ($s =~ /^.(?:$Storage\s+)?(?:$Inline\s+)?(?:const\s+)?($Ident)\b\s*(?!:)/s) { possible($1, "B:" . $s); } # any (foo ... *) is a pointer cast, and foo is a type while ($s =~ /\(($Ident)(?:\s+$Sparse)*[\s\*]+\s*\)/sg) { possible($1, "C:" . $s); } # Check for any sort of function declaration. # int foo(something bar, other baz); # void (*store_gdt)(x86_descr_ptr *); if ($prev_values eq 'E' && $s =~ /^(.(?:typedef\s*)?(?:(?:$Storage|$Inline)\s*)*\s*$Type\s*(?:\b$Ident|\(\*\s*$Ident\))\s*)\(/s) { my ($name_len) = length($1); my $ctx = $s; substr($ctx, 0, $name_len + 1, ''); $ctx =~ s/\)[^\)]*$//; for my $arg (split(/\s*,\s*/, $ctx)) { if ($arg =~ /^(?:const\s+)?($Ident)(?:\s+$Sparse)*\s*\**\s*(:?\b$Ident)?$/s || $arg =~ /^($Ident)$/s) { possible($1, "D:" . $s); } } } } # # Checks which may be anchored in the context. # # Check for switch () and associated case and default # statements should be at the same indent. 
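	# Illustration of the indent rule checked below (CMD_* names are
	# invented for the example):
	#     +	switch (cmd) {
	#     +	case CMD_READ:		/* ok: same indent as the switch */
	#     +		case CMD_WRITE:	/* flagged: deeper than the switch */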
if ($line=~/\bswitch\s*\(.*\)/) { my $err = ''; my $sep = ''; my @ctx = ctx_block_outer($linenr, $realcnt); shift(@ctx); for my $ctx (@ctx) { my ($clen, $cindent) = line_stats($ctx); if ($ctx =~ /^\+\s*(case\s+|default:)/ && $indent != $cindent) { $err .= "$sep$ctx\n"; $sep = ''; } else { $sep = "[...]\n"; } } if ($err ne '') { ERROR("SWITCH_CASE_INDENT_LEVEL", "switch and case should be at the same indent\n$hereline$err"); } } # if/while/etc brace do not go on next line, unless defining a do while loop, # or if that brace on the next line is for something else if ($line =~ /(.*)\b((?:if|while|for|switch)\s*\(|do\b|else\b)/ && $line !~ /^.\s*\#/) { my $pre_ctx = "$1$2"; my ($level, @ctx) = ctx_statement_level($linenr, $realcnt, 0); if ($line =~ /^\+\t{6,}/) { WARN("DEEP_INDENTATION", "Too many leading tabs - consider code refactoring\n" . $herecurr); } my $ctx_cnt = $realcnt - $#ctx - 1; my $ctx = join("\n", @ctx); my $ctx_ln = $linenr; my $ctx_skip = $realcnt; while ($ctx_skip > $ctx_cnt || ($ctx_skip == $ctx_cnt && defined $lines[$ctx_ln - 1] && $lines[$ctx_ln - 1] =~ /^-/)) { ##print "SKIP<$ctx_skip> CNT<$ctx_cnt>\n"; $ctx_skip-- if (!defined $lines[$ctx_ln - 1] || $lines[$ctx_ln - 1] !~ /^-/); $ctx_ln++; } #print "realcnt<$realcnt> ctx_cnt<$ctx_cnt>\n"; #print "pre<$pre_ctx>\nline<$line>\nctx<$ctx>\nnext<$lines[$ctx_ln - 1]>\n"; if ($ctx !~ /{\s*/ && defined($lines[$ctx_ln -1]) && $lines[$ctx_ln - 1] =~ /^\+\s*{/) { ERROR("OPEN_BRACE", "that open brace { should be on the previous line\n" . "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); } if ($level == 0 && $pre_ctx !~ /}\s*while\s*\($/ && $ctx =~ /\)\s*\;\s*$/ && defined $lines[$ctx_ln - 1]) { my ($nlength, $nindent) = line_stats($lines[$ctx_ln - 1]); if ($nindent > $indent) { WARN("TRAILING_SEMICOLON", "trailing semicolon indicates no statements, indent implies otherwise\n" . "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); } } } # Check relative indent for conditionals and blocks. if ($line =~ /\b(?:(?:if|while|for)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0) if (!defined $stat); my ($s, $c) = ($stat, $cond); substr($s, 0, length($c), ''); # Make sure we remove the line prefixes as we have # none on the first line, and are going to readd them # where necessary. $s =~ s/\n./\n/gs; # Find out how long the conditional actually is. my @newlines = ($c =~ /\n/gs); my $cond_lines = 1 + $#newlines; # We want to check the first line inside the block # starting at the end of the conditional, so remove: # 1) any blank line termination # 2) any opening brace { on end of the line # 3) any do (...) { my $continuation = 0; my $check = 0; $s =~ s/^.*\bdo\b//; $s =~ s/^\s*{//; if ($s =~ s/^\s*\\//) { $continuation = 1; } if ($s =~ s/^\s*?\n//) { $check = 1; $cond_lines++; } # Also ignore a loop construct at the end of a # preprocessor statement. if (($prevline =~ /^.\s*#\s*define\s/ || $prevline =~ /\\\s*$/) && $continuation == 0) { $check = 0; } my $cond_ptr = -1; $continuation = 0; while ($cond_ptr != $cond_lines) { $cond_ptr = $cond_lines; # If we see an #else/#elif then the code # is not linear. if ($s =~ /^\s*\#\s*(?:else|elif)/) { $check = 0; } # Ignore: # 1) blank lines, they should be at 0, # 2) preprocessor lines, and # 3) labels. if ($continuation || $s =~ /^\s*?\n/ || $s =~ /^\s*#\s*?/ || $s =~ /^\s*$Ident\s*:/) { $continuation = ($s =~ /^.*?\\\n/) ? 
1 : 0; if ($s =~ s/^.*?\n//) { $cond_lines++; } } } my (undef, $sindent) = line_stats("+" . $s); my $stat_real = raw_line($linenr, $cond_lines); # Check if either of these lines are modified, else # this is not this patch's fault. if (!defined($stat_real) || $stat !~ /^\+/ && $stat_real !~ /^\+/) { $check = 0; } if (defined($stat_real) && $cond_lines > 1) { $stat_real = "[...]\n$stat_real"; } #print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n"; if ($check && (($sindent % 8) != 0 || ($sindent <= $indent && $s ne ''))) { WARN("SUSPECT_CODE_INDENT", "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n"); } } # Track the 'values' across context and added lines. my $opline = $line; $opline =~ s/^./ /; my ($curr_values, $curr_vars) = annotate_values($opline . "\n", $prev_values); $curr_values = $prev_values . $curr_values; if ($dbg_values) { my $outline = $opline; $outline =~ s/\t/ /g; print "$linenr > .$outline\n"; print "$linenr > $curr_values\n"; print "$linenr > $curr_vars\n"; } $prev_values = substr($curr_values, -1); #ignore lines not being added if ($line=~/^[^\+]/) {next;} # TEST: allow direct testing of the type matcher. if ($dbg_type) { if ($line =~ /^.\s*$Declare\s*$/) { ERROR("TEST_TYPE", "TEST: is type\n" . $herecurr); } elsif ($dbg_type > 1 && $line =~ /^.+($Declare)/) { ERROR("TEST_NOT_TYPE", "TEST: is not type ($1 is)\n". $herecurr); } next; } # TEST: allow direct testing of the attribute matcher. if ($dbg_attr) { if ($line =~ /^.\s*$Modifier\s*$/) { ERROR("TEST_ATTR", "TEST: is attr\n" . $herecurr); } elsif ($dbg_attr > 1 && $line =~ /^.+($Modifier)/) { ERROR("TEST_NOT_ATTR", "TEST: is not attr ($1 is)\n". $herecurr); } next; } # check for initialisation to aggregates open brace on the next line if ($line =~ /^.\s*{/ && $prevline =~ /(?:^|[^=])=\s*$/) { ERROR("OPEN_BRACE", "that open brace { should be on the previous line\n" . $hereprev); } # # Checks which are anchored on the added line. # # check for malformed paths in #include statements (uses RAW line) if ($rawline =~ m{^.\s*\#\s*include\s+[<"](.*)[">]}) { my $path = $1; if ($path =~ m{//}) { ERROR("MALFORMED_INCLUDE", "malformed #include filename\n" . $herecurr); } } # no C99 // comments if ($line =~ m{//}) { ERROR("C99_COMMENTS", "do not use C99 // comments\n" . $herecurr); } # Remove C99 comments. $line =~ s@//.*@@; $opline =~ s@//.*@@; # check for global initialisers. if ($line =~ /^.$Type\s*$Ident\s*(?:\s+$Modifier)*\s*=\s*(0|NULL|false)\s*;/) { ERROR("GLOBAL_INITIALISERS", "do not initialise globals to 0 or NULL\n" . $herecurr); } # check for static initialisers. if ($line =~ /\bstatic\s.*=\s*(0|NULL|false)\s*;/) { ERROR("INITIALISED_STATIC", "do not initialise statics to 0 or NULL\n" . $herecurr); } # check for static const char * arrays. if ($line =~ /\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) { WARN("STATIC_CONST_CHAR_ARRAY", "static const char * array should probably be static const char * const\n" . $herecurr); } # check for static char foo[] = "bar" declarations. if ($line =~ /\bstatic\s+char\s+(\w+)\s*\[\s*\]\s*=\s*"/) { WARN("STATIC_CONST_CHAR_ARRAY", "static char array declaration should probably be static const char\n" . 
$herecurr); } # check for declarations of struct pci_device_id if ($line =~ /\bstruct\s+pci_device_id\s+\w+\s*\[\s*\]\s*\=\s*\{/) { WARN("DEFINE_PCI_DEVICE_TABLE", "Use DEFINE_PCI_DEVICE_TABLE for struct pci_device_id\n" . $herecurr); } # check for new typedefs, only function parameters and sparse annotations # make sense. if ($line =~ /\btypedef\s/ && $line !~ /\btypedef\s+$Type\s*\(\s*\*?$Ident\s*\)\s*\(/ && $line !~ /\btypedef\s+$Type\s+$Ident\s*\(/ && $line !~ /\b$typeTypedefs\b/ && $line !~ /\b__bitwise(?:__|)\b/) { WARN("NEW_TYPEDEFS", "do not add new typedefs\n" . $herecurr); } # * goes on variable not on type # (char*[ const]) while ($line =~ m{(\($NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)\))}g) { #print "AA<$1>\n"; my ($from, $to) = ($2, $2); # Should start with a space. $to =~ s/^(\S)/ $1/; # Should not end with a space. $to =~ s/\s+$//; # '*'s should not have spaces between. while ($to =~ s/\*\s+\*/\*\*/) { } #print "from<$from> to<$to>\n"; if ($from ne $to) { ERROR("POINTER_LOCATION", "\"(foo$from)\" should be \"(foo$to)\"\n" . $herecurr); } } while ($line =~ m{(\b$NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)($Ident))}g) { #print "BB<$1>\n"; my ($from, $to, $ident) = ($2, $2, $3); # Should start with a space. $to =~ s/^(\S)/ $1/; # Should not end with a space. $to =~ s/\s+$//; # '*'s should not have spaces between. while ($to =~ s/\*\s+\*/\*\*/) { } # Modifiers should have spaces. $to =~ s/(\b$Modifier$)/$1 /; #print "from<$from> to<$to> ident<$ident>\n"; if ($from ne $to && $ident !~ /^$Modifier$/) { ERROR("POINTER_LOCATION", "\"foo${from}bar\" should be \"foo${to}bar\"\n" . $herecurr); } } # function brace can't be on same line, except for #defines of do while, # or if closed on same line if (($line=~/$Type\s*$Ident\(.*\).*\s{/) and !($line=~/\#\s*define.*do\s{/) and !($line=~/}/)) { ERROR("OPEN_BRACE", "open brace '{' following function declarations go on the next line\n" . $herecurr); } # open braces for enum, union and struct go on the same line. if ($line =~ /^.\s*{/ && $prevline =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) { ERROR("OPEN_BRACE", "open brace '{' following $1 go on the same line\n" . $hereprev); } # missing space after union, struct or enum definition if ($line =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?(?:\s+$Ident)?[=\{]/) { WARN("SPACING", "missing space after $1 definition\n" . $herecurr); } # check for spacing round square brackets; allowed: # 1. with a type on the left -- int [] a; # 2. at the beginning of a line for slice initialisers -- [0...10] = 5, # 3. inside a curly brace -- = { [0...10] = 5 } while ($line =~ /(.*?\s)\[/g) { my ($where, $prefix) = ($-[1], $1); if ($prefix !~ /$Type\s+$/ && ($where != 0 || $prefix !~ /^.\s+$/) && $prefix !~ /{\s+$/) { ERROR("BRACKET_SPACE", "space prohibited before open square bracket '['\n" . $herecurr); } } # check for spaces between functions and their parentheses. while ($line =~ /($Ident)\s+\(/g) { my $name = $1; my $ctx_before = substr($line, 0, $-[1]); my $ctx = "$ctx_before$name"; # Ignore those directives where spaces _are_ permitted. if ($name =~ /^(?: if|for|while|switch|return|case| volatile|__volatile__| __attribute__|format|__extension__| asm|__asm__)$/x) { # cpp #define statements have non-optional spaces, ie # if there is a space between the name and the open # parenthesis it is simply not a parameter group. 
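			# e.g. (sketch, "sd_min" is an invented name): in
			#     #define sd_min (a, b)
			# the space means "(a, b)" is macro body text rather
			# than a parameter list, so no SPACING warning is
			# wanted for this form.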
} elsif ($ctx_before =~ /^.\s*\#\s*define\s*$/) { # cpp #elif statement condition may start with a ( } elsif ($ctx =~ /^.\s*\#\s*elif\s*$/) { # If this whole things ends with a type its most # likely a typedef for a function. } elsif ($ctx =~ /$Type$/) { } else { WARN("SPACING", "space prohibited between function name and open parenthesis '('\n" . $herecurr); } } # Check operator spacing. if (!($line=~/\#\s*include/)) { my $ops = qr{ <<=|>>=|<=|>=|==|!=| \+=|-=|\*=|\/=|%=|\^=|\|=|&=| =>|->|<<|>>|<|>|=|!|~| &&|\|\||,|\^|\+\+|--|&|\||\+|-|\*|\/|%| \?|: }x; my @elements = split(/($ops|;)/, $opline); my $off = 0; my $blank = copy_spacing($opline); for (my $n = 0; $n < $#elements; $n += 2) { $off += length($elements[$n]); # Pick up the preceding and succeeding characters. my $ca = substr($opline, 0, $off); my $cc = ''; if (length($opline) >= ($off + length($elements[$n + 1]))) { $cc = substr($opline, $off + length($elements[$n + 1])); } my $cb = "$ca$;$cc"; my $a = ''; $a = 'V' if ($elements[$n] ne ''); $a = 'W' if ($elements[$n] =~ /\s$/); $a = 'C' if ($elements[$n] =~ /$;$/); $a = 'B' if ($elements[$n] =~ /(\[|\()$/); $a = 'O' if ($elements[$n] eq ''); $a = 'E' if ($ca =~ /^\s*$/); my $op = $elements[$n + 1]; my $c = ''; if (defined $elements[$n + 2]) { $c = 'V' if ($elements[$n + 2] ne ''); $c = 'W' if ($elements[$n + 2] =~ /^\s/); $c = 'C' if ($elements[$n + 2] =~ /^$;/); $c = 'B' if ($elements[$n + 2] =~ /^(\)|\]|;)/); $c = 'O' if ($elements[$n + 2] eq ''); $c = 'E' if ($elements[$n + 2] =~ /^\s*\\$/); } else { $c = 'E'; } my $ctx = "${a}x${c}"; my $at = "(ctx:$ctx)"; my $ptr = substr($blank, 0, $off) . "^"; my $hereptr = "$hereline$ptr\n"; # Pull out the value of this operator. my $op_type = substr($curr_values, $off + 1, 1); # Get the full operator variant. my $opv = $op . substr($curr_vars, $off, 1); # Ignore operators passed as parameters. if ($op_type ne 'V' && $ca =~ /\s$/ && $cc =~ /^\s*,/) { # # Ignore comments # } elsif ($op =~ /^$;+$/) { # ; should have either the end of line or a space or \ after it } elsif ($op eq ';') { if ($ctx !~ /.x[WEBC]/ && $cc !~ /^\\/ && $cc !~ /^;/) { ERROR("SPACING", "space required after that '$op' $at\n" . $hereptr); } # // is a comment } elsif ($op eq '//') { # No spaces for: # -> # : when part of a bitfield } elsif ($op eq '->' || $opv eq ':B') { if ($ctx =~ /Wx.|.xW/) { ERROR("SPACING", "spaces prohibited around that '$op' $at\n" . $hereptr); } # , must have a space on the right. } elsif ($op eq ',') { if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) { ERROR("SPACING", "space required after that '$op' $at\n" . $hereptr); } # '*' as part of a type definition -- reported already. } elsif ($opv eq '*_') { #warn "'*' is part of type\n"; # unary operators should have a space before and # none after. May be left adjacent to another # unary operator, or a cast } elsif ($op eq '!' || $op eq '~' || $opv eq '*U' || $opv eq '-U' || $opv eq '&U' || $opv eq '&&U') { if ($ctx !~ /[WEBC]x./ && $ca !~ /(?:\)|!|~|\*|-|\&|\||\+\+|\-\-|\{)$/) { ERROR("SPACING", "space required before that '$op' $at\n" . $hereptr); } if ($op eq '*' && $cc =~/\s*$Modifier\b/) { # A unary '*' may be const } elsif ($ctx =~ /.xW/) { ERROR("SPACING", "space prohibited after that '$op' $at\n" . $hereptr); } # unary ++ and unary -- are allowed no space on one side. } elsif ($op eq '++' or $op eq '--') { if ($ctx !~ /[WEOBC]x[^W]/ && $ctx !~ /[^W]x[WOBEC]/) { ERROR("SPACING", "space required one side of that '$op' $at\n" . 
$hereptr); } if ($ctx =~ /Wx[BE]/ || ($ctx =~ /Wx./ && $cc =~ /^;/)) { ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr); } if ($ctx =~ /ExW/) { ERROR("SPACING", "space prohibited after that '$op' $at\n" . $hereptr); } # << and >> may either have or not have spaces both sides } elsif ($op eq '<<' or $op eq '>>' or $op eq '&' or $op eq '^' or $op eq '|' or $op eq '+' or $op eq '-' or $op eq '*' or $op eq '/' or $op eq '%') { if ($ctx =~ /Wx[^WCE]|[^WCE]xW/) { ERROR("SPACING", "need consistent spacing around '$op' $at\n" . $hereptr); } # A colon needs no spaces before when it is # terminating a case value or a label. } elsif ($opv eq ':C' || $opv eq ':L') { if ($ctx =~ /Wx./) { ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr); } # All the others need spaces both sides. } elsif ($ctx !~ /[EWC]x[CWE]/) { my $ok = 0; # Ignore email addresses if (($op eq '<' && $cc =~ /^\S+\@\S+>/) || ($op eq '>' && $ca =~ /<\S+\@\S+$/)) { $ok = 1; } # Ignore ?: if (($opv eq ':O' && $ca =~ /\?$/) || ($op eq '?' && $cc =~ /^:/)) { $ok = 1; } if ($ok == 0) { ERROR("SPACING", "spaces required around that '$op' $at\n" . $hereptr); } } $off += length($elements[$n + 1]); } } # check for multiple assignments if ($line =~ /^.\s*$Lval\s*=\s*$Lval\s*=(?!=)/) { CHK("MULTIPLE_ASSIGNMENTS", "multiple assignments should be avoided\n" . $herecurr); } #need space before brace following if, while, etc if (($line =~ /\(.*\){/ && $line !~ /\($Type\){/) || $line =~ /do{/) { ERROR("SPACING", "space required before the open brace '{'\n" . $herecurr); } # closing brace should have a space following it when it has anything # on the line if ($line =~ /}(?!(?:,|;|\)))\S/) { ERROR("SPACING", "space required after that close brace '}'\n" . $herecurr); } # check spacing on square brackets if ($line =~ /\[\s/ && $line !~ /\[\s*$/) { ERROR("SPACING", "space prohibited after that open square bracket '['\n" . $herecurr); } if ($line =~ /\s\]/) { ERROR("SPACING", "space prohibited before that close square bracket ']'\n" . $herecurr); } # check spacing on parentheses if ($line =~ /\(\s/ && $line !~ /\(\s*(?:\\)?$/ && $line !~ /for\s*\(\s+;/) { ERROR("SPACING", "space prohibited after that open parenthesis '('\n" . $herecurr); } if ($line =~ /(\s+)\)/ && $line !~ /^.\s*\)/ && $line !~ /for\s*\(.*;\s+\)/ && $line !~ /:\s+\)/) { ERROR("SPACING", "space prohibited before that close parenthesis ')'\n" . $herecurr); } #goto labels aren't indented, allow a single space however if ($line=~/^.\s+[A-Za-z\d_]+:(?![0-9]+)/ and !($line=~/^. [A-Za-z\d_]+:/) and !($line=~/^.\s+default:/)) { WARN("INDENTED_LABEL", "labels should not be indented\n" . $herecurr); } # Return is not a function. if (defined($stat) && $stat =~ /^.\s*return(\s*)(\(.*);/s) { my $spacing = $1; my $value = $2; # Flatten any parentheses $value =~ s/\(/ \(/g; $value =~ s/\)/\) /g; while ($value =~ s/\[[^\[\]]*\]/1/ || $value !~ /(?:$Ident|-?$Constant)\s* $Compare\s* (?:$Ident|-?$Constant)/x && $value =~ s/\([^\(\)]*\)/1/) { } #print "value<$value>\n"; if ($value =~ /^\s*(?:$Ident|-?$Constant)\s*$/) { ERROR("RETURN_PARENTHESES", "return is not a function, parentheses are not required\n" . $herecurr); } elsif ($spacing !~ /\s+/) { ERROR("SPACING", "space required before the open parenthesis '('\n" . 
$herecurr); } } # Return of what appears to be an errno should normally be -'ve if ($line =~ /^.\s*return\s*(E[A-Z]*)\s*;/) { my $name = $1; if ($name ne 'EOF' && $name ne 'ERROR') { WARN("USE_NEGATIVE_ERRNO", "return of an errno should typically be -ve (return -$1)\n" . $herecurr); } } # Need a space before open parenthesis after if, while etc if ($line=~/\b(if|while|for|switch)\(/) { ERROR("SPACING", "space required before the open parenthesis '('\n" . $herecurr); } # Check for illegal assignment in if conditional -- and check for trailing # statements after the conditional. if ($line =~ /do\s*(?!{)/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0) if (!defined $stat); my ($stat_next) = ctx_statement_block($line_nr_next, $remain_next, $off_next); $stat_next =~ s/\n./\n /g; ##print "stat<$stat> stat_next<$stat_next>\n"; if ($stat_next =~ /^\s*while\b/) { # If the statement carries leading newlines, # then count those as offsets. my ($whitespace) = ($stat_next =~ /^((?:\s*\n[+-])*\s*)/s); my $offset = statement_rawlines($whitespace) - 1; $suppress_whiletrailers{$line_nr_next + $offset} = 1; } } if (!defined $suppress_whiletrailers{$linenr} && $line =~ /\b(?:if|while|for)\s*\(/ && $line !~ /^.\s*#/) { my ($s, $c) = ($stat, $cond); if ($c =~ /\bif\s*\(.*[^<>!=]=[^=].*/s) { ERROR("ASSIGN_IN_IF", "do not use assignment in if condition\n" . $herecurr); } # Find out what is on the end of the line after the # conditional. substr($s, 0, length($c), ''); $s =~ s/\n.*//g; $s =~ s/$;//g; # Remove any comments if (length($c) && $s !~ /^\s*{?\s*\\*\s*$/ && $c !~ /}\s*while\s*/) { # Find out how long the conditional actually is. my @newlines = ($c =~ /\n/gs); my $cond_lines = 1 + $#newlines; my $stat_real = ''; $stat_real = raw_line($linenr, $cond_lines) . "\n" if ($cond_lines); if (defined($stat_real) && $cond_lines > 1) { $stat_real = "[...]\n$stat_real"; } ERROR("TRAILING_STATEMENTS", "trailing statements should be on next line\n" . $herecurr . $stat_real); } } # Check for bitwise tests written as boolean if ($line =~ / (?: (?:\[|\(|\&\&|\|\|) \s*0[xX][0-9]+\s* (?:\&\&|\|\|) | (?:\&\&|\|\|) \s*0[xX][0-9]+\s* (?:\&\&|\|\||\)|\]) )/x) { WARN("HEXADECIMAL_BOOLEAN_TEST", "boolean test with hexadecimal, perhaps just 1 \& or \|?\n" . $herecurr); } # if and else should not have general statements after it if ($line =~ /^.\s*(?:}\s*)?else\b(.*)/) { my $s = $1; $s =~ s/$;//g; # Remove any comments if ($s !~ /^\s*(?:\sif|(?:{|)\s*\\?\s*$)/) { ERROR("TRAILING_STATEMENTS", "trailing statements should be on next line\n" . $herecurr); } } # if should not continue a brace if ($line =~ /}\s*if\b/) { ERROR("TRAILING_STATEMENTS", "trailing statements should be on next line\n" . $herecurr); } # case and default should not have general statements after them if ($line =~ /^.\s*(?:case\s*.*|default\s*):/g && $line !~ /\G(?: (?:\s*$;*)(?:\s*{)?(?:\s*$;*)(?:\s*\\)?\s*$| \s*return\s+ )/xg) { ERROR("TRAILING_STATEMENTS", "trailing statements should be on next line\n" . $herecurr); } # Check for }else {, these must be at the same # indent level to be relevant to each other. if ($prevline=~/}\s*$/ and $line=~/^.\s*else\s*/ and $previndent == $indent) { ERROR("ELSE_AFTER_BRACE", "else should follow close brace '}'\n" . $hereprev); } if ($prevline=~/}\s*$/ and $line=~/^.\s*while\s*/ and $previndent == $indent) { my ($s, $c) = ctx_statement_block($linenr, $realcnt, 0); # Find out what is on the end of the line after the # conditional. 
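			# Illustration: given a do-while split as
			#     +	}
			#     +	while (--retries);
			# the remainder after the while-condition is just ';',
			# so the check below asks for "} while (--retries);"
			# on one line.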
substr($s, 0, length($c), ''); $s =~ s/\n.*//g; if ($s =~ /^\s*;/) { ERROR("WHILE_AFTER_BRACE", "while should follow close brace '}'\n" . $hereprev); } } #no spaces allowed after \ in define if ($line=~/\#\s*define.*\\\s$/) { WARN("WHITESPACE_AFTER_LINE_CONTINUATION", "Whitepspace after \\ makes next lines useless\n" . $herecurr); } # check for redundant bracing round if etc if ($line =~ /(^.*)\bif\b/ && $1 !~ /else\s*$/) { my ($level, $endln, @chunks) = ctx_statement_full($linenr, $realcnt, 1); #print "chunks<$#chunks> linenr<$linenr> endln<$endln> level<$level>\n"; #print "APW: <<$chunks[1][0]>><<$chunks[1][1]>>\n"; if ($#chunks > 0 && $level == 0) { my $allowed = 0; my $seen = 0; my $herectx = $here . "\n"; my $ln = $linenr - 1; for my $chunk (@chunks) { my ($cond, $block) = @{$chunk}; # If the condition carries leading newlines, then count those as offsets. my ($whitespace) = ($cond =~ /^((?:\s*\n[+-])*\s*)/s); my $offset = statement_rawlines($whitespace) - 1; #print "COND<$cond> whitespace<$whitespace> offset<$offset>\n"; # We have looked at and allowed this specific line. $suppress_ifbraces{$ln + $offset} = 1; $herectx .= "$rawlines[$ln + $offset]\n[...]\n"; $ln += statement_rawlines($block) - 1; substr($block, 0, length($cond), ''); $seen++ if ($block =~ /^\s*{/); #print "cond<$cond> block<$block> allowed<$allowed>\n"; if (statement_lines($cond) > 1) { #print "APW: ALLOWED: cond<$cond>\n"; $allowed = 1; } if ($block =~/\b(?:if|for|while)\b/) { #print "APW: ALLOWED: block<$block>\n"; $allowed = 1; } if (statement_block_size($block) > 1) { #print "APW: ALLOWED: lines block<$block>\n"; $allowed = 1; } } if ($seen && !$allowed) { WARN("BRACES", "braces {} are not necessary for any arm of this statement\n" . $herectx); } } } if (!defined $suppress_ifbraces{$linenr - 1} && $line =~ /\b(if|while|for|else)\b/) { my $allowed = 0; # Check the pre-context. if (substr($line, 0, $-[0]) =~ /(\}\s*)$/) { #print "APW: ALLOWED: pre<$1>\n"; $allowed = 1; } my ($level, $endln, @chunks) = ctx_statement_full($linenr, $realcnt, $-[0]); # Check the condition. my ($cond, $block) = @{$chunks[0]}; #print "CHECKING<$linenr> cond<$cond> block<$block>\n"; if (defined $cond) { substr($block, 0, length($cond), ''); } if (statement_lines($cond) > 1) { #print "APW: ALLOWED: cond<$cond>\n"; $allowed = 1; } if ($block =~/\b(?:if|for|while)\b/) { #print "APW: ALLOWED: block<$block>\n"; $allowed = 1; } if (statement_block_size($block) > 1) { #print "APW: ALLOWED: lines block<$block>\n"; $allowed = 1; } # Check the post-context. if (defined $chunks[1]) { my ($cond, $block) = @{$chunks[1]}; if (defined $cond) { substr($block, 0, length($cond), ''); } if ($block =~ /^\s*\{/) { #print "APW: ALLOWED: chunk-1 block<$block>\n"; $allowed = 1; } } if ($level == 0 && $block =~ /^\s*\{/ && !$allowed) { my $herectx = $here . "\n"; my $cnt = statement_rawlines($block); for (my $n = 0; $n < $cnt; $n++) { $herectx .= raw_line($linenr, $n) . "\n"; } WARN("BRACES", "braces {} are not necessary for single statement blocks\n" . $herectx); } } # no volatiles please my $asm_volatile = qr{\b(__asm__|asm)\s+(__volatile__|volatile)\b}; if ($line =~ /\bvolatile\b/ && $line !~ /$asm_volatile/) { WARN("VOLATILE", "Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr); } # warn about #if 0 if ($line =~ /^.\s*\#\s*if\s+0\b/) { CHK("REDUNDANT_CODE", "if this code is redundant consider removing it\n" . 
$herecurr); } # warn about spacing in #ifdefs if ($line =~ /^.\s*\#\s*(ifdef|ifndef|elif)\s\s+/) { ERROR("SPACING", "exactly one space required after that #$1\n" . $herecurr); } # Check that the storage class is at the beginning of a declaration if ($line =~ /\b$Storage\b/ && $line !~ /^.\s*$Storage\b/) { WARN("STORAGE_CLASS", "storage class should be at the beginning of the declaration\n" . $herecurr) } # check the location of the inline attribute, that it is between # storage class and type. if ($line =~ /\b$Type\s+$Inline\b/ || $line =~ /\b$Inline\s+$Storage\b/) { ERROR("INLINE_LOCATION", "inline keyword should sit between storage class and type\n" . $herecurr); } # Check for __inline__ and __inline, prefer inline if ($line =~ /\b(__inline__|__inline)\b/) { WARN("INLINE", "plain inline is preferred over $1\n" . $herecurr); } # Check for __attribute__ packed, prefer __packed if ($line =~ /\b__attribute__\s*\(\s*\(.*\bpacked\b/) { WARN("PREFER_PACKED", "__packed is preferred over __attribute__((packed))\n" . $herecurr); } # Check for __attribute__ aligned, prefer __aligned if ($line =~ /\b__attribute__\s*\(\s*\(.*aligned/) { WARN("PREFER_ALIGNED", "__aligned(size) is preferred over __attribute__((aligned(size)))\n" . $herecurr); } # Check for __attribute__ format(printf, prefer __printf if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { WARN("PREFER_PRINTF", "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr); } # check for sizeof(&) if ($line =~ /\bsizeof\s*\(\s*\&/) { WARN("SIZEOF_ADDRESS", "sizeof(& should be avoided\n" . $herecurr); } # check for line continuations in quoted strings with odd counts of " if ($rawline =~ /\\$/ && $rawline =~ tr/"/"/ % 2) { WARN("LINE_CONTINUATIONS", "Avoid line continuations in quoted strings\n" . $herecurr); } # Check for misused memsets if (defined $stat && $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*$FuncArg\s*\)/s) { my $ms_addr = $2; my $ms_val = $8; my $ms_size = $14; if ($ms_size =~ /^(0x|)0$/i) { ERROR("MEMSET", "memset to 0's uses 0 as the 2nd argument, not the 3rd\n" . "$here\n$stat\n"); } elsif ($ms_size =~ /^(0x|)1$/i) { WARN("MEMSET", "single byte memset is suspicious. Swapped 2nd/3rd argument?\n" . "$here\n$stat\n"); } } # check for new externs in .c files. if ($realfile =~ /\.c$/ && defined $stat && $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s) { my $function_name = $1; my $paren_space = $2; my $s = $stat; if (defined $cond) { substr($s, 0, length($cond), ''); } if ($s =~ /^\s*;/ && $function_name ne 'uninitialized_var') { WARN("AVOID_EXTERNS", "externs should be avoided in .c files\n" . $herecurr); } if ($paren_space =~ /\n/) { WARN("FUNCTION_ARGUMENTS", "arguments for function declarations should follow identifier\n" . $herecurr); } } elsif ($realfile =~ /\.c$/ && defined $stat && $stat =~ /^.\s*extern\s+/) { WARN("AVOID_EXTERNS", "externs should be avoided in .c files\n" . $herecurr); } # check for multiple semicolons if ($line =~ /;\s*;\s*$/) { WARN("ONE_SEMICOLON", "Statements terminations use 1 semicolon\n" . $herecurr); } # check for gcc specific __FUNCTION__ if ($line =~ /__FUNCTION__/) { WARN("USE_FUNC", "__func__ should be used instead of gcc specific __FUNCTION__\n" . 
$herecurr); } # check for %L{u,d,i} in strings my $string; while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) { $string = substr($rawline, $-[1], $+[1] - $-[1]); $string =~ s/%%/__/g; if ($string =~ /(?> 3) - ($pos >> 3); $lo .= "\t" x $ntab; $pos = $npos; $nsp = 0; } elsif ($c eq "\n" || $c eq "\r") { $lo .= " " x $nsp; $pos += $nsp; $nsp = 0; $lo .= $c; $pos = 0; } elsif ($c eq " ") { $nsp++; } else { $lo .= " " x $nsp; $pos += $nsp; $nsp = 0; $lo .= $c; $pos++; } } $lo .= " " x $nsp; return $lo; } # Compute the visual width of a string sub strwidth($) { no bytes; # Tab alignment depends on characters my($li) = @_; my($c, $i); my $pos = 0; my $mlen = 0; for ($i = 0; $i < length($li); $i++) { $c = substr($li,$i,1); if ($c eq "\t") { $pos = ($pos+8) & ~7; } elsif ($c eq "\n") { $mlen = $pos if ($pos > $mlen); $pos = 0; } else { $pos++; } } $mlen = $pos if ($pos > $mlen); return $mlen; } $name = basename($0); @files = (); while (defined($a = shift(@ARGV))) { if ($a =~ /^-/) { if ($a eq '-width' || $a eq '-w') { $max_width = shift(@ARGV)+0; } else { print STDERR "Usage: $name [-width #] files...\n"; exit 1; } } else { push(@files, $a); } } foreach $f ( @files ) { print STDERR "$name: $f\n"; if (! -f $f) { print STDERR "$f: not a file\n"; next; } if (!open(FILE, '+<', $f)) { print STDERR "$name: Cannot open file: $f: $!\n"; next; } binmode FILE; # First, verify that it is not a binary file; consider any file # with a zero byte to be a binary file. Is there any better, or # additional, heuristic that should be applied? $is_binary = 0; while (read(FILE, $data, 65536) > 0) { if ($data =~ /\0/) { $is_binary = 1; last; } } if ($is_binary) { print STDERR "$name: $f: binary file\n"; next; } seek(FILE, 0, 0); $in_bytes = 0; $out_bytes = 0; $lineno = 0; @lines = (); $in_hunk = 0; $err = 0; while ( defined($line = ) ) { $lineno++; $in_bytes += length($line); if (!$in_hunk) { if ($line =~ /^\@\@\s+\-([0-9]+),([0-9]+)\s+\+([0-9]+),([0-9]+)\s\@\@/) { $minus_lines = $2; $plus_lines = $4; if ($minus_lines || $plus_lines) { $in_hunk = 1; @hunk_lines = ($line); } } else { push(@lines, $line); $out_bytes += length($line); } } else { # We're in a hunk if ($line =~ /^\+/) { $plus_lines--; $text = substr($line, 1); $text =~ s/[ \t\r]*$//; # Remove trailing spaces $text = clean_space_tabs($text); $l_width = strwidth($text); if ($max_width && $l_width > $max_width) { print STDERR "$f:$lineno: adds line exceeds $max_width ", "characters ($l_width)\n"; } push(@hunk_lines, '+'.$text); } elsif ($line =~ /^\-/) { $minus_lines--; push(@hunk_lines, $line); } elsif ($line =~ /^ /) { $plus_lines--; $minus_lines--; push(@hunk_lines, $line); } else { print STDERR "$name: $f: malformed patch\n"; $err = 1; last; } if ($plus_lines < 0 || $minus_lines < 0) { print STDERR "$name: $f: malformed patch\n"; $err = 1; last; } elsif ($plus_lines == 0 && $minus_lines == 0) { # End of a hunk. Process this hunk. 
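# (Scan the hunk from the bottom: count trailing blank "+" lines in $adj and drop them, then shrink the "+start,count" field of the @@ header by $adj so the cleaned hunk still applies.)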
my $i; my $l; my @h = (); my $adj = 0; my $done = 0; for ($i = scalar(@hunk_lines)-1; $i > 0; $i--) { $l = $hunk_lines[$i]; if (!$done && $l eq "+\n") { $adj++; # Skip this line } elsif ($l =~ /^[ +]/) { $done = 1; unshift(@h, $l); } else { unshift(@h, $l); } } $l = $hunk_lines[0]; # Hunk header undef @hunk_lines; # Free memory if ($adj) { die unless ($l =~ /^\@\@\s+\-([0-9]+),([0-9]+)\s+\+([0-9]+),([0-9]+)\s\@\@(.*)$/); my $mstart = $1; my $mlin = $2; my $pstart = $3; my $plin = $4; my $tail = $5; # doesn't include the final newline $l = sprintf("@@ -%d,%d +%d,%d @@%s\n", $mstart, $mlin, $pstart, $plin-$adj, $tail); } unshift(@h, $l); # Transfer to the output array foreach $l (@h) { $out_bytes += length($l); push(@lines, $l); } $in_hunk = 0; } } } if ($in_hunk) { print STDERR "$name: $f: malformed patch\n"; $err = 1; } if (!$err) { if ($in_bytes != $out_bytes) { # Only write to the file if changed seek(FILE, 0, 0); print FILE @lines; if ( !defined($where = tell(FILE)) || !truncate(FILE, $where) ) { die "$name: Failed to truncate modified file: $f: $!\n"; } } } close(FILE); } sheepdog-0.8.3/script/gen_bash_completion.pl000077500000000000000000000073301237656255000212010ustar00rootroot00000000000000#!/usr/bin/perl # # Generate bash_completion_dog # use strict; my ($program) = @ARGV; print "#!bash\n"; print "\n"; open IN, "$program -h |" or die "cannot find $program\n"; my @help = <IN>; close IN; # Hash of sub command arrays. # E.g. $subcmds{'node'} = [kill, list, info, recovery, md] my %subcmds; # Hash of sub sub command arrays. # E.g. $subsubcmds{'trace graph'} = [cat, stat] my %subsubcmds; # Hash of option arrays. # E.g. $opts{'node list'} = [-a, --address, -p, --port, -r, --raw, -h, --help] my %opts; foreach (@help) { if (/^ (\S+) (\S+)/) { my ($cmd, $subcmd) = ($1, $2); $subcmds{$cmd} = [] if (!defined($subcmds{$cmd})); push @{$subcmds{$cmd}}, $subcmd; $opts{"$cmd $subcmd"} = []; $subsubcmds{"$cmd $subcmd"} = []; # run sub command to get more detailed usage open IN, "$program $cmd $subcmd -h |"; while (<IN>) { if (/^ (-.), (--\S+)/) { # get options push @{$opts{"$cmd $subcmd"}}, $1; push @{$opts{"$cmd $subcmd"}}, $2; } elsif (/^ ([a-z]+)/) { # get available subcommands push @{$subsubcmds{"$cmd $subcmd"}}, $1; } } close IN; } } foreach my $cmd (keys %subcmds) { my @subcmds = @{$subcmds{$cmd}}; print command($cmd, @subcmds); foreach my $subcmd (@subcmds) { print subcommand($cmd, $subcmd); } } print <<__EOB__; _dog() { local opts cur cmd subcmd opts="@{[keys %subcmds]}" cur="\${COMP_WORDS[COMP_CWORD]}" if [ \$COMP_CWORD -gt 1 ]; then cmd=\${COMP_WORDS[1]} fi if [ \$COMP_CWORD -gt 2 ]; then subcmd=\${COMP_WORDS[2]} fi case "\${cmd}" in __EOB__ foreach my $cmd (keys %subcmds) { print <<__EOB__; $cmd) _dog_$cmd \${subcmd} ;; __EOB__ } print <<__EOB__; "") COMPREPLY=(\$( compgen -W "\${opts}" -- \${cur} )) ;; *) COMPREPLY=() ;; esac } complete -F _dog dog __EOB__ exit 0; # get a completion function for dog command (e.g. _dog_vdi()) sub command { my ($cmd, @subcmds) = @_; my $output; $output = <<__EOB__; _dog_${cmd}() { local opts opts="@subcmds" case "\$1" in __EOB__ foreach my $subcmd (@subcmds) { $output .= <<__EOB__; $subcmd) _dog_${cmd}_${subcmd} ;; __EOB__ } $output .= <<__EOB__; "") COMPREPLY=(\$( compgen \\ -W "\${opts}" \\ -- "\${COMP_WORDS[COMP_CWORD]}" )) ;; *) COMPREPLY=() ;; esac } __EOB__ $output =~ s/\t/ /g; $output =~ s/^ //gm; return $output; } # get a completion function for dog subcommands (e.g. 
_dog_vdi_create()) sub subcommand { my ($cmd, $subcmd) = @_; my $output; my @opts = @{$opts{"$cmd $subcmd"}}; my @subsubcmds = @{$subsubcmds{"$cmd $subcmd"}}; $output = <<__EOB__; _dog_${cmd}_${subcmd}() { local cur cur="\${COMP_WORDS[COMP_CWORD]}" case "\$cur" in -*) COMPREPLY=(\${COMPREPLY[@]} \\ \$( compgen \\ -W "@opts" \\ __EOB__ $output .= <<__EOB__; -- \${cur} )) ;; __EOB__ if ($cmd eq 'vdi' && $subcmd ne 'create') { $output .= <<__EOB__; *) local dog="\${COMP_WORDS[0]}" local vdilist="\$(\${dog} vdi list -r 2>/dev/null | awk '{print \$2}')" COMPREPLY=(\$( compgen -W "@subsubcmds \${vdilist}" -- \${cur} )) ;; __EOB__ } else { $output .= <<__EOB__; *) COMPREPLY=(\$( compgen -W "@subsubcmds" -- \${cur} )) ;; __EOB__ } $output .= <<__EOB__; esac } __EOB__ $output =~ s/\t/ /g; $output =~ s/^ //gm; return $output; } sheepdog-0.8.3/script/gen_man.pl000077500000000000000000000031371237656255000166070ustar00rootroot00000000000000#!/usr/bin/perl # # Generate sheepdog manuals from help messages # use strict; my ($cwd) = ($0 =~ m%^(.+/)%); my $program = $ARGV[0]; ## generator functions sub sheep { my ($line) = @_; if ($line =~ /^ ([^,]+), (\S+)\s+(.+)/) { my ($opt, $longopt, $desc) = ($1, $2, $3); print escape(header("$opt, $longopt") . "\n"); print escape("$desc\n"); next if ($opt eq '-h'); # extract detailed help if available my $tmpfile = `mktemp`; chomp($tmpfile); my $help = `$program $tmpfile $opt 2> /dev/null`; unlink $tmpfile; $help =~ s/^\s+\$.+/\n$&\n/mg; print escape("\n$help"); } } sub dog { my ($line) = @_; if ($line =~ /^ (.+?) \s+(.+)/) { my ($cmd, $desc) = ($1, $2); my $help = join '', `$program $cmd -h`; $help =~ s/Usage: dog (.*)/header($1)/e; $help =~ s/^([A-Z][ a-zA-Z]*:)/\n$1/mg; print escape("$help\n"); print escape("Description:\n $desc\n"); } } sub sheepfs { my ($line) = @_; if ($line =~ /^ ([^,]+), (\S+)\s+(.+)/) { my ($opt, $longopt, $desc) = ($1, $2, $3); print escape(header("$opt, $longopt") . "\n"); print escape("$desc\n"); } } ## helper functions sub header { my ($str) = @_; return ".TP\n.BI \"$str\""; } sub escape { my ($str) = @_; $str =~ s/\t/ /g; $str =~ s/\\/\\\\\\/g; $str =~ s/"/\\"/g; $str =~ s/#/\\#/g; $str =~ s/\$/\\\$/g; $str =~ s/\n/\\n/g; return $str; } ## main routine open IN, "$program -h |" or die "cannot find $program\n"; my @help = <IN>; close IN; foreach my $help (@help) { my ($func) = ($program =~ m#.*/(.+)#); chomp($help); eval "$func(\"$help\")"; } sheepdog-0.8.3/script/json_log_viewer.py000077500000000000000000000140171237656255000204120ustar00rootroot00000000000000#! 
/usr/bin/env python import sys, os, errno import json, curses import atexit begin_sec, begin_usec = -1, -1 class LogRecord(object): def __init__(self, json_line, proc): json_obj = json.loads(json_line) user_info = json_obj['user_info'] self.progname = user_info['program_name'] self.port = user_info['port'] body = json_obj['body'] self.timestamp = { 'sec': body['second'], 'usec': body['usecond']} self.worker_name = body['worker_name'] self.worker_idx = body['worker_idx'] self.func = body['func'] self.line = body['line'] self.msg = body['msg'] self.proc = proc self.color = None def is_sheep(self): return self.progname == 'sheep' def get_color(self): return self.proc.color def pop(self): ret = self.proc.__pop_next_record__() assert ret == self return ret def __lt__(self, other): if self.timestamp['sec'] < other.timestamp['sec']: return True elif other.timestamp['sec'] < self.timestamp['sec']: return False if self.timestamp['usec'] < other.timestamp['usec']: return True return False def format_line(self, max_x): sec = self.timestamp['sec'] usec = self.timestamp['usec'] udelta = usec - begin_usec if udelta < 0: udelta += 1000000 sec -= 1 t = '%d.%06d' % (sec - begin_sec, udelta) ret = '%s+%s: ' % (' ' * (10 - len(t[:10])), t[:10]) if self.progname == 'sheep': hdr = 'sheep %d,%s(%d) ' % \ (self.port, self.func, self.line) ret += hdr[:40] + ' ' * (40 - len(hdr[:40]) + 1) ret += self.msg return ret[:max_x - 1] return self.msg class Process(object): def __init__(self, log_file_path): self.log_file = open(log_file_path) self.next_record = None self.color = None def set_color(self, color): self.color = color def peek_next_record(self): if self.next_record == None: next_line = self.log_file.readline() if next_line == '': # end of the log return None self.next_record = LogRecord(next_line, self) return self.next_record # __pop_next_record__() must be called by LogRecord def __pop_next_record__(self): assert self.next_record != None ret = self.next_record self.next_record = None return ret dying_msg = '' w = None curses_colors = [ curses.COLOR_RED, curses.COLOR_GREEN, curses.COLOR_YELLOW, curses.COLOR_BLUE, curses.COLOR_MAGENTA, curses.COLOR_CYAN, ] nr_curses_colors = len(curses_colors) def init_curses(): global w w = curses.initscr() curses.nonl() curses.cbreak() curses.noecho() curses.start_color() for i in range(1, nr_curses_colors + 1): curses.init_pair(i, curses_colors[i - 1], curses.COLOR_BLACK) def assign_color(procs): sheeps = [] for proc in procs: if proc.peek_next_record().is_sheep(): sheeps.append(proc) nr_sheeps = len(sheeps) if nr_curses_colors < nr_sheeps: # we don't have enough colors to assign... 
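# not fatal; simply fall back to printing every log line uncolored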
return for i in range(0, nr_sheeps): sheeps[i].set_color(i + 1) current_y = 0 max_y, max_x = 0, 0 records = [] records_len = 0 def unify_records(procs): first_rec = procs[0].peek_next_record() for proc in procs[1:]: rec = proc.peek_next_record() if rec < first_rec: first_rec = rec records.append(first_rec.pop()) global begin_sec, begin_usec begin_sec = first_rec.timestamp['sec'] begin_usec = first_rec.timestamp['usec'] nr_procs = len(procs) is_empty = [False] * nr_procs nr_empteis = 0 while nr_empteis != nr_procs: next_rec = None for i in range(0, nr_procs): if is_empty[i]: continue proc = procs[i] rec = proc.peek_next_record() if rec == None: is_empty[i] = True nr_empteis += 1 continue if next_rec == None: next_rec = rec continue if rec < next_rec: next_rec = rec continue if next_rec == None: assert nr_empteis == nr_procs break records.append(next_rec.pop()) def update_terminal(): w.clear() for i in range(0, max_y): w.move(i, 0) if not current_y + i < records_len: break record = records[current_y + i] color = record.get_color() if color: w.attrset(curses.color_pair(color)) w.addstr(record.format_line(max_x)) if color: w.attroff(curses.color_pair(color)) w.refresh() if __name__ == '__main__': @atexit.register def exit_handler(): curses.endwin() if dying_msg != '': print dying_msg + '\n' init_curses() procs = map(lambda x: Process(x), sys.argv[1:]) assign_color(procs) unify_records(procs) records_len = len(records) tty_file = open('/dev/tty', 'rb') max_y, max_x = w.getmaxyx() update_terminal() running = True while running: try: key = tty_file.read(1) except IOError, (enr, msg): if enr == errno.EINTR: continue dying_msg = 'fatal error: %s' % \ (os.strerror(enr)) break if key == 'q': break elif key == 'j': if current_y + 1 < records_len: current_y += 1 elif key == 'k': if current_y: current_y -= 1 elif key == ' ': if current_y + max_y < records_len: current_y += max_y elif key == 'g': current_y = 0 elif key == 'G': current_y = records_len - max_y update_terminal() sheepdog-0.8.3/script/sheepdog.in000077500000000000000000000050751237656255000167770ustar00rootroot00000000000000#!/bin/bash # chkconfig: - 21 79 # description: Sheepdog # processname: sheep # ### BEGIN INIT INFO # Provides: sheepdog # Required-Start: $network # Should-Start: $syslog # Required-Stop: $network # Default-Start: # Default-Stop: # Short-Description: Starts and stops Sheepdog. # Description: Starts and stops Sheepdog. ### END INIT INFO desc="Sheepdog QEMU/KVM Block Storage" prog="sheep" # set secure PATH PATH="/sbin:/bin:/usr/sbin:/usr/bin:@SBINDIR@" SHEEPDOGD=@SBINDIR@/sheep success() { echo -ne "[ OK ]\r" } failure() { echo -ne "[FAILED]\r" } status() { pid=$(pidof $1 2>/dev/null) rtrn=$? if [ $rtrn -ne 0 ]; then echo "$1 is stopped" else echo "$1 (pid $pid) is running..." fi return $rtrn } # rpm based distros if [ -d @SYSCONFDIR@/sysconfig ]; then [ -f @INITDDIR@/functions ] && . @INITDDIR@/functions [ -f @SYSCONFDIR@/sysconfig/$prog ] && . @SYSCONFDIR@/sysconfig/$prog [ -z "$LOCK_FILE" ] && LOCK_FILE="@LOCALSTATEDIR@/lock/subsys/$prog" fi # deb based distros if [ -d @SYSCONFDIR@/default ]; then [ -f @SYSCONFDIR@/default/$prog ] && . @SYSCONFDIR@/default/$prog [ -z "$LOCK_FILE" ] && LOCK_FILE="@LOCALSTATEDIR@/lock/$prog" fi # The version of __pids_pidof in /etc/init.d/functions calls pidof with -x # This means it matches scripts, including this one. # Redefine it here so that status (from the same file) works. 
# Otherwise simultaneous calls to stop() will loop forever __pids_pidof() { pidof -c -o $$ -o $PPID -o %PPID "$1" || \ pidof -c -o $$ -o $PPID -o %PPID "${1##*/}" } start() { echo -n "Starting $desc ($prog): " # most recent distributions use tmpfs for @LOCALSTATEDIR@/run # to avoid cleaning it up on every boot. # they also assume that init scripts will create # required subdirectories for proper operations mkdir -p @LOCALSTATEDIR@/run if status $prog > /dev/null 2>&1; then success else $prog -p 7000 @LOCALSTATEDIR@/lib/sheepdog > /dev/null 2>&1 # give it time to fail sleep 2 if status $prog > /dev/null 2>&1; then touch $LOCK_FILE success else failure rtrn=1 fi fi echo } stop() { ! status $prog > /dev/null 2>&1 && return echo -n "Stopping $desc ($prog): " killproc $prog RETVAL=$? rm -f $LOCK_FILE success echo } restart() { stop start } rtrn=0 case "$1" in start) start ;; restart|reload|force-reload) restart ;; condrestart|try-restart) if status $prog > /dev/null 2>&1; then restart fi ;; status) status $prog rtrn=$? ;; stop) stop ;; *) echo "usage: $0 {start|stop|restart|reload|force-reload|condrestart|try-restart|status}" rtrn=2 ;; esac exit $rtrn sheepdog-0.8.3/script/vditest000077500000000000000000000226031237656255000162520ustar00rootroot00000000000000#!/usr/bin/perl # # Copyright (C) 2010 MORITA Kazutaka # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License version # 2 as published by the Free Software Foundation. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. # use feature 'switch'; use strict; use Getopt::Std; use Time::HiRes qw(gettimeofday); use IPC::Open2; my $program = "vditest"; my ($vdiname, $vdisize); my $concurrency = 1; my $nr_outstanding_aio = 0; my ($lblk, $hblk) = (512, 1048576); my $cache = 'writethrough'; my $runtime = 10; my ($rrate, $wrate) = (100, 0); my $no_act = 0; my $offset = 0; my $seek_pattern = "linear"; my $seed = time(); my ($sblk, $eblk) = (0, 0); my $file = 0; my $flush_interval = 0; my $verbose = 0; my ($read_test, $write_test) = (0,0); my $hbeat = 0; my ($rd_bytes, $wr_bytes, $rd_ops, $wr_ops) = (0, 0, 0, 0); my ($total_rd_bytes, $total_wr_bytes, $total_rd_ops, $total_wr_ops) = (0, 0, 0, 0); $/ = 'qemu-io> '; parse(); print_options(); vdi_open($vdiname, $cache); vdi_main(); vdi_flush(); vdi_close(); sub get_aligned_blk { my ($l, $h) = @_; return $l + 512 * int(rand($h - $l + 512) / 512); } sub to_bytes { my ($size) = @_; given ($size) { when (/k(i?b)?$/i) { $size *= 1024; } when (/m(i?b)?$/i) { $size *= 1024 ** 2; } when (/g(i?b)?$/i) { $size *= 1024 ** 3; } } $_[0] = $size; } sub to_str { my ($size) = @_; my @units = ("", "K", "M", "G", "T", "P", "E", "Z", "Y"); while ($size >= 1024) { shift @units; $size /= 1024; } return sprintf "%.1f%s", $size, $units[0]; } sub print_options { my $opt = "options: "; $opt .= "-B $lblk:$hblk "; $opt .= "-c $cache "; $opt .= "-C $concurrency "; $opt .= "-D $rrate:$wrate "; $opt .= "-n " if $no_act; $opt .= "-o $offset\n"; $opt .= " "; $opt .= "-p $seek_pattern "; $opt .= "-s $seed "; $opt .= "-S $sblk:$eblk "; $opt .= "-T $runtime "; $opt .= "-f $flush_interval\n"; print $opt; } sub print_qemu { my ($cmd) = @_; print $cmd if $verbose; print QEMU $cmd if !$no_act; my $result = <QEMU_OUT>; if ($verbose) { $result =~ s/qemu-io> //; print $result; } while ($result =~ /wrote|read/g) { $nr_outstanding_aio--; } } sub wait_aio_requests { my $old_sep = $/; $/ = "\n"; my $result = <QEMU_OUT>; if ($verbose) { print 
$result; } while ($result =~ /wrote|read/g) { $nr_outstanding_aio--; } $/ = $old_sep; } sub vdi_open { my ($vdiname, $cache) = @_; my $cmd; return if $no_act; if ($file) { $cmd = "stdbuf -o0 qemu-io -t $cache $vdiname"; } else { $cmd = "stdbuf -o0 qemu-io -t $cache sheepdog:$vdiname"; } open2 *QEMU_OUT, *QEMU, $cmd or die "cannot run qemu-io" if !$no_act; <QEMU_OUT>; } sub vdi_close { print_qemu("quit\n"); close QEMU if !$no_act; } sub vdi_read { my ($offset, $length) = @_; print_qemu("aio_read $offset $length\n"); $nr_outstanding_aio++; $rd_ops++; $rd_bytes += $length; $total_rd_ops++; $total_rd_bytes += $length; } sub vdi_write { my ($offset, $length) = @_; print_qemu("aio_write $offset $length\n"); $nr_outstanding_aio++; $wr_ops++; $wr_bytes += $length; $total_wr_ops++; $total_wr_bytes += $length; } sub vdi_flush { print_qemu("aio_flush\n"); } sub parse_opts { my %opts = (); getopts("?B:c:C:D:f:Fh:no:p:rs:S:T:vw", \%opts) or help(1); foreach my $key (keys %opts) { my $val = $opts{$key}; given ($key) { when ('?') { help(0); } when ('B') { ($lblk, $hblk) = ($val =~ /(\d+[kmg]?):?(\d*[kmg]?)/i); to_bytes($lblk); to_bytes($hblk); $hblk = $lblk if $hblk == 0; die "$lblk is not sector aligned" if $lblk % 512 != 0; die "$lblk is not valid" if $lblk == 0; die "$hblk is not sector aligned" if $hblk % 512 != 0; die "$hblk is too large" if $hblk > (64 * 1024 ** 2); die "transfer range is invalid" if $lblk > $hblk; } when ('c') { if ($val !~ /(none|write(back|through))/) { die "'$val' is not valid"; } $cache = $val; } when ('C') { die "'$val' is not valid" if $val <= 0; $concurrency = $val; } when ('D') { ($rrate, $wrate) = ($val =~ /(\d+)\%?:?(\d*)\%?/); } when ('f') { $flush_interval = $val; } when ('F') { $file = 1; } when ('h') { die "'$val' is not valid" if $val <= 0; $hbeat = $val; } when ('n') { $no_act = 1; $verbose = 1; } when ('o') { die "'$val' is not valid" if $val < 0; $offset = $val; } when ('p') { if ($val =~ /^l/) { $seek_pattern = "linear"; } elsif ($val =~ /^r/) { $seek_pattern = "random"; } else { die "'$val' is not valid"; } } when ('r') { $read_test = 1; if ($write_test) { ($rrate, $wrate) = (50, 50); } else { ($rrate, $wrate) = (100, 0); } } when ('s') { $seed = $val; } when ('S') { ($sblk, $eblk) = ($val =~ /(\d+[kmg]?):?(\d*[kmg]?)/i); to_bytes($sblk); to_bytes($eblk); die "$sblk is not sector aligned" if $sblk % 512 != 0; die "$eblk is not sector aligned" if $eblk % 512 != 0; } when ('T') { die "'$val' is not valid" if $val < 0; $runtime = $val; } when ('v') { $verbose = 1; } when ('w') { $write_test = 1; if ($read_test) { ($rrate, $wrate) = (50, 50); } else { ($rrate, $wrate) = (0, 100); } } } } } sub parse { parse_opts(); if (@ARGV == 0) { die "vdiname must be specified"; } else { $vdiname = shift @ARGV; # process the rest of options parse_opts() if (@ARGV > 0); } die "too many arguments" if @ARGV > 0; if ($file) { $vdisize = `qemu-io -c length $vdiname`; } else { $vdisize = `qemu-io -c length sheepdog:$vdiname`; } to_bytes($vdisize); die "cannot get vdi size" if $vdisize == 0; $eblk = $vdisize if $eblk == 0; die "test block range is invalid" if $sblk >= $eblk; die "transfer size is too large" if $hblk > $eblk - $sblk; } sub vdi_main { my $roffset = $offset; my $woffset = $offset; my ($cur_time, $start_time, $end_time, $hbeat_time); $start_time = $cur_time = get_current_time(); $hbeat_time = $start_time + $hbeat * 1000000; $end_time = $start_time + $runtime * 1000000; srand($seed); while ($cur_time < $end_time) { my $length = get_aligned_blk($lblk, $hblk); while 
($nr_outstanding_aio >= $concurrency) { wait_aio_requests(); } if (rand($rrate + $wrate) < $rrate) { # read $length = $eblk - $roffset if $roffset + $length > $eblk; vdi_read($roffset, $length); if ($seek_pattern eq 'linear') { $roffset += $length; $roffset -= $eblk - $sblk while $roffset >= $eblk; } else { $roffset = get_aligned_blk($sblk, $eblk - 512); } } else { # write $length = $eblk - $woffset if $woffset + $length > $eblk; vdi_write($woffset, $length); if ($seek_pattern eq 'linear') { $woffset += $length; $woffset -= $eblk - $sblk while $woffset >= $eblk; } else { $woffset = get_aligned_blk($sblk, $eblk - 512); } if ($flush_interval > 0 && $wr_ops % $flush_interval == 0) { vdi_flush(); } } $cur_time = get_current_time(); if ($hbeat > 0 && $hbeat_time <= $cur_time) { print_result('Heartbeat read', $rd_bytes, $rd_ops, $hbeat) if $rrate; print_result('Heartbeat write', $wr_bytes, $wr_ops, $hbeat) if $wrate; $rd_ops = $wr_ops = 0; $rd_bytes = $wr_bytes = 0; $hbeat_time += $hbeat * 1000000; } } print_result('Total read', $total_rd_bytes, $total_rd_ops, $runtime) if $rrate; print_result('Total write', $total_wr_bytes, $total_wr_ops, $runtime) if $wrate; } sub get_current_time { my ($sec, $microsec) = gettimeofday(); return $sec * 1000000 + $microsec; } sub print_result { my ($label, $bytes, $ops, $t) = @_; printf "$label throughput: %.1fB/s (%s/s), IOPS %.1f/s.\n", $bytes / $t, to_str($bytes / $t), $ops / $t; } sub help { my ($status) = @_; print < seconds. -n print events that would occur but do not access disk. -o offset set the start offset. -p seek_pattern set the pattern of disk seeks. seek_pattern is "linear" or "random". -r read data from vdi. -s seed set seed for random number generation. -S sblk[:eblk] set the start [and stop] test block. -T runtime run until seconds have elapsed. -v verbose mode. -w write data to vdi. END_OF_HELP exit($status); } sheepdog-0.8.3/sheep/000077500000000000000000000000001237656255000144375ustar00rootroot00000000000000sheepdog-0.8.3/sheep/Makefile.am000066400000000000000000000037251237656255000165020ustar00rootroot00000000000000# # Copyright 2010 Red Hat, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; see the file COPYING. If not, write to # the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
# MAINTAINERCLEANFILES = Makefile.in AM_CFLAGS = AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include \ $(libcpg_CFLAGS) $(libcfg_CFLAGS) $(libacrd_CFLAGS) sbin_PROGRAMS = sheep sheep_SOURCES = sheep.c group.c request.c gateway.c store.c vdi.c \ journal.c ops.c recovery.c cluster/local.c \ object_cache.c object_list_cache.c \ plain_store.c config.c migrate.c md.c if BUILD_HTTP sheep_SOURCES += http/http.c http/kv.c http/s3.c http/swift.c \ http/oalloc.c endif if BUILD_COROSYNC sheep_SOURCES += cluster/corosync.c endif if BUILD_ZOOKEEPER sheep_SOURCES += cluster/zookeeper.c endif if BUILD_SHEPHERD sheep_SOURCES += cluster/shepherd.c endif if BUILD_TRACE AM_CPPFLAGS += -DENABLE_TRACE sheep_SOURCES += trace/trace.c trace/mcount.S trace/graph.c trace/checker.c endif sheep_LDADD = ../lib/libsheepdog.a -lpthread -lm\ $(libcpg_LIBS) $(libcfg_LIBS) $(libacrd_LIBS) $(LIBS) sheep_DEPENDENCIES = ../lib/libsheepdog.a noinst_HEADERS = sheep_priv.h cluster.h http/http.h trace/trace.h EXTRA_DIST = all-local: @echo Built sheep clean-local: rm -f sheep *.o gmon.out *.da *.bb *.bbg # support for GNU Flymake check-syntax: $(COMPILE) -fsyntax-only $(CHK_SOURCES) check-style: @$(CHECK_STYLE) $(sheep_SOURCES) $(noinst_HEADERS) coverage: @lcov -d . -c -o sheep.info sheepdog-0.8.3/sheep/cluster.h000066400000000000000000000132111237656255000162670ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __CLUSTER_H__ #define __CLUSTER_H__ #include #include #include #include #include #include #include #include "sheepdog_proto.h" #include "sheep.h" #include "config.h" /* * maximum payload size sent in ->notify and ->unblock, it should be large * enough to support COROSYNC_MAX_NODES * struct sd_node */ #define SD_MAX_EVENT_BUF_SIZE (128 * 1024) /* 128k */ struct cluster_driver { const char *name; /* * Initialize the cluster driver * * Returns zero on success, -1 on error. */ int (*init)(const char *option); /* * Get a node ID for this sheep. * * Gets and ID that is used in all communication with other sheep, * which normally would be a string formatted IP address. * * Returns zero on success, -1 on error. */ int (*get_local_addr)(uint8_t *myaddr); /* * Join the cluster * * This function is used to join the cluster, and notifies a join * event to all the nodes. The copy of 'opaque' is passed to * sd_join_handler() and sd_accept_handler(). * * sd_join_handler() must be called on at least one node which already * paticipates in the cluster. If the content of 'opaque' is changed in * sd_join_handler(), the updated 'opaque' must be passed to * sd_accept_handler(). * * Returns zero on success, -1 on error */ int (*join)(const struct sd_node *myself, void *opaque, size_t opaque_len); /* * Leave the cluster * * This function is used to leave the cluster, and notifies a * leave event to all the nodes. The cluster driver calls event * handlers even after this function is called, so the left node can * work as a gateway. * * Returns zero on success, -1 on error */ int (*leave)(void); /* * Notify a message to all nodes in the cluster * * This function sends 'msg' to all the nodes. 
The notified messages * can be read through sd_notify_handler() and totally ordered with * node change events. * * Returns SD_RES_XXX */ int (*notify)(void *msg, size_t msg_len); /* * Send a message to all nodes to block further events. * * Once the cluster driver has ensured that events are blocked on all * nodes it needs to call sd_block_handler() on the node where ->block * was called. * * Returns SD_RES_XXX */ int (*block)(void); /* * Unblock events on all nodes, and send a total order message * to all nodes. * * Returns SD_RES_XXX */ int (*unblock)(void *msg, size_t msg_len); /* * Acquire the distributed lock. * * Create a distributed mutually exclusive lock to avoid race condition * and try to acquire the lock. * * This function use 'lock_id' as the id of this distributed lock. * A thread can acquire many locks with different lock_id in one * sheep daemon. * * The cluster lock referenced by 'lock' shall be locked by calling * cluster->lock(). If the cluster lock is already locked, the calling * thread shall block until the cluster lock becomes available. */ void (*lock)(uint64_t lock_id); /* * Release the distributed lock. * * If the owner of the cluster lock release it (or the owner is * killed by accident), zookeeper will trigger zk_watch() which will * wake up all waiting threads to compete new owner of the lock * * After all thread unlock, all the resource of this distributed lock * will be released. */ void (*unlock)(uint64_t lock_id); /* * Update the specific node in the driver's private copy of nodes * * Returns SD_RES_XXX */ int (*update_node)(struct sd_node *); struct list_node list; }; extern struct list_head cluster_drivers; #ifdef HAVE_COROSYNC #define DEFAULT_CLUSTER_DRIVER "corosync" #else #define DEFAULT_CLUSTER_DRIVER "local" #endif /* HAVE_COROSYNC */ #define cdrv_register(driver) \ static void __attribute__((constructor)) regist_ ## driver(void) \ { \ if (!driver.init || !driver.join || !driver.leave || !driver.notify) \ panic("the driver '%s' is incomplete", driver.name); \ list_add(&driver.list, &cluster_drivers); \ } #define FOR_EACH_CLUSTER_DRIVER(driver) \ list_for_each_entry(driver, &cluster_drivers, list) static inline struct cluster_driver *find_cdrv(const char *name) { struct cluster_driver *cdrv; int len; FOR_EACH_CLUSTER_DRIVER(cdrv) { len = strlen(cdrv->name); if (strncmp(cdrv->name, name, len) == 0 && (name[len] == ':' || name[len] == '\0')) return cdrv; } return NULL; } static inline const char *get_cdrv_option(const struct cluster_driver *cdrv, const char *arg) { int len = strlen(cdrv->name); if (arg[len] == ':') return strdup(arg + len + 1); else return NULL; } /* callbacks back into sheepdog from the cluster drivers */ void sd_accept_handler(const struct sd_node *joined, const struct rb_root *nroot, size_t nr_members, const void *opaque); void sd_leave_handler(const struct sd_node *left, const struct rb_root *nroot, size_t nr_members); void sd_notify_handler(const struct sd_node *sender, void *msg, size_t msg_len); bool sd_block_handler(const struct sd_node *sender); int sd_reconnect_handler(void); void sd_update_node_handler(struct sd_node *); bool sd_join_handler(const struct sd_node *joining, const struct rb_root *nroot, size_t nr_nodes, void *opaque); #endif sheepdog-0.8.3/sheep/cluster/000077500000000000000000000000001237656255000161205ustar00rootroot00000000000000sheepdog-0.8.3/sheep/cluster/corosync.c000066400000000000000000000451201237656255000201250ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone 
Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include "cluster.h" #include "event.h" #include "work.h" #define CPG_INIT_RETRY_CNT 10 #define COROSYNC_MAX_NODES 1024 struct cpg_node { uint32_t nodeid; uint32_t pid; struct sd_node node; }; static cpg_handle_t cpg_handle; static struct cpg_name cpg_group = { 8, "sheepdog" }; static corosync_cfg_handle_t cfg_handle; static struct cpg_node this_node; static LIST_HEAD(corosync_block_event_list); static LIST_HEAD(corosync_nonblock_event_list); static struct cpg_node cpg_nodes[COROSYNC_MAX_NODES]; static size_t nr_cpg_nodes; static bool self_elect; static bool join_finished; static int cpg_fd; static size_t nr_majority; /* used for network partition detection */ /* event types which are dispatched in corosync_dispatch() */ enum corosync_event_type { COROSYNC_EVENT_TYPE_JOIN, COROSYNC_EVENT_TYPE_ACCEPT, COROSYNC_EVENT_TYPE_LEAVE, COROSYNC_EVENT_TYPE_BLOCK, COROSYNC_EVENT_TYPE_NOTIFY, COROSYNC_EVENT_TYPE_UPDATE_NODE, }; /* multicast message type */ enum corosync_message_type { COROSYNC_MSG_TYPE_JOIN, COROSYNC_MSG_TYPE_ACCEPT, COROSYNC_MSG_TYPE_LEAVE, COROSYNC_MSG_TYPE_NOTIFY, COROSYNC_MSG_TYPE_BLOCK, COROSYNC_MSG_TYPE_UNBLOCK, COROSYNC_MSG_TYPE_UPDATE_NODE, }; struct corosync_event { enum corosync_event_type type; struct cpg_node sender; void *msg; size_t msg_len; uint32_t nr_nodes; struct cpg_node nodes[COROSYNC_MAX_NODES]; bool callbacked; struct list_node list; }; struct corosync_message { struct cpg_node sender; enum corosync_message_type type:16; uint16_t nr_nodes; uint32_t msg_len; struct cpg_node nodes[COROSYNC_MAX_NODES]; uint8_t msg[0]; }; static int cpg_node_cmp(struct cpg_node *a, struct cpg_node *b) { int cmp = intcmp(a->nodeid, b->nodeid); if (cmp == 0) cmp = intcmp(a->pid, b->pid); return cmp; } static bool cpg_node_equal(struct cpg_node *a, struct cpg_node *b) { return cpg_node_cmp(a, b) == 0; } static inline int find_sd_node(struct cpg_node *nodes, size_t nr_nodes, struct sd_node *key) { int i; for (i = 0; i < nr_nodes; i++) if (node_eq(&nodes[i].node, key)) return i; return -1; } static inline void add_cpg_node(struct cpg_node *nodes, size_t nr_nodes, struct cpg_node *added) { nodes[nr_nodes++] = *added; } static inline void del_cpg_node(struct cpg_node *nodes, size_t nr_nodes, struct cpg_node *deled) { xlremove(deled, nodes, &nr_nodes, cpg_node_cmp); } static int corosync_get_local_addr(uint8_t *addr) { int ret, nr; corosync_cfg_node_address_t caddr; struct sockaddr_storage *ss = (struct sockaddr_storage *)caddr.address; struct sockaddr_in *sin = (struct sockaddr_in *)caddr.address; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)caddr.address; void *saddr; ret = corosync_cfg_get_node_addrs(cfg_handle, this_node.nodeid, 1, &nr, &caddr); if (ret != CS_OK) { sd_err("failed to get node addresses (%d)", ret); return -1; } if (!nr) { sd_err("no node addresses found"); return -1; } if (ss->ss_family == AF_INET6) { saddr = &sin6->sin6_addr; memcpy(addr, saddr, 16); } else if (ss->ss_family == AF_INET) { saddr = &sin->sin_addr; memset(addr, 0, 16); memcpy(addr + 12, saddr, 4); } else { sd_err("unknown protocol %d", ss->ss_family); return -1; } return 0; } static int send_message(enum corosync_message_type type, struct 
cpg_node *sender, struct cpg_node *nodes, size_t nr_nodes, void *msg, size_t msg_len) { struct iovec iov[2]; int ret, iov_cnt = 1; size_t mlen = MIN(msg_len, SD_MAX_EVENT_BUF_SIZE); struct corosync_message cmsg = { .type = type, .msg_len = mlen, .sender = *sender, .nr_nodes = nr_nodes, }; if (nodes) memcpy(cmsg.nodes, nodes, sizeof(*nodes) * nr_nodes); iov[0].iov_base = &cmsg; iov[0].iov_len = sizeof(cmsg); if (msg) { iov[1].iov_base = msg; iov[1].iov_len = mlen; iov_cnt++; } retry: ret = cpg_mcast_joined(cpg_handle, CPG_TYPE_AGREED, iov, iov_cnt); switch (ret) { case CS_OK: break; case CS_ERR_TRY_AGAIN: sd_debug("failed to send message: retrying"); sleep(1); goto retry; default: sd_err("failed to send message (%d)", ret); return SD_RES_CLUSTER_ERROR; } return SD_RES_SUCCESS; } static inline struct corosync_event * find_block_event(enum corosync_event_type type, struct cpg_node *sender) { struct corosync_event *cevent; list_for_each_entry(cevent, &corosync_block_event_list, list) { if (cevent->type == type && cpg_node_equal(&cevent->sender, sender)) return cevent; } return NULL; } static inline struct corosync_event * find_nonblock_event(enum corosync_event_type type, struct cpg_node *sender) { struct corosync_event *cevent; list_for_each_entry(cevent, &corosync_nonblock_event_list, list) { if (cevent->type == type && cpg_node_equal(&cevent->sender, sender)) return cevent; } return NULL; } static inline struct corosync_event * find_event(enum corosync_event_type type, struct cpg_node *sender) { if (type == COROSYNC_EVENT_TYPE_BLOCK) return find_block_event(type, sender); else return find_nonblock_event(type, sender); } static void build_node_list(struct cpg_node *nodes, size_t nr_nodes, struct rb_root *nroot) { for (int i = 0; i < nr_nodes; i++) rb_insert(nroot, &nodes[i].node, rb, node_cmp); } /* * Process one dispatch event * * Returns true if the event is processed */ static bool __corosync_dispatch_one(struct corosync_event *cevent) { struct sd_node *node; struct cpg_node *n; struct rb_root nroot = RB_ROOT; int idx; switch (cevent->type) { case COROSYNC_EVENT_TYPE_JOIN: if (!cevent->msg) /* we haven't receive JOIN yet */ return false; if (cevent->callbacked) /* sd_join_handler() must be called only once */ return false; build_node_list(cpg_nodes, nr_cpg_nodes, &nroot); if (sd_join_handler(&cevent->sender.node, &nroot, nr_cpg_nodes, cevent->msg)) { send_message(COROSYNC_MSG_TYPE_ACCEPT, &cevent->sender, cpg_nodes, nr_cpg_nodes, cevent->msg, cevent->msg_len); cevent->callbacked = true; } return false; case COROSYNC_EVENT_TYPE_ACCEPT: add_cpg_node(cpg_nodes, nr_cpg_nodes, &cevent->sender); nr_cpg_nodes++; build_node_list(cpg_nodes, nr_cpg_nodes, &nroot); sd_accept_handler(&cevent->sender.node, &nroot, nr_cpg_nodes, cevent->msg); break; case COROSYNC_EVENT_TYPE_LEAVE: n = xlfind(&cevent->sender, cpg_nodes, nr_cpg_nodes, cpg_node_cmp); if (n == NULL) break; cevent->sender.node = n->node; del_cpg_node(cpg_nodes, nr_cpg_nodes, &cevent->sender); nr_cpg_nodes--; build_node_list(cpg_nodes, nr_cpg_nodes, &nroot); sd_leave_handler(&cevent->sender.node, &nroot, nr_cpg_nodes); break; case COROSYNC_EVENT_TYPE_BLOCK: if (cevent->callbacked) /* * block events until the unblock message * removes this event */ return false; cevent->callbacked = sd_block_handler(&cevent->sender.node); return false; case COROSYNC_EVENT_TYPE_NOTIFY: sd_notify_handler(&cevent->sender.node, cevent->msg, cevent->msg_len); break; case COROSYNC_EVENT_TYPE_UPDATE_NODE: node = &cevent->sender.node; if 
(cpg_node_equal(&cevent->sender, &this_node)) this_node = cevent->sender; idx = find_sd_node(cpg_nodes, nr_cpg_nodes, node); assert(idx >= 0); cpg_nodes[idx].node = *node; sd_update_node_handler(node); break; } return true; } static bool update_join_status(struct corosync_event *cevent) { if (join_finished) return true; switch (cevent->type) { case COROSYNC_EVENT_TYPE_JOIN: if (self_elect) { nr_cpg_nodes = 0; return true; } break; case COROSYNC_EVENT_TYPE_ACCEPT: if (cpg_node_equal(&cevent->sender, &this_node)) { nr_cpg_nodes = cevent->nr_nodes; memcpy(cpg_nodes, cevent->nodes, sizeof(*cevent->nodes) * cevent->nr_nodes); return true; } break; default: break; } return false; } static void __corosync_dispatch(void) { struct corosync_event *cevent; struct pollfd pfd = { .fd = cpg_fd, .events = POLLIN, }; if (poll(&pfd, 1, 0)) { /* * Corosync dispatches leave events one by one even * when network partition has occured. To count the * number of alive nodes correctly, we postpone * processsing events if there are incoming ones. */ sd_debug("wait for a next dispatch event"); return; } nr_majority = 0; while (!list_empty(&corosync_block_event_list) || !list_empty(&corosync_nonblock_event_list)) { if (!list_empty(&corosync_nonblock_event_list)) cevent = list_first_entry(&corosync_nonblock_event_list, typeof(*cevent), list); else cevent = list_first_entry(&corosync_block_event_list, typeof(*cevent), list); join_finished = update_join_status(cevent); if (join_finished) { if (!__corosync_dispatch_one(cevent)) return; } else { switch (cevent->type) { case COROSYNC_MSG_TYPE_JOIN: case COROSYNC_MSG_TYPE_BLOCK: return; default: break; } } list_del(&cevent->list); free(cevent->msg); free(cevent); } } static struct corosync_event * update_event(enum corosync_event_type type, struct cpg_node *sender, void *msg, size_t msg_len) { struct corosync_event *cevent; cevent = find_event(type, sender); if (!cevent) /* block message was casted before this node joins */ return NULL; cevent->msg_len = msg_len; if (msg_len) { cevent->msg = realloc(cevent->msg, msg_len); if (!cevent->msg) panic("failed to allocate memory"); memcpy(cevent->msg, msg, msg_len); } else { free(cevent->msg); cevent->msg = NULL; } return cevent; } static void queue_event(struct corosync_event *cevent) { if (cevent->type == COROSYNC_EVENT_TYPE_BLOCK) list_add_tail(&cevent->list, &corosync_block_event_list); else list_add_tail(&cevent->list, &corosync_nonblock_event_list); } static void cdrv_cpg_deliver(cpg_handle_t handle, const struct cpg_name *group_name, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { struct corosync_event *cevent; struct corosync_message *cmsg = msg; sd_debug("%d", cmsg->type); switch (cmsg->type) { case COROSYNC_MSG_TYPE_JOIN: cevent = update_event(COROSYNC_EVENT_TYPE_JOIN, &cmsg->sender, cmsg->msg, cmsg->msg_len); if (!cevent) break; cevent->sender = cmsg->sender; cevent->msg_len = cmsg->msg_len; break; case COROSYNC_MSG_TYPE_UNBLOCK: cevent = update_event(COROSYNC_EVENT_TYPE_BLOCK, &cmsg->sender, cmsg->msg, cmsg->msg_len); if (cevent) { list_del(&cevent->list); free(cevent->msg); free(cevent); } /* fall through */ case COROSYNC_MSG_TYPE_BLOCK: case COROSYNC_MSG_TYPE_NOTIFY: case COROSYNC_MSG_TYPE_UPDATE_NODE: cevent = xzalloc(sizeof(*cevent)); switch (cmsg->type) { case COROSYNC_MSG_TYPE_BLOCK: cevent->type = COROSYNC_EVENT_TYPE_BLOCK; break; case COROSYNC_MSG_TYPE_UPDATE_NODE: cevent->type = COROSYNC_EVENT_TYPE_UPDATE_NODE; break; default: cevent->type = COROSYNC_EVENT_TYPE_NOTIFY; break; } cevent->sender = 
cmsg->sender; cevent->msg_len = cmsg->msg_len; if (cmsg->msg_len) { cevent->msg = xzalloc(cmsg->msg_len); memcpy(cevent->msg, cmsg->msg, cmsg->msg_len); } else cevent->msg = NULL; queue_event(cevent); break; case COROSYNC_MSG_TYPE_LEAVE: cevent = xzalloc(sizeof(*cevent)); cevent->type = COROSYNC_EVENT_TYPE_LEAVE; cevent->sender = cmsg->sender; cevent->msg_len = cmsg->msg_len; if (cmsg->msg_len) { cevent->msg = xzalloc(cmsg->msg_len); memcpy(cevent->msg, cmsg->msg, cmsg->msg_len); } else cevent->msg = NULL; queue_event(cevent); break; case COROSYNC_MSG_TYPE_ACCEPT: cevent = update_event(COROSYNC_EVENT_TYPE_JOIN, &cmsg->sender, cmsg->msg, cmsg->msg_len); if (!cevent) break; cevent->type = COROSYNC_EVENT_TYPE_ACCEPT; cevent->nr_nodes = cmsg->nr_nodes; memcpy(cevent->nodes, cmsg->nodes, sizeof(*cmsg->nodes) * cmsg->nr_nodes); break; } __corosync_dispatch(); } static void build_cpg_node_list(struct cpg_node *nodes, const struct cpg_address *list, size_t nr) { int i; for (i = 0; i < nr; i++) { nodes[i].nodeid = list[i].nodeid; nodes[i].pid = list[i].pid; } } static void cdrv_cpg_confchg(cpg_handle_t handle, const struct cpg_name *group_name, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries) { struct corosync_event *cevent; int i; struct cpg_node member_sheep[COROSYNC_MAX_NODES]; struct cpg_node joined_sheep[COROSYNC_MAX_NODES]; struct cpg_node left_sheep[COROSYNC_MAX_NODES]; bool promote = true; sd_debug("mem:%zu, joined:%zu, left:%zu", member_list_entries, joined_list_entries, left_list_entries); /* check network partition */ if (left_list_entries) { if (nr_majority == 0) { size_t total = member_list_entries + left_list_entries; /* * we need at least 3 nodes to handle network * partition failure */ if (total > 2) nr_majority = total / 2 + 1; } if (member_list_entries == 0) panic("NIC failure?"); if (member_list_entries < nr_majority) panic("Network partition is detected"); } /* convert cpg_address to cpg_node */ build_cpg_node_list(member_sheep, member_list, member_list_entries); build_cpg_node_list(left_sheep, left_list, left_list_entries); build_cpg_node_list(joined_sheep, joined_list, joined_list_entries); /* dispatch leave_handler */ for (i = 0; i < left_list_entries; i++) { cevent = find_event(COROSYNC_EVENT_TYPE_JOIN, left_sheep + i); if (cevent) { /* the node left before joining */ list_del(&cevent->list); free(cevent->msg); free(cevent); continue; } cevent = find_event(COROSYNC_EVENT_TYPE_BLOCK, left_sheep + i); if (cevent) { /* the node left before sending UNBLOCK */ list_del(&cevent->list); free(cevent->msg); free(cevent); } cevent = xzalloc(sizeof(*cevent)); cevent->type = COROSYNC_EVENT_TYPE_LEAVE; cevent->sender = left_sheep[i]; queue_event(cevent); } /* dispatch join_handler */ for (i = 0; i < joined_list_entries; i++) { cevent = xzalloc(sizeof(*cevent)); cevent->type = COROSYNC_EVENT_TYPE_JOIN; cevent->sender = joined_sheep[i]; queue_event(cevent); } if (!join_finished) { /* * Exactly one non-master member has seen join events for * all other members, because events are ordered. */ for (i = 0; i < member_list_entries; i++) { cevent = find_event(COROSYNC_EVENT_TYPE_JOIN, &member_sheep[i]); if (!cevent) { sd_debug("Not promoting because member is not " "in our event list."); promote = false; break; } } /* * If we see the join events for all nodes promote ourself to * master right here. 
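	 * Because corosync delivers these events in total order, at most
	 * one member can be the first to have seen join events for all
	 * the others, so two nodes never promote themselves at once.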
*/ if (promote) self_elect = true; } __corosync_dispatch(); } static int corosync_join(const struct sd_node *myself, void *opaque, size_t opaque_len) { int ret; retry: ret = cpg_join(cpg_handle, &cpg_group); switch (ret) { case CS_OK: break; case CS_ERR_TRY_AGAIN: sd_debug("failed to join the sheepdog group: retrying"); sleep(1); goto retry; case CS_ERR_SECURITY: sd_err("permission denied to join the sheepdog group"); return -1; default: sd_err("failed to join the sheepdog group (%d)", ret); return -1; } this_node.node = *myself; ret = send_message(COROSYNC_MSG_TYPE_JOIN, &this_node, NULL, 0, opaque, opaque_len); return ret; } static int corosync_leave(void) { return send_message(COROSYNC_MSG_TYPE_LEAVE, &this_node, NULL, 0, NULL, 0); } static int corosync_block(void) { return send_message(COROSYNC_MSG_TYPE_BLOCK, &this_node, NULL, 0, NULL, 0); } static int corosync_unblock(void *msg, size_t msg_len) { return send_message(COROSYNC_MSG_TYPE_UNBLOCK, &this_node, NULL, 0, msg, msg_len); } static int corosync_notify(void *msg, size_t msg_len) { return send_message(COROSYNC_MSG_TYPE_NOTIFY, &this_node, NULL, 0, msg, msg_len); } static void corosync_handler(int listen_fd, int events, void *data) { int ret; if (events & EPOLLHUP) { sd_err("corosync driver received EPOLLHUP event, exiting."); goto out; } ret = cpg_dispatch(cpg_handle, CS_DISPATCH_ALL); if (ret != CS_OK) { sd_err("cpg_dispatch returned %d", ret); goto out; } return; out: log_close(); exit(1); } static int corosync_init(const char *option) { int ret, retry_cnt = 0; uint32_t nodeid; cpg_callbacks_t cb = { .cpg_deliver_fn = cdrv_cpg_deliver, .cpg_confchg_fn = cdrv_cpg_confchg }; again: ret = cpg_initialize(&cpg_handle, &cb); switch (ret) { case CS_OK: /* success */ break; case CS_ERR_TRY_AGAIN: if (retry_cnt++ == CPG_INIT_RETRY_CNT) { sd_err("failed to initialize cpg (%d) - " "is corosync running?", ret); return -1; } sd_debug("retry cpg_initialize"); usleep(200000); goto again; case CS_ERR_SECURITY: sd_err("failed to initialize cpg - permission denied"); return -1; default: sd_err("failed to initialize cpg (%d)", ret); return -1; } ret = corosync_cfg_initialize(&cfg_handle, NULL); if (ret != CS_OK) { sd_err("failed to initialize cfg (%d)", ret); return -1; } ret = corosync_cfg_local_get(cfg_handle, &nodeid); if (ret != CS_OK) { sd_err("failed to get node id (%d)", ret); return -1; } this_node.nodeid = nodeid; this_node.pid = getpid(); ret = cpg_fd_get(cpg_handle, &cpg_fd); if (ret != CS_OK) { sd_err("failed to get cpg file descriptor (%d)", ret); return -1; } ret = register_event(cpg_fd, corosync_handler, NULL); if (ret) { sd_err("failed to register corosync event handler (%d)", ret); return -1; } return 0; } static void corosync_lock(uint64_t lock_id) { } static void corosync_unlock(uint64_t lock_id) { } static int corosync_update_node(struct sd_node *node) { struct cpg_node cnode = this_node; cnode.node = *node; return send_message(COROSYNC_MSG_TYPE_UPDATE_NODE, &cnode, NULL, 0, NULL, 0); } static struct cluster_driver cdrv_corosync = { .name = "corosync", .init = corosync_init, .get_local_addr = corosync_get_local_addr, .join = corosync_join, .leave = corosync_leave, .notify = corosync_notify, .block = corosync_block, .unblock = corosync_unblock, .lock = corosync_lock, .unlock = corosync_unlock, .update_node = corosync_update_node, }; cdrv_register(cdrv_corosync); sheepdog-0.8.3/sheep/cluster/local.c000066400000000000000000000366341237656255000173720ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and 
Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include "cluster.h" #include "event.h" #include "work.h" #include "util.h" #include "rbtree.h" #define MAX_EVENTS 500 #define PROCESS_CHECK_INTERVAL 50 /* ms */ #define LOCAL_MAX_NODES 1024 static const char *shmfile = "/tmp/sheepdog_shm"; static const char *lockdir = "/tmp/sheepdog_locks/"; /* use lock_tree to find lock quickly */ static struct rb_root lock_tree_root = RB_ROOT; /* use global_lock to protect lock_tree */ static struct sd_mutex *global_lock; /* * a lock may be used by several processes(or threads) at the same time, * so we should add 'ref' to avoid one process release a lock which * still used by another process. */ struct lock_entry { struct rb_node rb; int fd; uint64_t lock_id; uint64_t ref; struct sd_mutex *mutex; }; static int shmfd; static int sigfd; static int block_event_pos; static int nonblock_event_pos; static struct local_node this_node; static bool joined; struct local_node { struct sd_node node; pid_t pid; bool gateway; }; static const char *lnode_to_str(struct local_node *lnode) { static __thread char s[MAX_NODE_STR_LEN + 32]; snprintf(s, sizeof(s), "%s pid:%d", node_to_str(&lnode->node), lnode->pid); return s; } static int lnode_cmp(const struct local_node *a, const struct local_node *b) { return node_cmp(&a->node, &b->node); } static bool lnode_eq(const struct local_node *a, const struct local_node *b) { return lnode_cmp(a, b) == 0; } enum local_event_type { EVENT_JOIN = 1, EVENT_ACCEPT, EVENT_LEAVE, EVENT_GATEWAY, EVENT_BLOCK, EVENT_NOTIFY, EVENT_UPDATE_NODE, }; struct local_event { enum local_event_type type; struct local_node sender; bool callbacked; bool removed; size_t buf_len; uint8_t buf[SD_MAX_EVENT_BUF_SIZE]; size_t nr_lnodes; /* the number of sheep processes */ struct local_node lnodes[LOCAL_MAX_NODES]; }; /* shared memory queue */ static struct shm_queue { int block_event_pos; struct local_event block_events[MAX_EVENTS]; int nonblock_event_pos; struct local_event nonblock_events[MAX_EVENTS]; } *shm_queue; static inline void node_insert(struct sd_node *new, struct rb_root *root) { if (rb_insert(root, new, rb, node_cmp)) panic("insert duplicate %s", node_to_str(new)); } static int xflock(int fd, int operation) { int ret; do { ret = flock(fd, operation); } while (ret < 0 && (errno == EAGAIN || errno == EINTR)); return ret; } static void shm_queue_lock(void) { xflock(shmfd, LOCK_EX); } static void shm_queue_unlock(void) { xflock(shmfd, LOCK_UN); } static size_t get_nodes(struct local_node *n) { struct local_event *ev; ev = shm_queue->nonblock_events + shm_queue->nonblock_event_pos; if (n) memcpy(n, ev->lnodes, sizeof(ev->lnodes)); return ev->nr_lnodes; } static int process_exists(pid_t pid) { return kill(pid, 0) == 0; } static struct local_event *shm_queue_peek_block_event(void) { return shm_queue->block_events + (block_event_pos + 1) % MAX_EVENTS; } static struct local_event *shm_queue_peek_nonblock_event(void) { return shm_queue->nonblock_events + (nonblock_event_pos + 1) % MAX_EVENTS; } static struct local_event *shm_queue_peek(void) { /* try to peek nonblock queue first */ if (nonblock_event_pos != shm_queue->nonblock_event_pos) return 
shm_queue_peek_nonblock_event(); else if (block_event_pos != shm_queue->block_event_pos) return shm_queue_peek_block_event(); else return NULL; } static void shm_queue_push(struct local_event *ev) { int pos; if (ev->type == EVENT_BLOCK) { pos = (shm_queue->block_event_pos + 1) % MAX_EVENTS; shm_queue->block_events[pos] = *ev; msync(shm_queue->block_events + pos, sizeof(*ev), MS_SYNC); shm_queue->block_event_pos = pos; msync(&shm_queue->block_event_pos, sizeof(pos), MS_SYNC); } else { pos = (shm_queue->nonblock_event_pos + 1) % MAX_EVENTS; shm_queue->nonblock_events[pos] = *ev; msync(shm_queue->nonblock_events + pos, sizeof(*ev), MS_SYNC); shm_queue->nonblock_event_pos = pos; msync(&shm_queue->nonblock_event_pos, sizeof(pos), MS_SYNC); } } static void shm_queue_remove(struct local_event *ev) { if (ev == shm_queue_peek_block_event()) block_event_pos = (block_event_pos + 1) % MAX_EVENTS; else nonblock_event_pos = (nonblock_event_pos + 1) % MAX_EVENTS; } static void shm_queue_notify(void) { int i; size_t nr; struct local_node lnodes[LOCAL_MAX_NODES]; nr = get_nodes(lnodes); for (i = 0; i < nr; i++) { sd_debug("send signal to %s", lnode_to_str(lnodes + i)); kill(lnodes[i].pid, SIGUSR1); } } static bool is_shm_queue_valid(void) { int i; size_t nr; struct local_node lnodes[LOCAL_MAX_NODES]; nr = get_nodes(lnodes); if (nr == 0) return true; for (i = 0; i < nr; i++) if (process_exists(lnodes[i].pid)) return true; return false; } static void shm_queue_init(void) { int ret; shmfd = open(shmfile, O_CREAT | O_RDWR, 0644); if (shmfd < 0) panic("cannot open shared file, %s", shmfile); shm_queue_lock(); ret = xftruncate(shmfd, sizeof(*shm_queue)); if (ret != 0) panic("failed to truncate shmfile, %m"); shm_queue = mmap(NULL, sizeof(*shm_queue), PROT_READ | PROT_WRITE, MAP_SHARED, shmfd, 0); if (shm_queue == MAP_FAILED) panic("mmap error, %m"); if (is_shm_queue_valid()) { block_event_pos = shm_queue->block_event_pos; nonblock_event_pos = shm_queue->nonblock_event_pos; } else { /* initialize shared memory */ block_event_pos = 0; nonblock_event_pos = 0; ret = xftruncate(shmfd, 0); if (ret != 0) panic("failed to truncate shmfile, %m"); ret = xftruncate(shmfd, sizeof(*shm_queue)); if (ret != 0) panic("failed to truncate shmfile, %m"); } shm_queue_unlock(); } static int add_event(enum local_event_type type, struct local_node *lnode, void *buf, size_t buf_len) { struct local_node *n; struct local_event ev = { .type = type, .sender = *lnode, }; buf_len = MIN(buf_len, SD_MAX_EVENT_BUF_SIZE); ev.buf_len = buf_len; if (buf) memcpy(ev.buf, buf, buf_len); ev.nr_lnodes = get_nodes(ev.lnodes); switch (type) { case EVENT_JOIN: ev.lnodes[ev.nr_lnodes] = *lnode; ev.nr_lnodes++; break; case EVENT_LEAVE: xlremove(lnode, ev.lnodes, &ev.nr_lnodes, lnode_cmp); break; case EVENT_GATEWAY: n = xlfind(lnode, ev.lnodes, ev.nr_lnodes, lnode_cmp); n->gateway = true; break; case EVENT_NOTIFY: case EVENT_BLOCK: break; case EVENT_UPDATE_NODE: n = xlfind(lnode, ev.lnodes, ev.nr_lnodes, lnode_cmp); n->node = lnode->node; break; case EVENT_ACCEPT: abort(); } sd_debug("type = %d, sender = %s", ev.type, lnode_to_str(&ev.sender)); for (int i = 0; i < ev.nr_lnodes; i++) sd_debug("%d: %s", i, lnode_to_str(ev.lnodes + i)); shm_queue_push(&ev); shm_queue_notify(); return SD_RES_SUCCESS; } static int add_event_lock(enum local_event_type type, struct local_node *lnode, void *buf, size_t buf_len) { int ret; shm_queue_lock(); ret = add_event(type, lnode, buf, buf_len); shm_queue_unlock(); return ret; } static void check_pids(void *arg) { int i; size_t 
nr; struct local_node lnodes[LOCAL_MAX_NODES]; struct local_event *ev; shm_queue_lock(); nr = get_nodes(lnodes); for (i = 0; i < nr; i++) if (!process_exists(lnodes[i].pid)) { add_event(EVENT_LEAVE, lnodes + i, NULL, 0); /* unblock blocking event if sender has gone */ ev = shm_queue_peek_block_event(); if (lnode_eq(lnodes + i, &ev->sender)) { ev->removed = true; msync(ev, sizeof(*ev), MS_SYNC); } } shm_queue_unlock(); add_timer(arg, PROCESS_CHECK_INTERVAL); } /* Local driver APIs */ static int local_join(const struct sd_node *myself, void *opaque, size_t opaque_len) { this_node.node = *myself; this_node.pid = getpid(); this_node.gateway = false; return add_event_lock(EVENT_JOIN, &this_node, opaque, opaque_len); } static int local_leave(void) { return add_event_lock(EVENT_GATEWAY, &this_node, NULL, 0); } static int local_notify(void *msg, size_t msg_len) { return add_event_lock(EVENT_NOTIFY, &this_node, msg, msg_len); } static int local_block(void) { return add_event_lock(EVENT_BLOCK, &this_node, NULL, 0); } static int local_unblock(void *msg, size_t msg_len) { struct local_event *ev; shm_queue_lock(); ev = shm_queue_peek_block_event(); ev->removed = true; msync(ev, sizeof(*ev), MS_SYNC); add_event(EVENT_NOTIFY, &this_node, msg, msg_len); shm_queue_unlock(); return SD_RES_SUCCESS; } /* Returns true if an event is processed */ static bool local_process_event(void) { struct local_event *ev; int i; struct rb_root root = RB_ROOT; size_t nr_nodes = 0; ev = shm_queue_peek(); if (!ev) return false; sd_debug("type = %d, sender = %s", ev->type, lnode_to_str(&ev->sender)); sd_debug("callbacked = %d, removed = %d", ev->callbacked, ev->removed); if (ev->removed) goto out; if (ev->callbacked) return false; /* wait for unblock event */ if (!joined) { if (!lnode_eq(&this_node, &ev->sender)) goto out; switch (ev->type) { case EVENT_JOIN: break; case EVENT_ACCEPT: sd_debug("join Sheepdog"); joined = true; break; default: goto out; } } for (i = 0; i < ev->nr_lnodes; i++) { sd_debug("%d: %s", i, lnode_to_str(ev->lnodes + i)); if (!ev->lnodes[i].gateway) { node_insert(&ev->lnodes[i].node, &root); nr_nodes++; } } switch (ev->type) { case EVENT_JOIN: for (i = 0; i < ev->nr_lnodes; i++) if (node_eq(&ev->sender.node, &ev->lnodes[i].node)) { rb_erase(&ev->lnodes[i].node.rb, &root); nr_nodes--; } if (sd_join_handler(&ev->sender.node, &root, nr_nodes, ev->buf)) { ev->type = EVENT_ACCEPT; msync(ev, sizeof(*ev), MS_SYNC); shm_queue_notify(); } return false; case EVENT_ACCEPT: sd_accept_handler(&ev->sender.node, &root, nr_nodes, ev->buf); break; case EVENT_LEAVE: if (ev->sender.gateway) { sd_debug("gateway %s left sheepdog", lnode_to_str(&ev->sender)); break; } /* fall through */ case EVENT_GATEWAY: sd_leave_handler(&ev->sender.node, &root, nr_nodes); break; case EVENT_BLOCK: ev->callbacked = sd_block_handler(&ev->sender.node); msync(ev, sizeof(*ev), MS_SYNC); return false; case EVENT_NOTIFY: sd_notify_handler(&ev->sender.node, ev->buf, ev->buf_len); break; case EVENT_UPDATE_NODE: if (lnode_eq(&ev->sender, &this_node)) this_node = ev->sender; sd_update_node_handler(&ev->sender.node); break; } out: shm_queue_remove(ev); return true; } static void local_handler(int listen_fd, int events, void *data) { struct signalfd_siginfo siginfo; int ret; if (events & EPOLLHUP) { sd_err("local driver received EPOLLHUP event, exiting."); log_close(); exit(1); } sd_debug("read siginfo"); ret = read(sigfd, &siginfo, sizeof(siginfo)); if (ret != sizeof(siginfo)) panic("failed to read from sigfd, %m"); shm_queue_lock(); while 
(local_process_event()) ; shm_queue_unlock(); } static int local_get_local_addr(uint8_t *myaddr) { /* set 127.0.0.1 */ memset(myaddr, 0, 16); myaddr[12] = 127; myaddr[15] = 1; return 0; } /* * pthread_mutex with attribute of PTHREAD_PROCESS_SHARED could be * used by different threads in different processes. * We put pthread_mutex_t in shared-memory so any process could easily * get it. */ static struct sd_mutex *get_shared_lock(const char *path, int *fd) { struct sd_mutex *pmutex; pthread_mutexattr_t mutex_attr; int ret, flags = O_RDWR; bool created = false; ret = access(path, R_OK|W_OK); if (!ret) created = true; else if (errno != ENOENT) panic("failed to access %s, %m", path); if (!created) flags |= O_CREAT; *fd = open(path, flags, sd_def_fmode); if (*fd < 0) panic("failed to open %s, %m", path); if (!created) { ret = ftruncate(*fd, sizeof(pthread_mutex_t)); if (ret < 0) panic("failed to ftruncate %s, %m", path); } pmutex = (struct sd_mutex *)mmap(NULL, sizeof(struct sd_mutex), PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0); if (!pmutex) panic("failed to mmap %s, %m", path); if (!created) { if (pthread_mutexattr_init(&mutex_attr)) panic("failed to init mutexattr, %m"); if (pthread_mutexattr_setpshared(&mutex_attr, PTHREAD_PROCESS_SHARED)) panic("failed to setpshared mutexattr, %m"); sd_init_mutex_attr(pmutex, &mutex_attr); } return pmutex; } static int local_init(const char *option) { sigset_t mask; int ret, fd; char path[PATH_MAX]; static struct timer t = { .callback = check_pids, .data = &t, }; if (option) shmfile = option; shm_queue_init(); sigemptyset(&mask); sigaddset(&mask, SIGUSR1); sigprocmask(SIG_BLOCK, &mask, NULL); sigfd = signalfd(-1, &mask, SFD_NONBLOCK); if (sigfd < 0) { sd_err("failed to create a signal fd: %m"); return -1; } add_timer(&t, PROCESS_CHECK_INTERVAL); ret = register_event(sigfd, local_handler, NULL); if (ret) { sd_err("failed to register local event handler (%d)", ret); return -1; } ret = xmkdir(lockdir, sd_def_dmode); if (ret < 0) { sd_err("failed to create lockdir %s, %m", lockdir); return -1; } ret = purge_directory(lockdir); if (ret < 0) { sd_err("failed to purge lockdir %s, %m", lockdir); return -1; } snprintf(path, sizeof(path), "%s%s", lockdir, "global_lock"); global_lock = get_shared_lock(path, &fd); sd_debug("create global_lock"); return 0; } static int lock_cmp(struct lock_entry *a, struct lock_entry *b) { return intcmp(a->lock_id, b->lock_id); } static struct lock_entry *lock_tree_lookup(uint64_t lock_id) { struct lock_entry entry = { .lock_id = lock_id, }; return rb_search(&lock_tree_root, &entry, rb, lock_cmp); } static struct lock_entry *lock_tree_add(struct lock_entry *new) { return rb_insert(&lock_tree_root, new, rb, lock_cmp); } static void local_lock(uint64_t lock_id) { struct lock_entry *entry; sd_mutex_lock(global_lock); entry = lock_tree_lookup(lock_id); if (!entry) { char path[PATH_MAX]; int fd; snprintf(path, sizeof(path), "%s%016"PRIx64, lockdir, lock_id); entry = xmalloc(sizeof(*entry)); entry->lock_id = lock_id; entry->mutex = get_shared_lock(path, &fd); entry->fd = fd; entry->ref = 0; lock_tree_add(entry); } entry->ref++; sd_mutex_unlock(global_lock); sd_mutex_lock(entry->mutex); } static void local_unlock(uint64_t lock_id) { struct lock_entry *entry; sd_mutex_lock(global_lock); entry = lock_tree_lookup(lock_id); if (!entry) panic("can't find fd for lock %"PRIx64, lock_id); sd_mutex_unlock(entry->mutex); entry->ref--; if (!entry->ref) { munmap(entry->mutex, sizeof(pthread_mutex_t)); close(entry->fd); rb_erase(&entry->rb, &lock_tree_root); 
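/*
 * The entry is unlinked from lock_tree_root and its shared mutex
 * unmapped at this point; freeing it below cannot race with
 * local_lock() because global_lock serializes every lookup of the tree.
 */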
free(entry); } sd_mutex_unlock(global_lock); } static int local_update_node(struct sd_node *node) { struct local_node lnode = this_node; lnode.node = *node; return add_event_lock(EVENT_UPDATE_NODE, &lnode, NULL, 0); } static struct cluster_driver cdrv_local = { .name = "local", .init = local_init, .get_local_addr = local_get_local_addr, .join = local_join, .leave = local_leave, .notify = local_notify, .block = local_block, .unblock = local_unblock, .lock = local_lock, .unlock = local_unlock, .update_node = local_update_node, }; cdrv_register(cdrv_local); sheepdog-0.8.3/sheep/cluster/shepherd.c000066400000000000000000000326251237656255000200760ustar00rootroot00000000000000/* * Copyright (C) 2013 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include "cluster.h" #include "event.h" #include "shepherd.h" #include "internal_proto.h" #include "net.h" static int sph_comm_fd; static struct sd_node this_node; static int nr_nodes; static struct sd_node nodes[SD_MAX_NODES]; enum sph_driver_state { STATE_PRE_JOIN, STATE_JOINED, }; static enum sph_driver_state state = STATE_PRE_JOIN; static char *kept_opaque; static size_t kept_opaque_len; static int do_shepherd_join(void) { int ret, msg_join_len; struct sph_msg msg; struct sph_msg_join *msg_join; msg_join_len = sizeof(struct sph_msg_join) + kept_opaque_len; memset(&msg, 0, sizeof(msg)); msg.type = SPH_CLI_MSG_JOIN; msg.body_len = msg_join_len; msg_join = xzalloc(msg_join_len); msg_join->new_node = this_node; memcpy(msg_join->opaque, kept_opaque, kept_opaque_len); ret = writev2(sph_comm_fd, &msg, msg_join, msg_join_len); if (sizeof(msg) + msg_join_len != ret) { sd_err("do_shepherd_join() failed, %m"); free(msg_join); return -1; } free(msg_join); return 0; } static void read_msg(struct sph_msg *rcv) { int ret; ret = xread(sph_comm_fd, rcv, sizeof(*rcv)); if (ret != sizeof(*rcv)) { sd_err("xread() failed: %m"); exit(1); } } static void interpret_msg_pre_join(void) { int ret; struct sph_msg snd, rcv; struct sph_msg_join_reply *join_reply; retry: read_msg(&rcv); if (rcv.type == SPH_SRV_MSG_JOIN_RETRY) { sd_info("join request is rejected, retrying"); do_shepherd_join(); goto retry; } else if (rcv.type == SPH_SRV_MSG_NEW_NODE) { struct sph_msg_join *join; int join_len; join_len = rcv.body_len; join = xzalloc(join_len); ret = xread(sph_comm_fd, join, join_len); if (ret != join_len) { sd_err("xread() failed: %m"); exit(1); } /* * FIXME: member change events must be ordered with nonblocked * events */ if (!sd_join_handler(&join->new_node, NULL, 0, join->opaque)) panic("sd_accept_handler() failed"); snd.type = SPH_CLI_MSG_ACCEPT; snd.body_len = join_len; ret = writev2(sph_comm_fd, &snd, join, join_len); if (sizeof(snd) + join_len != ret) { sd_err("writev2() failed: %m"); exit(1); } free(join); read_msg(&rcv); } if (rcv.type != SPH_SRV_MSG_JOIN_REPLY) { sd_err("unexpected message from shepherd, received message: %s", sph_srv_msg_to_str(rcv.type)); /* * In this case, the state of this sheep in shepherd must be * SHEEP_STATE_CONNECTED. Messages other than SPH_MSG_JOIN_REPLY * mean bugs of shepherd. 
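 * (While still in SHEEP_STATE_CONNECTED, this sheep only ever expects
 * SPH_SRV_MSG_JOIN_RETRY, SPH_SRV_MSG_NEW_NODE or SPH_SRV_MSG_JOIN_REPLY,
 * which is exactly the set that interpret_msg_pre_join() handles above.)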
*/ exit(1); } join_reply = xzalloc(rcv.body_len); ret = xread(sph_comm_fd, join_reply, rcv.body_len); if (ret != rcv.body_len) { sd_err("xread() failed: %m"); exit(1); } sd_info("join reply arrived, nr_nodes: %d", join_reply->nr_nodes); memcpy(nodes, join_reply->nodes, join_reply->nr_nodes * sizeof(struct sd_node)); nr_nodes = join_reply->nr_nodes; /* FIXME: member change events must be ordered with nonblocked events */ sd_accept_handler(&this_node, nodes, nr_nodes, join_reply->opaque); free(join_reply); sd_info("shepherd_join() succeed"); state = STATE_JOINED; } struct sph_event { struct sd_node sender; void *msg; int msg_len; bool callbacked, removed; struct list_node event_list; }; static LIST_HEAD(nonblocked_event_list); static LIST_HEAD(blocked_event_list); static int sph_event_fd; static bool sph_process_event(void) { struct sph_event *ev; bool nonblock; if (!list_empty(&nonblocked_event_list)) { ev = list_first_entry(&nonblocked_event_list, struct sph_event, event_list); nonblock = true; } else if (!list_empty(&blocked_event_list)) { ev = list_first_entry(&blocked_event_list, struct sph_event, event_list); nonblock = false; } else return false; if (ev->removed) goto remove; if (ev->callbacked) return false; if (nonblock) { sd_debug("processing nonblock event"); sd_notify_handler(&ev->sender, ev->msg, ev->msg_len); } else { sd_debug("processing block event"); ev->callbacked = sd_block_handler(&ev->sender); return false; } remove: list_del(&ev->event_list); free(ev->msg); free(ev); return true; } static void push_sph_event(bool nonblock, struct sd_node *sender, void *msg, int msg_len) { struct sph_event *ev; sd_debug("push_sph_event() called, pushing %sblocking event", nonblock ? "non" : ""); ev = xzalloc(sizeof(*ev)); ev->sender = *sender; if (msg_len) { ev->msg = xzalloc(msg_len); memcpy(ev->msg, msg, msg_len); ev->msg_len = msg_len; } ev->removed = false; ev->callbacked = false; if (nonblock) list_add_tail(&ev->event_list, &nonblocked_event_list); else list_add_tail(&ev->event_list, &blocked_event_list); eventfd_xwrite(sph_event_fd, 1); } static void remove_one_block_event(void) { struct sph_event *ev; bool removed = false; if (list_empty(&blocked_event_list)) /* FIXME: should I treat this case as an error? */ return; list_for_each_entry(ev, &blocked_event_list, event_list) { if (ev->removed) continue; removed = ev->removed = true; break; } if (!removed) panic("removed is not true"); eventfd_xwrite(sph_event_fd, 1); sd_debug("unblock a blocking event"); } static void sph_event_handler(int fd, int events, void *data) { eventfd_xread(fd); while (sph_process_event()) ; } static void msg_new_node(struct sph_msg *rcv) { int ret; struct sph_msg_join *join; struct sph_msg snd; join = xzalloc(rcv->body_len); ret = xread(sph_comm_fd, join, rcv->body_len); if (ret != rcv->body_len) { sd_err("xread() failed: %m"); exit(1); } /* FIXME: member change events must be ordered with nonblocked events */ if (!sd_join_handler(&join->new_node, join->nodes, join->nr_nodes, join->opaque)) /* * This should succeed always because shepherd should have sent * SPH_SRV_MSG_NEW_NODE only to the already joined node. 
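 * Here sd_join_handler() validates a remote node's join request on
 * behalf of the already-joined members rather than joining itself, so
 * a failure indicates a shepherd protocol violation, not a local error.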
*/ panic("sd_join_handler() failed"); memset(&snd, 0, sizeof(snd)); snd.type = SPH_CLI_MSG_ACCEPT; snd.body_len = rcv->body_len; ret = writev2(sph_comm_fd, &snd, join, rcv->body_len); if (sizeof(snd) + rcv->body_len != ret) { sd_err("writev() failed: %m"); exit(1); } free(join); } static void msg_new_node_finish(struct sph_msg *rcv) { int ret; struct sph_msg_join_node_finish *join_node_finish; join_node_finish = xzalloc(rcv->body_len); ret = xread(sph_comm_fd, join_node_finish, rcv->body_len); if (ret != rcv->body_len) { sd_err("xread() failed: %m"); exit(1); } memcpy(nodes, join_node_finish->nodes, join_node_finish->nr_nodes * sizeof(struct sd_node)); nr_nodes = join_node_finish->nr_nodes; sd_info("new node: %s", node_to_str(&join_node_finish->new_node)); /* FIXME: member change events must be ordered with nonblocked events */ sd_accept_handler(&join_node_finish->new_node, nodes, nr_nodes, join_node_finish->opaque); free(join_node_finish); } static void msg_notify_forward(struct sph_msg *rcv) { int ret; struct sph_msg_notify_forward *notify_forward; notify_forward = xzalloc(rcv->body_len); ret = xread(sph_comm_fd, notify_forward, rcv->body_len); if (ret != rcv->body_len) { sd_err("xread() failed: %m"); exit(1); } if (notify_forward->unblock) remove_one_block_event(); push_sph_event(true, ¬ify_forward->from_node, notify_forward->notify_msg, rcv->body_len - sizeof(*notify_forward)); free(notify_forward); } static void msg_block_forward(struct sph_msg *rcv) { int ret; struct sd_node sender; ret = xread(sph_comm_fd, &sender, sizeof(sender)); if (ret != sizeof(sender)) { sd_err("xread() failed: %m"); exit(1); } push_sph_event(false, &sender, NULL, 0); } static void do_leave_sheep(void) { int ret; struct sd_node sender; ret = xread(sph_comm_fd, &sender, sizeof(sender)); if (ret != sizeof(sender)) { sd_err("xread() failed: %m"); exit(1); } sd_info("removing node: %s", node_to_str(&sender)); if (xlremove(&sender, nodes, &nr_nodes, node_cmp)) goto removed; sd_info("leave message from unknown node: %s", node_to_str(&sender)); return; removed: sd_debug("calling sd_leave_handler(), sender: %s", node_to_str(&sender)); /* FIXME: member change events must be ordered with nonblocked events */ sd_leave_handler(&sender, nodes, nr_nodes); } static void msg_remove(struct sph_msg *rcv) { sd_info("sudden leaving of sheep is caused"); do_leave_sheep(); } static void msg_leave_forward(struct sph_msg *rcv) { sd_info("intuitive leaving of sheep is caused"); do_leave_sheep(); } static void (*msg_handlers[])(struct sph_msg *) = { [SPH_SRV_MSG_NEW_NODE] = msg_new_node, [SPH_SRV_MSG_NEW_NODE_FINISH] = msg_new_node_finish, [SPH_SRV_MSG_NOTIFY_FORWARD] = msg_notify_forward, [SPH_SRV_MSG_BLOCK_FORWARD] = msg_block_forward, [SPH_SRV_MSG_REMOVE] = msg_remove, [SPH_SRV_MSG_LEAVE_FORWARD] = msg_leave_forward, }; static void interpret_msg(struct sph_msg *rcv) { if (!(0 <= rcv->type && rcv->type < ARRAY_SIZE(msg_handlers))) { sd_err("invalid message from shepherd: %s", sph_srv_msg_to_str(rcv->type)); exit(1); } msg_handlers[rcv->type](rcv); } static void read_msg_from_shepherd(void) { struct sph_msg rcv; switch (state) { case STATE_PRE_JOIN: interpret_msg_pre_join(); break; case STATE_JOINED: read_msg(&rcv); interpret_msg(&rcv); break; default: panic("invalid state of shepherd cluster driver: %d", state); break; }; } static void shepherd_comm_handler(int fd, int events, void *data) { assert(fd == sph_comm_fd); assert(data == NULL); if (events & EPOLLIN) read_msg_from_shepherd(); else if (events & EPOLLHUP || events & EPOLLERR) 
{ sd_err("connection to shepherd caused an error: %m"); exit(1); } } static int shepherd_init(const char *option) { int ret, port; char *copied, *s_addr, *s_port, *saveptr; if (!option) { sd_err("shepherd cluster driver requires at least IP" " address of shepherd as an option"); exit(1); } copied = strdup(option); if (!copied) { sd_err("strdup() failed: %m"); exit(1); } s_addr = strtok_r(copied, ":", &saveptr); if (!s_addr) { sd_err("strdup() failed: %m"); exit(1); } s_port = strtok_r(NULL, ":", &saveptr); if (s_port) { char *p; port = strtol(s_port, &p, 10); if (*p != '\0') { sd_err("invalid option for host and port: %s", option); exit(1); } } else port = SHEPHERD_PORT; sph_comm_fd = connect_to(s_addr, port); if (sph_comm_fd == -1) { sd_err("cannot connect to shepherd," " is shepherd running? errno: %m"); return -1; } sph_event_fd = eventfd(0, EFD_NONBLOCK); ret = register_event(sph_event_fd, sph_event_handler, NULL); if (ret) { sd_err("register_event() failed: %m"); exit(1); } free(copied); return 0; } static int shepherd_join(const struct sd_node *myself, void *opaque, size_t opaque_len) { int ret; static bool registered; /* keep opaque for retrying */ kept_opaque = xzalloc(opaque_len); memcpy(kept_opaque, opaque, opaque_len); kept_opaque_len = opaque_len; this_node = *myself; sd_debug("shepherd_join() called, myself is %s", node_to_str(myself)); ret = do_shepherd_join(); if (!registered) { register_event(sph_comm_fd, shepherd_comm_handler, NULL); registered = true; } return ret; } static int shepherd_leave(void) { int ret; struct sph_msg msg; msg.type = SPH_CLI_MSG_LEAVE; msg.body_len = 0; ret = xwrite(sph_comm_fd, &msg, sizeof(msg)); if (ret != sizeof(msg)) { sd_info("xwrite() failed: %m"); exit(1); } sd_debug("shepherd_leave() is completed"); return 0; } static int do_shepherd_notify(bool unblock, void *msg, size_t msg_len) { int ret; struct sph_msg snd; struct sph_msg_notify *notify; snd.type = SPH_CLI_MSG_NOTIFY; snd.body_len = msg_len + sizeof(*notify); notify = xzalloc(snd.body_len); notify->unblock = unblock; memcpy(notify->notify_msg, msg, msg_len); ret = writev2(sph_comm_fd, &snd, notify, snd.body_len); if (sizeof(snd) + snd.body_len != ret) { sd_err("writev() failed: %m"); exit(1); } free(notify); sd_info("do_shepherd_notify() is completed"); return 0; } static int shepherd_notify(void *msg, size_t msg_len) { return do_shepherd_notify(false, msg, msg_len) == 0 ? SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR; } static int shepherd_block(void) { int ret; struct sph_msg msg; msg.type = SPH_CLI_MSG_BLOCK; msg.body_len = 0; ret = xwrite(sph_comm_fd, &msg, sizeof(msg)); if (ret != sizeof(msg)) { sd_err("xwrite() failed: %m"); exit(1); } return SD_RES_SUCCESS; } static int shepherd_unblock(void *msg, size_t msg_len) { return do_shepherd_notify(true, msg, msg_len) == 0 ? SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR; } /* FIXME: shepherd server also has to udpate node information */ static int shepherd_update_node(struct sd_node *node) { return SD_RES_NO_SUPPORT; } static struct cluster_driver cdrv_shepherd = { .name = "shepherd", .init = shepherd_init, .join = shepherd_join, .leave = shepherd_leave, .notify = shepherd_notify, .block = shepherd_block, .unblock = shepherd_unblock, .update_node = shepherd_update_node, .get_local_addr = get_local_addr, }; cdrv_register(cdrv_shepherd); sheepdog-0.8.3/sheep/cluster/zookeeper.c000066400000000000000000001060421237656255000202720ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. 
* * Copyright (C) 2012 Taobao Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include "cluster.h" #include "config.h" #include "event.h" #include "work.h" #include "util.h" #include "rbtree.h" #define SESSION_TIMEOUT 30000 /* millisecond */ #define BASE_ZNODE "/sheepdog" #define QUEUE_ZNODE BASE_ZNODE "/queue" #define MEMBER_ZNODE BASE_ZNODE "/member" #define MASTER_ZNODE BASE_ZNODE "/master" #define LOCK_ZNODE BASE_ZNODE "/lock" static int zk_timeout = SESSION_TIMEOUT; static int my_master_seq; /* structure for distributed lock */ struct cluster_lock { struct hlist_node hnode; /* id is passed by users to represent a lock handle */ uint64_t id; /* referenced by different threads in one sheepdog daemon */ uint64_t ref; /* wait for the release of id by other lock owner */ sem_t wait_wakeup; /* lock for different threads of the same node on the same id */ struct sd_mutex id_lock; char lock_path[MAX_NODE_STR_LEN]; }; #define WAIT_TIME 1 /* second */ #define HASH_BUCKET_NR 1021 static struct hlist_head *cluster_locks_table; static struct sd_mutex table_locks[HASH_BUCKET_NR]; /* * Wait a while when create, delete or get_children fail on * zookeeper lock so it will not print too much loop log */ static void zk_wait(void) { sleep(WAIT_TIME); } /* iterate child znodes */ #define FOR_EACH_ZNODE(parent, path, strs) \ for ((strs)->data += (strs)->count; \ (strs)->count-- ? \ snprintf(path, sizeof(path), "%s/%s", parent, \ *--(strs)->data) : (free((strs)->data), 0); \ free(*(strs)->data)) enum zk_event_type { EVENT_JOIN = 1, EVENT_ACCEPT, EVENT_LEAVE, EVENT_BLOCK, EVENT_UNBLOCK, EVENT_NOTIFY, EVENT_UPDATE_NODE, }; struct zk_node { struct list_node list; struct rb_node rb; struct sd_node node; bool callbacked; bool gone; }; #define ZK_MAX_BUF_SIZE (1*1024*1024) /* 1M */ struct zk_event { uint64_t id; enum zk_event_type type; struct zk_node sender; size_t msg_len; size_t nr_nodes; size_t buf_len; uint8_t buf[ZK_MAX_BUF_SIZE]; }; static struct rb_root sd_node_root = RB_ROOT; static size_t nr_sd_nodes; static struct rb_root zk_node_root = RB_ROOT; static struct sd_rw_lock zk_tree_lock = SD_RW_LOCK_INITIALIZER; static struct sd_rw_lock zk_compete_master_lock = SD_RW_LOCK_INITIALIZER; static LIST_HEAD(zk_block_list); static uatomic_bool is_master; static uatomic_bool stop; static bool joined; static bool first_push = true; static void zk_compete_master(void); static int zk_node_cmp(const struct zk_node *a, const struct zk_node *b) { return node_id_cmp(&a->node.nid, &b->node.nid); } static struct zk_node *zk_tree_insert(struct zk_node *new) { return rb_insert(&zk_node_root, new, rb, zk_node_cmp); } static struct zk_node *zk_tree_search_nolock(const struct node_id *nid) { struct zk_node key = { .node.nid = *nid }; return rb_search(&zk_node_root, &key, rb, zk_node_cmp); } static inline struct zk_node *zk_tree_search(const struct node_id *nid) { struct zk_node *n; sd_read_lock(&zk_tree_lock); n = zk_tree_search_nolock(nid); sd_rw_unlock(&zk_tree_lock); return n; } /* zookeeper API wrapper */ static zhandle_t *zhandle; static struct zk_node this_node; #define CHECK_ZK_RC(rc, path) \ switch (rc) { \ case ZNONODE: \ case ZNODEEXISTS: \ case ZNOTEMPTY: \ break; \ case ZINVALIDSTATE: \ case 
ZSESSIONEXPIRED: \ case ZOPERATIONTIMEOUT: \ case ZCONNECTIONLOSS: \ sd_err("failed, path:%s, %s", path, zerror(rc)); \ case ZOK: \ break; \ case ZNOCHILDRENFOREPHEMERALS: \ /* \ * Because code has guaranteed that parent nodes are \ * always non-ephemeral, this could happen only when \ * sheep joins a cluster in an incompatible version. \ */ \ sd_err("incompatible version of sheep %s", \ PACKAGE_VERSION); \ default: \ panic("failed, path:%s, %s", path, zerror(rc)); \ } #define RETURN_IF_ERROR(stmt, fmt, ...) \ do { \ int __rc = stmt; \ if (__rc != ZOK) { \ sd_err("failed, " fmt ", %s", \ ##__VA_ARGS__, zerror(__rc)); \ return __rc; \ } \ } while (0) #define RETURN_VOID_IF_ERROR(stmt, fmt, ...) \ do { \ int __rc = stmt; \ if (__rc != ZOK) { \ sd_err("failed, " fmt ", %s", \ ##__VA_ARGS__, zerror(__rc)); \ return; \ } \ } while (0) static inline ZOOAPI int zk_delete_node(const char *path, int version) { int rc; do { rc = zoo_delete(zhandle, path, version); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } static inline ZOOAPI int zk_init_node(const char *path) { int rc; do { rc = zoo_create(zhandle, path, "", 0, &ZOO_OPEN_ACL_UNSAFE, 0, NULL, 0); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); if (rc == ZNODEEXISTS) rc = ZOK; return rc; } static inline ZOOAPI int zk_create_node(const char *path, const char *value, int valuelen, const struct ACL_vector *acl, int flags, char *path_buffer, int path_buffer_len) { int rc; do { rc = zoo_create(zhandle, path, value, valuelen, acl, flags, path_buffer, path_buffer_len); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } /* * Create a znode after adding a unique monotonically increasing sequence number * to the path name. * * Note that the caller has to retry this function when this returns * ZOPERATIONTIMEOUT or ZCONNECTIONLOSS and the znode is not created. 
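 * A caller typically wraps it in a retry loop along these lines (a
 * simplified sketch; zk_queue_push() below implements the full version,
 * including a duplicate check through zk_find_seq_node() so that a
 * retried create does not enqueue the same event twice):
 *
 *	do {
 *		rc = zk_create_seq_node(path, val, len, buf, sizeof(buf),
 *					false);
 *	} while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);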
*/ static inline ZOOAPI int zk_create_seq_node(const char *path, const char *value, int valuelen, char *path_buffer, int path_buffer_len, bool ephemeral) { int rc; int flags = ZOO_SEQUENCE; if (ephemeral) flags = flags | ZOO_EPHEMERAL; rc = zoo_create(zhandle, path, value, valuelen, &ZOO_OPEN_ACL_UNSAFE, flags, path_buffer, path_buffer_len); CHECK_ZK_RC(rc, path); return rc; } static inline ZOOAPI int zk_get_data(const char *path, void *buffer, int *buffer_len) { int rc; do { rc = zoo_get(zhandle, path, 1, (char *)buffer, buffer_len, NULL); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } static inline ZOOAPI int zk_set_data(const char *path, const char *buffer, int buflen, int version) { int rc; do { rc = zoo_set(zhandle, path, buffer, buflen, version); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } static inline ZOOAPI int zk_node_exists(const char *path) { int rc; do { rc = zoo_exists(zhandle, path, 1, NULL); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } static inline ZOOAPI int zk_get_children(const char *path, struct String_vector *strings) { int rc; do { rc = zoo_get_children(zhandle, path, 1, strings); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } /* * All the operations of the lock table is protected by * cluster_lock->id_lock so we don't need to add lock here */ static int lock_table_lookup_wakeup(uint64_t lock_id) { uint64_t hval = sd_hash_64(lock_id) % HASH_BUCKET_NR; int res = -1; struct hlist_node *iter; struct cluster_lock *lock; sd_mutex_lock(table_locks + hval); hlist_for_each_entry(lock, iter, cluster_locks_table + hval, hnode) { if (lock->id == lock_id) { sem_post(&lock->wait_wakeup); res = 0; break; } } sd_mutex_unlock(table_locks + hval); return res; } static struct cluster_lock *lock_table_lookup_acquire(uint64_t lock_id) { uint64_t hval = sd_hash_64(lock_id) % HASH_BUCKET_NR; int rc; struct hlist_node *iter; struct cluster_lock *lock, *ret_lock = NULL; char path[MAX_NODE_STR_LEN]; sd_mutex_lock(table_locks + hval); hlist_for_each_entry(lock, iter, cluster_locks_table + hval, hnode) { if (lock->id == lock_id) { ret_lock = lock; ret_lock->ref++; break; } } if (!ret_lock) { /* create lock and add it to hash table */ ret_lock = xzalloc(sizeof(*ret_lock)); ret_lock->id = lock_id; ret_lock->ref = 1; snprintf(path, MAX_NODE_STR_LEN, LOCK_ZNODE "/%"PRIu64, ret_lock->id); rc = zk_init_node(path); if (rc) panic("Failed to init node %s", path); sem_init(&ret_lock->wait_wakeup, 0, 1); sd_init_mutex(&ret_lock->id_lock); hlist_add_head(&(ret_lock->hnode), cluster_locks_table + hval); } sd_mutex_unlock(table_locks + hval); /* * if many threads use locks with same id, we should use * ->id_lock to avoid the only zookeeper handler to * create many seq-ephemeral files. 
sd_mutex_lock(&ret_lock->id_lock); return ret_lock; } static void lock_table_lookup_release(uint64_t lock_id) { uint64_t hval = sd_hash_64(lock_id) % HASH_BUCKET_NR; int rc; struct hlist_node *iter; struct cluster_lock *lock; char path[MAX_NODE_STR_LEN]; sd_mutex_lock(table_locks + hval); hlist_for_each_entry(lock, iter, cluster_locks_table + hval, hnode) { if (lock->id != lock_id) continue; while (true) { rc = zk_delete_node(lock->lock_path, -1); if (rc == ZOK || rc == ZNONODE) { sd_debug("delete path: %s ok", lock->lock_path); break; } sd_err("Failed to delete path: %s %s", lock->lock_path, zerror(rc)); zk_wait(); } lock->lock_path[0] = '\0'; sd_mutex_unlock(&lock->id_lock); lock->ref--; if (!lock->ref) { hlist_del(iter); /* free all resources used by this lock */ sd_destroy_mutex(&lock->id_lock); sem_destroy(&lock->wait_wakeup); snprintf(path, MAX_NODE_STR_LEN, LOCK_ZNODE "/%"PRIu64, lock->id); /* * If deletion of the directory 'lock_id' fails, we only get an * empty directory in zookeeper. That's harmless, * so we don't need to retry it. */ rc = zk_delete_node(path, -1); if (rc != ZOK) sd_err("Failed to delete path: %s %s", path, zerror(rc)); free(lock); } break; } sd_mutex_unlock(table_locks + hval); } /* * If this node leaves the cluster, we need to delete the znodes which were * created for its distributed locks. Otherwise, the locks will never be * released. */ static void lock_table_remove_znodes(void) { uint64_t hval; int rc; struct hlist_node *iter; struct cluster_lock *lock; for (hval = 0; hval < HASH_BUCKET_NR; hval++) { sd_mutex_lock(table_locks + hval); hlist_for_each_entry(lock, iter, cluster_locks_table + hval, hnode) { while (true) { rc = zk_delete_node(lock->lock_path, -1); if (rc == ZOK || rc == ZNONODE) { sd_debug("delete path: %s ok", lock->lock_path); break; } sd_err("Failed to delete path: %s %s", lock->lock_path, zerror(rc)); zk_wait(); } } sd_mutex_unlock(table_locks + hval); } } /* The ZooKeeper-based queue gives us totally ordered events */ static int efd; static int32_t queue_pos; static int zk_queue_peek(bool *peek) { int rc; char path[MAX_NODE_STR_LEN]; snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos); rc = zk_node_exists(path); switch (rc) { case ZOK: *peek = true; return ZOK; case ZNONODE: *peek = false; return ZOK; default: sd_err("failed, %s", zerror(rc)); return rc; } } /* Set *found to true if there is a node with 'id' in the queue.
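 * The return value itself is a ZooKeeper status code: ZOK together with
 * *found == false just means that no znode with this id exists yet.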
*/ static int zk_find_seq_node(uint64_t id, char *seq_path, int seq_path_len, bool *found) { int rc, len; for (int seq = queue_pos; ; seq++) { struct zk_event ev; snprintf(seq_path, seq_path_len, QUEUE_ZNODE"/%010"PRId32, seq); len = offsetof(typeof(ev), id) + sizeof(ev.id); rc = zk_get_data(seq_path, &ev, &len); switch (rc) { case ZOK: if (ev.id == id) { sd_debug("id %" PRIx64 " is found in %s", id, seq_path); *found = true; return ZOK; } break; case ZNONODE: sd_debug("id %"PRIx64" is not found", id); *found = false; return ZOK; default: sd_err("failed, %s", zerror(rc)); return rc; } } } static int zk_queue_push(struct zk_event *ev) { int rc, len; char path[MAX_NODE_STR_LEN], buf[MAX_NODE_STR_LEN]; bool found; len = offsetof(typeof(*ev), buf) + ev->buf_len; snprintf(path, sizeof(path), "%s/", QUEUE_ZNODE); again: rc = zk_create_seq_node(path, (char *)ev, len, buf, sizeof(buf), false); switch (rc) { case ZOK: /* Success */ break; case ZOPERATIONTIMEOUT: case ZCONNECTIONLOSS: if (zk_find_seq_node(ev->id, buf, sizeof(buf), &found) == ZOK) { if (found) break; else /* retry if seq_node was not created */ goto again; } /* fall through */ default: sd_err("failed, path:%s, %s", path, zerror(rc)); return rc; } if (first_push) { int32_t seq; sscanf(buf, QUEUE_ZNODE "/%"PRId32, &seq); queue_pos = seq; eventfd_xwrite(efd, 1); first_push = false; } sd_debug("create path:%s, queue_pos:%010" PRId32 ", len:%d", buf, queue_pos, len); return ZOK; } static inline void *zk_event_sd_nodes(struct zk_event *ev) { return (char *)ev->buf + ev->msg_len; } /* Change the join event in place and piggyback the nodes information. */ static int push_join_response(struct zk_event *ev) { char path[MAX_NODE_STR_LEN]; struct sd_node *n, *np = zk_event_sd_nodes(ev); int len; ev->type = EVENT_ACCEPT; ev->nr_nodes = nr_sd_nodes; rb_for_each_entry(n, &sd_node_root, rb) { memcpy(np++, n, sizeof(struct sd_node)); } queue_pos--; len = offsetof(typeof(*ev), buf) + ev->buf_len; snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos); RETURN_IF_ERROR(zk_set_data(path, (char *)ev, len, -1), ""); sd_debug("update path:%s, queue_pos:%010" PRId32 ", len:%d", path, queue_pos, len); return ZOK; } static int zk_queue_pop_advance(struct zk_event *ev) { int len; char path[MAX_NODE_STR_LEN]; len = sizeof(*ev); snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos); RETURN_IF_ERROR(zk_get_data(path, ev, &len), "path %s", path); sd_debug("%s, type:%d, len:%d, pos:%" PRId32, path, ev->type, len, queue_pos); queue_pos++; return ZOK; } static inline void zk_tree_add(struct zk_node *node) { struct zk_node *zk = xzalloc(sizeof(*zk)); *zk = *node; sd_write_lock(&zk_tree_lock); if (zk_tree_insert(zk)) { free(zk); goto out; } /* * Even node list will be built later, we need this because in master * transfer case, we need this information to destroy the tree. 
*/ rb_insert(&sd_node_root, &zk->node, rb, node_cmp); nr_sd_nodes++; out: sd_rw_unlock(&zk_tree_lock); } static inline void zk_tree_del(struct zk_node *node) { sd_write_lock(&zk_tree_lock); rb_erase(&node->rb, &zk_node_root); free(node); sd_rw_unlock(&zk_tree_lock); } static inline void zk_tree_destroy(void) { sd_write_lock(&zk_tree_lock); rb_destroy(&zk_node_root, struct zk_node, rb); sd_rw_unlock(&zk_tree_lock); } static inline void build_node_list(void) { struct zk_node *zk; nr_sd_nodes = 0; INIT_RB_ROOT(&sd_node_root); rb_for_each_entry(zk, &zk_node_root, rb) { rb_insert(&sd_node_root, &zk->node, rb, node_cmp); nr_sd_nodes++; } sd_debug("nr_sd_nodes:%zu", nr_sd_nodes); } static int zk_queue_init(void) { RETURN_IF_ERROR(zk_init_node(BASE_ZNODE), "path %s", BASE_ZNODE); RETURN_IF_ERROR(zk_init_node(MASTER_ZNODE), "path %s", MASTER_ZNODE); RETURN_IF_ERROR(zk_init_node(QUEUE_ZNODE), "path %s", QUEUE_ZNODE); RETURN_IF_ERROR(zk_init_node(MEMBER_ZNODE), "path %s", MEMBER_ZNODE); return ZOK; } /* Calculate a unique 64 bit integer from this_node and the sequence number. */ static uint64_t get_uniq_id(void) { static int seq; struct { uint64_t n; struct zk_node node; } id = { .n = uatomic_add_return(&seq, 1), .node = this_node, }; return sd_hash(&id, sizeof(id)); } static int add_event(enum zk_event_type type, struct zk_node *znode, void *buf, size_t buf_len) { struct zk_event ev; int rc; memset(&ev, 0, sizeof(ev)); ev.id = get_uniq_id(); ev.type = type; ev.sender = *znode; ev.buf_len = buf_len; if (buf) memcpy(ev.buf, buf, buf_len); rc = zk_queue_push(&ev); if (rc == ZOK) return SD_RES_SUCCESS; else { sd_err("failed, type: %d, %s", type, zerror(rc)); return SD_RES_CLUSTER_ERROR; } } static void zk_watcher(zhandle_t *zh, int type, int state, const char *path, void *ctx) { struct zk_node znode; char str[MAX_NODE_STR_LEN], *p; uint64_t lock_id; int ret; if (type == ZOO_SESSION_EVENT && state == ZOO_EXPIRED_SESSION_STATE) { /* * do reconnect in main thread to avoid on-the-fly zookeeper * operations. */ eventfd_xwrite(efd, 1); return; } /* CREATED_EVENT 1, DELETED_EVENT 2, CHANGED_EVENT 3, CHILD_EVENT 4 */ sd_debug("path:%s, type:%d, state:%d", path, type, state); if (type == ZOO_CREATED_EVENT || type == ZOO_CHANGED_EVENT) { ret = sscanf(path, MEMBER_ZNODE "/%s", str); if (ret == 1) zk_node_exists(path); /* kick off the event handler */ eventfd_xwrite(efd, 1); } else if (type == ZOO_DELETED_EVENT) { struct zk_node *n; /* process distributed lock */ ret = sscanf(path, LOCK_ZNODE "/%"PRIu64"/%s", &lock_id, str); if (ret == 2) { ret = lock_table_lookup_wakeup(lock_id); if (ret) sd_debug("release lock %"PRIu64" %s", lock_id, str); return; } ret = sscanf(path, MASTER_ZNODE "/%s", str); if (ret == 1) { zk_compete_master(); return; } ret = sscanf(path, MEMBER_ZNODE "/%s", str); if (ret != 1) return; p = strrchr(path, '/'); p++; str_to_node(p, &znode.node); /* FIXME: remove redundant leave events */ sd_read_lock(&zk_tree_lock); n = zk_tree_search_nolock(&znode.node.nid); if (n) n->gone = true; sd_rw_unlock(&zk_tree_lock); if (n) add_event(EVENT_LEAVE, &znode, NULL, 0); } } /* * We placehold the enough space to piggyback the nodes information on join * response message so that every node can see the same membership view. * * We have to preallocate enough space and set msg_len as * sizeof(struct cluster_info) because of piggyback. 
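 * The resulting layout of ev.buf (derived from add_join_event() and
 * zk_event_sd_nodes() below) is:
 *
 *	| join message (msg_len bytes) | struct sd_node[SD_MAX_NODES] |
 *
 * push_join_response() later rewrites the node array part in place,
 * which is why the space is reserved here at push time.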
*/ static int add_join_event(void *msg, size_t msglen) { struct zk_event ev; size_t len = msglen + sizeof(struct sd_node) * SD_MAX_NODES; if (unlikely((offsetof(struct zk_event, buf) + len) > ZK_MAX_BUF_SIZE)) panic("Zookeeper can't send message more than 1M"); ev.id = get_uniq_id(); ev.type = EVENT_JOIN; ev.sender = this_node; ev.msg_len = msglen; ev.buf_len = len; if (msg) memcpy(ev.buf, msg, msglen); return zk_queue_push(&ev); } static int zk_get_least_seq(const char *parent, char *least_seq_path, int path_len, void *buf, int *buf_len) { char path[MAX_NODE_STR_LEN], *p, *tmp; struct String_vector strs; int rc, least_seq, seq; while (true) { /* * If first loop fail, the least_seq may be a very small number * which had been deleted in zookeeper, the new create file will * be all larger than it and it will cause dead loop. * Therefore we need to set least_seq to INT_MAX in every loop. */ least_seq = INT_MAX; RETURN_IF_ERROR(zk_get_children(parent, &strs), ""); FOR_EACH_ZNODE(parent, path, &strs) { p = strrchr(path, '/'); seq = strtol(++p, &tmp, 10); if (seq < least_seq) least_seq = seq; } snprintf(path, MAX_NODE_STR_LEN, "%s/%010"PRId32, parent, least_seq); rc = zk_get_data(path, buf, buf_len); switch (rc) { case ZOK: strncpy(least_seq_path, path, path_len); return ZOK; case ZNONODE: break; default: sd_err("failed, %s", zerror(rc)); return rc; } } } static int zk_find_master(int *master_seq, char *master_name) { int rc, len = MAX_NODE_STR_LEN; char master_compete_path[MAX_NODE_STR_LEN]; if (*master_seq < 0) { RETURN_IF_ERROR(zk_get_least_seq(MASTER_ZNODE, master_compete_path, MAX_NODE_STR_LEN, master_name, &len), ""); sscanf(master_compete_path, MASTER_ZNODE "/%"PRId32, master_seq); return ZOK; } else { while (true) { snprintf(master_compete_path, len, MASTER_ZNODE "/%010"PRId32, *master_seq); rc = zk_get_data(master_compete_path, master_name, &len); switch (rc) { case ZOK: return ZOK; case ZNONODE: sd_info("detect master leave, " "start to compete master"); (*master_seq)++; break; default: sd_err("failed, %s", zerror(rc)); return rc; } } } } /* * block until last sheep joined * last_sheep returns sequence number of last sheep or -1 if no previous sheep */ static int zk_verify_last_sheep_join(int seq, int *last_sheep) { int rc, len = MAX_NODE_STR_LEN; char path[MAX_NODE_STR_LEN], name[MAX_NODE_STR_LEN]; for (*last_sheep = seq - 1; *last_sheep >= 0; (*last_sheep)--) { snprintf(path, MAX_NODE_STR_LEN, MASTER_ZNODE "/%010"PRId32, *last_sheep); rc = zk_get_data(path, name, &len); switch (rc) { case ZNONODE: continue; case ZOK: break; default: sd_err("failed, %s", zerror(rc)); return rc; } if (!strcmp(name, node_to_str(&this_node.node))) continue; snprintf(path, MAX_NODE_STR_LEN, MEMBER_ZNODE "/%s", name); rc = zk_node_exists(path); switch (rc) { case ZOK: return ZOK; case ZNONODE: (*last_sheep)++; break; default: sd_err("failed, %s", zerror(rc)); return rc; } } return ZOK; } /* * Create sequential node under MASTER_ZNODE. * Sheep with least sequential number win the competition. */ static void zk_compete_master(void) { int rc, last_joined_sheep; char master_name[MAX_NODE_STR_LEN]; char my_compete_path[MAX_NODE_STR_LEN]; static int master_seq = -1, my_seq; /* * This is to protect master_seq and my_seq because this function will * be called by both main thread and zookeeper's event thread. 
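 * (zk_watcher() invokes it from ZooKeeper's event thread when a
 * MASTER_ZNODE child is deleted, and zk_join() invokes it from the main
 * thread when this sheep starts to join.)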
*/ sd_write_lock(&zk_compete_master_lock); if (uatomic_is_true(&is_master) || uatomic_is_true(&stop)) goto out_unlock; if (!joined) { sd_debug("start to compete master for the first time"); do { if (uatomic_is_true(&stop)) goto out_unlock; /* duplicate sequential node has no side-effect */ rc = zk_create_seq_node(MASTER_ZNODE "/", node_to_str(&this_node.node), MAX_NODE_STR_LEN, my_compete_path, MAX_NODE_STR_LEN, true); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, MASTER_ZNODE "/"); if (rc != ZOK) goto out_unlock; sd_debug("my compete path: %s", my_compete_path); sscanf(my_compete_path, MASTER_ZNODE "/%"PRId32, &my_seq); } if (zk_find_master(&master_seq, master_name) != ZOK) goto out_unlock; if (!strcmp(master_name, node_to_str(&this_node.node))) goto success; else if (joined) { sd_debug("lost"); goto out_unlock; } else { if (zk_verify_last_sheep_join(my_seq, &last_joined_sheep) != ZOK) goto out_unlock; if (last_joined_sheep < 0) { /* all previous sheep has quit, i'm master */ master_seq = my_seq; goto success; } else { sd_debug("lost"); goto out_unlock; } } success: uatomic_set_true(&is_master); my_master_seq = master_seq; sd_debug("success"); out_unlock: sd_rw_unlock(&zk_compete_master_lock); } static int zk_join(const struct sd_node *myself, void *opaque, size_t opaque_len) { int rc; char path[MAX_NODE_STR_LEN]; this_node.node = *myself; snprintf(path, sizeof(path), MEMBER_ZNODE "/%s", node_to_str(myself)); rc = zk_node_exists(path); if (rc == ZOK) { sd_err("Previous zookeeper session exist, shoot myself. Please " "wait for %d seconds to join me again.", DIV_ROUND_UP(zk_timeout, 1000)); exit(1); } zk_compete_master(); RETURN_IF_ERROR(add_join_event(opaque, opaque_len), ""); return ZOK; } static int zk_leave(void) { char path[PATH_MAX]; sd_info("leaving from cluster"); uatomic_set_true(&stop); if (uatomic_is_true(&is_master)) { snprintf(path, sizeof(path), MASTER_ZNODE "/%010"PRId32, my_master_seq); zk_delete_node(path, -1); } snprintf(path, sizeof(path), MEMBER_ZNODE"/%s", node_to_str(&this_node.node)); add_event(EVENT_LEAVE, &this_node, NULL, 0); lock_table_remove_znodes(); zk_delete_node(path, -1); return 0; } static int zk_notify(void *msg, size_t msg_len) { return add_event(EVENT_NOTIFY, &this_node, msg, msg_len); } static int zk_block(void) { return add_event(EVENT_BLOCK, &this_node, NULL, 0); } static int zk_unblock(void *msg, size_t msg_len) { return add_event(EVENT_UNBLOCK, &this_node, msg, msg_len); } static void zk_handle_join(struct zk_event *ev) { sd_debug("sender: %s", node_to_str(&ev->sender.node)); if (!uatomic_is_true(&is_master)) { /* Let's await master acking the join-request */ queue_pos--; return; } sd_join_handler(&ev->sender.node, &sd_node_root, nr_sd_nodes, ev->buf); push_join_response(ev); sd_debug("I'm the master now"); } static void watch_all_nodes(void) { struct String_vector strs; char path[MAX_NODE_STR_LEN]; RETURN_VOID_IF_ERROR(zk_get_children(MEMBER_ZNODE, &strs), ""); FOR_EACH_ZNODE(MEMBER_ZNODE, path, &strs) { RETURN_VOID_IF_ERROR(zk_node_exists(path), ""); } } static void init_node_list(struct zk_event *ev) { uint8_t *p = zk_event_sd_nodes(ev); size_t node_nr = ev->nr_nodes; int i; sd_debug("%zu", node_nr); for (i = 0; i < node_nr; i++) { struct zk_node zk; mempcpy(&zk.node, p, sizeof(struct sd_node)); zk_tree_add(&zk); p += sizeof(struct sd_node); } watch_all_nodes(); } static void zk_handle_accept(struct zk_event *ev) { char path[MAX_NODE_STR_LEN]; int rc; sd_debug("ACCEPT"); if (node_eq(&ev->sender.node, 
&this_node.node)) /* newly joined node */ init_node_list(ev); sd_debug("%s", node_to_str(&ev->sender.node)); snprintf(path, sizeof(path), MEMBER_ZNODE"/%s", node_to_str(&ev->sender.node)); if (node_eq(&ev->sender.node, &this_node.node)) { joined = true; sd_debug("create path:%s", path); rc = zk_create_node(path, (char *)zoo_client_id(zhandle), sizeof(clientid_t), &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, NULL, 0); RETURN_VOID_IF_ERROR(rc, ""); } else zk_node_exists(path); zk_tree_add(&ev->sender); build_node_list(); sd_accept_handler(&ev->sender.node, &sd_node_root, nr_sd_nodes, ev->buf); } static void kick_block_event(void) { struct zk_node *block; if (list_empty(&zk_block_list)) return; block = list_first_entry(&zk_block_list, typeof(*block), list); if (!block->callbacked) block->callbacked = sd_block_handler(&block->node); } static void block_event_list_del(struct zk_node *n) { struct zk_node *ev; list_for_each_entry(ev, &zk_block_list, list) { if (node_eq(&ev->node, &n->node)) { list_del(&ev->list); free(ev); } } } static void zk_handle_leave(struct zk_event *ev) { struct zk_node *n = zk_tree_search(&ev->sender.node.nid); if (!n) { sd_debug("can't find this leave node:%s, ignore it.", node_to_str(&ev->sender.node)); return; } block_event_list_del(n); zk_tree_del(n); build_node_list(); sd_leave_handler(&ev->sender.node, &sd_node_root, nr_sd_nodes); } static void zk_handle_block(struct zk_event *ev) { struct zk_node *block = xzalloc(sizeof(*block)); sd_debug("BLOCK"); block->node = ev->sender.node; list_add_tail(&block->list, &zk_block_list); block = list_first_entry(&zk_block_list, typeof(*block), list); if (!block->callbacked) block->callbacked = sd_block_handler(&block->node); } static void zk_handle_unblock(struct zk_event *ev) { struct zk_node *block; sd_debug("UNBLOCK"); if (list_empty(&zk_block_list)) return; block = list_first_entry(&zk_block_list, typeof(*block), list); sd_notify_handler(&ev->sender.node, ev->buf, ev->buf_len); list_del(&block->list); free(block); } static void zk_handle_notify(struct zk_event *ev) { sd_debug("NOTIFY"); sd_notify_handler(&ev->sender.node, ev->buf, ev->buf_len); } static void zk_handle_update_node(struct zk_event *ev) { struct zk_node *t; struct sd_node *snode = &ev->sender.node; sd_debug("%s", node_to_str(snode)); if (node_eq(snode, &this_node.node)) this_node.node = *snode; sd_read_lock(&zk_tree_lock); t = zk_tree_search_nolock(&snode->nid); assert(t); t->node = *snode; build_node_list(); sd_rw_unlock(&zk_tree_lock); sd_update_node_handler(snode); } static void (*const zk_event_handlers[])(struct zk_event *ev) = { [EVENT_JOIN] = zk_handle_join, [EVENT_ACCEPT] = zk_handle_accept, [EVENT_LEAVE] = zk_handle_leave, [EVENT_BLOCK] = zk_handle_block, [EVENT_UNBLOCK] = zk_handle_unblock, [EVENT_NOTIFY] = zk_handle_notify, [EVENT_UPDATE_NODE] = zk_handle_update_node, }; static const int zk_max_event_handlers = ARRAY_SIZE(zk_event_handlers); /* * This method should be done in main thread and triggered when zk_watcher() * receives a session timeout event. * All other zk operations who receive 'ZINVALIDSTATE' return code should drop * control of main thread as soon as possible. So that this method can be * executed and re-establish a new session with zookeeper server. 
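 * (handle_session_expire() below drops all in-memory membership state,
 * then loops on sd_reconnect_handler() until a fresh session is
 * established.)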
*/ static inline void handle_session_expire(void) { /* clean memory states */ close(efd); zk_tree_destroy(); INIT_RB_ROOT(&zk_node_root); INIT_LIST_HEAD(&zk_block_list); nr_sd_nodes = 0; INIT_RB_ROOT(&sd_node_root); first_push = true; joined = false; while (sd_reconnect_handler()) { sd_err("failed to reconnect. sleep and retry..."); sleep(1); } } static void zk_event_handler(int listen_fd, int events, void *data) { struct zk_event ev; bool peek; sd_debug("%d, %d", events, queue_pos); if (events & EPOLLHUP) { sd_err("zookeeper driver received EPOLLHUP event, exiting."); log_close(); exit(1); } eventfd_xread(efd); if (zoo_state(zhandle) == ZOO_EXPIRED_SESSION_STATE) { sd_err("detected a session timeout. reconnecting..."); handle_session_expire(); sd_info("reconnected"); eventfd_xwrite(efd, 1); return; } RETURN_VOID_IF_ERROR(zk_queue_peek(&peek), ""); if (!peek) goto kick_block_event; RETURN_VOID_IF_ERROR(zk_queue_pop_advance(&ev), ""); if (ev.type < zk_max_event_handlers && zk_event_handlers[ev.type]) zk_event_handlers[ev.type](&ev); else panic("unhandled type %d", ev.type); RETURN_VOID_IF_ERROR(zk_queue_peek(&peek), ""); if (peek) { /* Someone has created the next event, go kick the event handler. */ eventfd_xwrite(efd, 1); return; } kick_block_event: /* * Kick a block event only if there is no nonblock event. We prefer to * handle nonblock events because: * * 1. Sheep assumes that unblock() and notify() are a transaction, so we * can only kick the next block event after sd_notify_handler() is called * 2. We should process leave/join events as soon as possible. */ kick_block_event(); } /* * This operation will create a seq-ephemeral znode in the lock directory * of zookeeper (using the lock-id as the dir name). The smallest file path in * this directory will be the owner of the lock; the other threads will * wait on a sem_t (cluster_lock->wait_wakeup) */ static void zk_lock(uint64_t lock_id) { int rc, len = MAX_NODE_STR_LEN; char *my_path; char parent[MAX_NODE_STR_LEN]; char parent_node[MAX_NODE_STR_LEN]; char lowest_seq_path[MAX_NODE_STR_LEN]; char owner_name[MAX_NODE_STR_LEN]; struct cluster_lock *cluster_lock; cluster_lock = lock_table_lookup_acquire(lock_id); my_path = cluster_lock->lock_path; snprintf(parent, MAX_NODE_STR_LEN, LOCK_ZNODE "/%"PRIu64"/", cluster_lock->id); /* * We need a path without a trailing '/' to create the lock_id node via * zookeeper's API, so we use 'parent_node'. */ snprintf(parent_node, MAX_NODE_STR_LEN, LOCK_ZNODE "/%"PRIu64, cluster_lock->id); create_seq_node: /* competing for lock ownership works just like zk_compete_master() */ while (true) { rc = zk_create_seq_node(parent, node_to_str(&this_node.node), MAX_NODE_STR_LEN, my_path, MAX_NODE_STR_LEN, true); if (rc == ZOK) break; if (rc == ZNONODE) { zk_init_node(parent_node); /* * We don't need to check the return code of * zk_init_node() because the routine must stay in the loop * if it doesn't take the lock, no matter what kind of * error happened. */ continue; } sd_err("failed to create path:%s, %s", my_path, zerror(rc)); zk_wait(); } sd_debug("create path %s success", my_path); /* create node ok now */ while (true) { rc = zk_get_least_seq(parent_node, lowest_seq_path, MAX_NODE_STR_LEN, owner_name, &len); /* may be expired */ if (rc == ZNONODE) { sd_debug("Recreate seq node"); goto create_seq_node; } /* I got the lock */ if (!strncmp(lowest_seq_path, my_path, strlen(my_path))) { sd_debug("I am master now.
%s", lowest_seq_path); return; } /* I failed to get the lock */ rc = zk_node_exists(lowest_seq_path); if (rc == ZOK) { sd_debug("call zoo_exists success %s", lowest_seq_path); /* Use wait_timeout to avoid missing wakeup signal */ sem_wait(&cluster_lock->wait_wakeup); } else { sd_debug("failed to call zoo_exists %s", zerror(rc)); if (rc != ZNONODE) zk_wait(); } } } static void zk_unlock(uint64_t lock_id) { lock_table_lookup_release(lock_id); sd_debug("unlock %"PRIu64, lock_id); } static int zk_init(const char *option) { char *hosts, *to, *p; int ret, interval, retry = 0, max_retry; if (!option) { sd_err("You must specify zookeeper servers."); return -1; } hosts = strtok((char *)option, "="); if ((to = strtok(NULL, "="))) { if (sscanf(to, "%u", &zk_timeout) != 1) { sd_err("Invalid paramter for timeout"); return -1; } p = strstr(hosts, "timeout"); *--p = '\0'; } sd_debug("version %d.%d.%d, address %s, timeout %d", ZOO_MAJOR_VERSION, ZOO_MINOR_VERSION, ZOO_PATCH_VERSION, hosts, zk_timeout); zhandle = zookeeper_init(hosts, zk_watcher, zk_timeout, NULL, NULL, 0); if (!zhandle) { sd_err("failed to initialize zk server %s", option); return -1; } /* the simplest way to wait and check zk connection */ interval = 100; max_retry = zk_timeout / interval; while (zoo_state(zhandle) != ZOO_CONNECTED_STATE) { usleep(interval * 1000); if (++retry >= max_retry) { sd_err("failed to connect to zk server %s " "after %d retries", option, retry); return -1; } } uatomic_set_false(&stop); uatomic_set_false(&is_master); if (zk_queue_init() != ZOK) return -1; efd = eventfd(0, EFD_NONBLOCK); if (efd < 0) { sd_err("failed to create an event fd: %m"); return -1; } ret = register_event(efd, zk_event_handler, NULL); if (ret) { sd_err("failed to register zookeeper event handler (%d)", ret); return -1; } /* init distributed lock structures */ cluster_locks_table = xzalloc(sizeof(struct list_head) * HASH_BUCKET_NR); for (uint64_t i = 0; i < HASH_BUCKET_NR; i++) { INIT_HLIST_HEAD(cluster_locks_table + i); sd_init_mutex(table_locks + i); } ret = zk_init_node(LOCK_ZNODE); if (ret != ZOK) { sd_err("Failed to create %s %s", LOCK_ZNODE, zerror(ret)); free(cluster_locks_table); return -1; } return 0; } static int zk_update_node(struct sd_node *node) { struct zk_node znode = { .node = *node, }; return add_event(EVENT_UPDATE_NODE, &znode, NULL, 0); } static struct cluster_driver cdrv_zookeeper = { .name = "zookeeper", .init = zk_init, .join = zk_join, .leave = zk_leave, .notify = zk_notify, .block = zk_block, .unblock = zk_unblock, .lock = zk_lock, .unlock = zk_unlock, .update_node = zk_update_node, .get_local_addr = get_local_addr, }; cdrv_register(cdrv_zookeeper); sheepdog-0.8.3/sheep/config.c000066400000000000000000000075721237656255000160630ustar00rootroot00000000000000/* * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "sheep_priv.h" #define SD_FORMAT_VERSION 0x0004 #define SD_CONFIG_SIZE 40 static struct sheepdog_config { uint64_t ctime; uint16_t flags; uint8_t copies; uint8_t store[STORE_LEN]; uint8_t shutdown; uint8_t copy_policy; uint8_t __pad; uint16_t version; uint64_t space; } config; char *config_path; #define CONFIG_PATH "/config" static int write_config(void) { int ret; ret = atomic_create_and_write(config_path, (char *)&config, sizeof(config), true); if (ret < 0) { sd_err("atomic_create_and_write() failed"); return SD_RES_EIO; } return SD_RES_SUCCESS; } static void check_tmp_config(void) { int ret; char tmp_config_path[PATH_MAX]; snprintf(tmp_config_path, PATH_MAX, "%s.tmp", config_path); ret = unlink(tmp_config_path); if (!ret || ret != ENOENT) return; sd_info("removed temporal config file"); } static int get_cluster_config(struct cluster_info *cinfo) { cinfo->ctime = config.ctime; cinfo->nr_copies = config.copies; cinfo->flags = config.flags; cinfo->copy_policy = config.copy_policy; memcpy(cinfo->store, config.store, sizeof(config.store)); return SD_RES_SUCCESS; } int init_config_file(void) { int fd, ret; check_tmp_config(); fd = open(config_path, O_RDONLY); if (fd < 0) { if (errno != ENOENT) { sd_err("failed to read config file, %m"); return -1; } goto create; } ret = xread(fd, &config, sizeof(config)); if (ret == 0) { close(fd); goto create; } if (ret < 0) { sd_err("failed to read config file, %m"); goto out; } if (config.version != SD_FORMAT_VERSION) { sd_err("This sheep version is not compatible with" " the existing data layout, %d", config.version); if (sys->upgrade) { /* upgrade sheep store */ ret = sd_migrate_store(config.version, SD_FORMAT_VERSION); if (ret == 0) { /* reload config file */ ret = xpread(fd, &config, sizeof(config), 0); if (ret != sizeof(config)) { sd_err("failed to reload config file," " %m"); ret = -1; } else { ret = 0; goto reload; } } goto out; } sd_err("use '-u' option to upgrade sheep store"); ret = -1; goto out; } reload: ret = 0; get_cluster_config(&sys->cinfo); create: config.version = SD_FORMAT_VERSION; if (write_config() != SD_RES_SUCCESS) return -1; out: close(fd); return ret; } void init_config_path(const char *base_path) { int len = strlen(base_path) + strlen(CONFIG_PATH) + 1; config_path = xzalloc(len); snprintf(config_path, len, "%s" CONFIG_PATH, base_path); } int set_cluster_config(const struct cluster_info *cinfo) { config.ctime = cinfo->ctime; config.copies = cinfo->nr_copies; config.copy_policy = cinfo->copy_policy; config.flags = cinfo->flags; memset(config.store, 0, sizeof(config.store)); pstrcpy((char *)config.store, sizeof(config.store), (char *)cinfo->store); return write_config(); } int set_node_space(uint64_t space) { config.space = space; return write_config(); } int get_node_space(uint64_t *space) { *space = config.space; return SD_RES_SUCCESS; } bool is_cluster_formatted(void) { struct cluster_info cinfo; get_cluster_config(&cinfo); return cinfo.ctime != 0; } int set_cluster_shutdown(bool down) { config.shutdown = down; return write_config(); } bool was_cluster_shutdowned(void) { return config.shutdown; } static inline __attribute__((used)) void __sd_config_format_build_bug_ons(void) { /* never called, only for checking BUILD_BUG_ON()s */ BUILD_BUG_ON(sizeof(struct sheepdog_config) != SD_CONFIG_SIZE); } sheepdog-0.8.3/sheep/gateway.c000066400000000000000000000366261237656255000162610ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * Copyright (C) 2012-2013 Taobao Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" static inline void gateway_init_fwd_hdr(struct sd_req *fwd, struct sd_req *hdr) { memcpy(fwd, hdr, sizeof(*fwd)); fwd->opcode = gateway_to_peer_opcode(hdr->opcode); fwd->proto_ver = SD_SHEEP_PROTO_VER; } struct req_iter { uint8_t *buf; uint32_t wlen; uint32_t dlen; uint64_t off; }; static struct req_iter *prepare_replication_requests(struct request *req, int *nr) { int nr_copies = get_req_copy_number(req); void *data = req->data; uint32_t len = req->rq.data_length; uint64_t off = req->rq.obj.offset; struct req_iter *reqs = xzalloc(sizeof(*reqs) * nr_copies); sd_debug("%"PRIx64, req->rq.obj.oid); *nr = nr_copies; for (int i = 0; i < nr_copies; i++) { reqs[i].buf = data; reqs[i].dlen = len; reqs[i].off = off; reqs[i].wlen = len; } return reqs; } /* * Make sure we don't overwrite the existing data for misaligned writes * * If either offset or length of request isn't aligned to * SD_EC_DATA_STRIPE_SIZE, we have to read the unaligned blocks before write. * This kind of write amplification indeed slows down the write operation with * extra read overhead. */ static void *init_erasure_buffer(struct request *req, int buf_len) { char *buf = xvalloc(buf_len); uint32_t len = req->rq.data_length; uint64_t off = req->rq.obj.offset; uint64_t oid = req->rq.obj.oid; int opcode = req->rq.opcode; struct sd_req hdr; uint64_t head = round_down(off, SD_EC_DATA_STRIPE_SIZE); uint64_t tail = round_down(off + len, SD_EC_DATA_STRIPE_SIZE); int ret; if (opcode != SD_OP_WRITE_OBJ) goto out; if (off % SD_EC_DATA_STRIPE_SIZE) { /* Read head */ sd_init_req(&hdr, SD_OP_READ_OBJ); hdr.obj.oid = oid; hdr.data_length = SD_EC_DATA_STRIPE_SIZE; hdr.obj.offset = head; ret = exec_local_req(&hdr, buf); if (ret != SD_RES_SUCCESS) { free(buf); return NULL; } } if ((len + off) % SD_EC_DATA_STRIPE_SIZE && tail - head > 0) { /* Read tail */ sd_init_req(&hdr, SD_OP_READ_OBJ); hdr.obj.oid = oid; hdr.data_length = SD_EC_DATA_STRIPE_SIZE; hdr.obj.offset = tail; ret = exec_local_req(&hdr, buf + tail - head); if (ret != SD_RES_SUCCESS) { free(buf); return NULL; } } out: memcpy(buf + off % SD_EC_DATA_STRIPE_SIZE, req->data, len); return buf; } /* * We spread data strips of req along with its parity strips onto replicas for * write operations. For read we only need to prepare data strip buffers. */ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr) { uint32_t len = req->rq.data_length; uint64_t off = req->rq.obj.offset; int opcode = req->rq.opcode; int start = off / SD_EC_DATA_STRIPE_SIZE; int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j; int nr_stripe = end - start; struct fec *ctx; int strip_size, nr_to_send; struct req_iter *reqs; char *p, *buf = NULL; uint8_t policy = req->rq.obj.copy_policy ?: get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid)); int ed = 0, ep = 0, edp; edp = ec_policy_to_dp(policy, &ed, &ep); ctx = ec_init(ed, edp); *nr = nr_to_send = (opcode == SD_OP_READ_OBJ) ?
ed : edp; strip_size = SD_EC_DATA_STRIPE_SIZE / ed; reqs = xzalloc(sizeof(*reqs) * nr_to_send); sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32, start, end, nr_to_send, off, len); for (i = 0; i < nr_to_send; i++) { int l = strip_size * nr_stripe; reqs[i].buf = xmalloc(l); reqs[i].dlen = l; reqs[i].off = start * strip_size; switch (opcode) { case SD_OP_CREATE_AND_WRITE_OBJ: case SD_OP_WRITE_OBJ: reqs[i].wlen = l; break; default: break; } } if (opcode != SD_OP_WRITE_OBJ && opcode != SD_OP_CREATE_AND_WRITE_OBJ) goto out; /* Read and remove operation */ p = buf = init_erasure_buffer(req, SD_EC_DATA_STRIPE_SIZE * nr_stripe); if (!buf) { sd_err("failed to init erasure buffer %"PRIx64, req->rq.obj.oid); free(reqs); reqs = NULL; goto out; } for (i = 0; i < nr_stripe; i++) { const uint8_t *ds[ed]; uint8_t *ps[ep]; for (j = 0; j < ed; j++) ds[j] = reqs[j].buf + strip_size * i; for (j = 0; j < ep; j++) ps[j] = reqs[ed + j].buf + strip_size * i; for (j = 0; j < ed; j++) memcpy((uint8_t *)ds[j], p + j * strip_size, strip_size); ec_encode(ctx, ds, ps); p += SD_EC_DATA_STRIPE_SIZE; } out: ec_destroy(ctx); free(buf); return reqs; } bool is_erasure_oid(uint64_t oid) { return !is_vdi_obj(oid) && !is_vdi_btree_obj(oid) && get_vdi_copy_policy(oid_to_vid(oid)) > 0; } /* Prepare request iterator and buffer for each replica */ static struct req_iter *prepare_requests(struct request *req, int *nr) { if (is_erasure_oid(req->rq.obj.oid)) return prepare_erasure_requests(req, nr); else return prepare_replication_requests(req, nr); } static void finish_requests(struct request *req, struct req_iter *reqs, int nr_to_send) { uint64_t oid = req->rq.obj.oid; uint32_t len = req->rq.data_length; uint64_t off = req->rq.obj.offset; int opcode = req->rq.opcode; int start = off / SD_EC_DATA_STRIPE_SIZE; int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j; int nr_stripe = end - start; if (!is_erasure_oid(oid)) goto out; sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32, start, end, nr_to_send, off, len); /* We need to assemble the data strips into the req buffer for read */ if (opcode == SD_OP_READ_OBJ) { char *p, *buf = xmalloc(SD_EC_DATA_STRIPE_SIZE * nr_stripe); uint8_t policy = req->rq.obj.copy_policy ?: get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid)); int ed = 0, strip_size; ec_policy_to_dp(policy, &ed, NULL); strip_size = SD_EC_DATA_STRIPE_SIZE / ed; p = buf; for (i = 0; i < nr_stripe; i++) { for (j = 0; j < nr_to_send; j++) { memcpy(p, reqs[j].buf + strip_size * i, strip_size); p += strip_size; } } memcpy(req->data, buf + off % SD_EC_DATA_STRIPE_SIZE, len); req->rp.data_length = req->rq.data_length; free(buf); } for (i = 0; i < nr_to_send; i++) free(reqs[i].buf); out: free(reqs); } /* * Try our best to read one copy and read local first. * * Return success if any read succeed. We don't call gateway_forward_request() * because we only read once. 
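 *
 * A rough sketch of the strategy below (illustrative pseudocode, not the
 * exact code):
 *
 *	for each replica vnode v of oid:
 *		if v is local:
 *			return peer_read_obj(req);    (no network hop)
 *	j = random();                                 (spread remote load)
 *	for i in 0 .. nr_copies - 1:
 *		try replica (i + j) % nr_copies, stop on first success;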
*/ static int gateway_replication_read(struct request *req) { int i, ret = SD_RES_SUCCESS; struct sd_req fwd_hdr; struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr; const struct sd_vnode *v; const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; uint64_t oid = req->rq.obj.oid; int nr_copies, j; nr_copies = get_req_copy_number(req); oid_to_vnodes(oid, &req->vinfo->vroot, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[i]; if (!vnode_is_local(v)) continue; ret = peer_read_obj(req); if (ret == SD_RES_SUCCESS) goto out; sd_err("local read %"PRIx64" failed, %s", oid, sd_strerror(ret)); break; } /* * Read random copy from cluster for better load balance, useful for * reading base VM's COW objects */ j = random(); for (i = 0; i < nr_copies; i++) { int idx = (i + j) % nr_copies; v = obj_vnodes[idx]; if (vnode_is_local(v)) continue; /* * We need to re-init it because rsp and req share the same * structure. */ gateway_init_fwd_hdr(&fwd_hdr, &req->rq); ret = sheep_exec_req(&v->node->nid, &fwd_hdr, req->data); if (ret != SD_RES_SUCCESS) continue; /* Read success */ memcpy(&req->rp, rsp, sizeof(*rsp)); break; } out: return ret; } struct forward_info_entry { struct pollfd pfd; const struct node_id *nid; struct sockfd *sfd; void *buf; }; struct forward_info { struct forward_info_entry ent[SD_MAX_NODES]; int nr_sent; }; static inline void forward_info_update(struct forward_info *fi, int pos) { sd_debug("%d, %d", fi->nr_sent, pos); fi->nr_sent--; memmove(fi->ent + pos, fi->ent + pos + 1, sizeof(struct forward_info_entry) * (fi->nr_sent - pos)); } static inline void finish_one_entry(struct forward_info *fi, int i) { sockfd_cache_put(fi->ent[i].nid, fi->ent[i].sfd); forward_info_update(fi, i); } static inline void finish_one_entry_err(struct forward_info *fi, int i) { sockfd_cache_del(fi->ent[i].nid, fi->ent[i].sfd); forward_info_update(fi, i); } static inline struct forward_info_entry * forward_info_find(struct forward_info *fi, int fd) { for (int i = 0; i < fi->nr_sent; i++) if (fi->ent[i].pfd.fd == fd) return &fi->ent[i]; panic("can't find entry for %d", fd); return NULL; } struct pfd_info { struct pollfd pfds[SD_MAX_NODES]; int nr; }; static inline void pfd_info_init(struct forward_info *fi, struct pfd_info *pi) { int i; for (i = 0; i < fi->nr_sent; i++) pi->pfds[i] = fi->ent[i].pfd; pi->nr = fi->nr_sent; } /* * Wait for all forward requests completion. * * Even if something goes wrong, we have to wait forward requests completion to * avoid interleaved requests. * * Return error code if any one request fails. */ static int wait_forward_request(struct forward_info *fi, struct request *req) { int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i, repeat = MAX_RETRY_COUNT; struct pfd_info pi; struct sd_rsp *rsp = &req->rp; again: pfd_info_init(fi, &pi); pollret = poll(pi.pfds, pi.nr, 1000 * POLL_TIMEOUT); if (pollret < 0) { if (errno == EINTR) goto again; panic("%m"); } else if (pollret == 0) { /* * If IO NIC is down, epoch isn't incremented, so we can't retry * for ever. */ if (sheep_need_retry(req->rq.epoch) && repeat) { repeat--; sd_warn("poll timeout %d, disks of some nodes or " "network is busy. 
Going to poll-wait again", fi->nr_sent); goto again; } nr_sent = fi->nr_sent; /* XXX Blinedly close all the connections */ for (i = 0; i < nr_sent; i++) sockfd_cache_del(fi->ent[i].nid, fi->ent[i].sfd); return SD_RES_NETWORK_ERROR; } nr_sent = fi->nr_sent; for (i = 0; i < nr_sent; i++) if (pi.pfds[i].revents & POLLIN) break; if (i < nr_sent) { int re = pi.pfds[i].revents; sd_debug("%d, revents %x", i, re); if (re & (POLLERR | POLLHUP | POLLNVAL)) { err_ret = SD_RES_NETWORK_ERROR; finish_one_entry_err(fi, i); goto out; } if (do_read(pi.pfds[i].fd, rsp, sizeof(*rsp), sheep_need_retry, req->rq.epoch, MAX_RETRY_COUNT)) { sd_err("remote node might have gone away"); err_ret = SD_RES_NETWORK_ERROR; finish_one_entry_err(fi, i); goto out; } if (rsp->data_length) { struct forward_info_entry *ent; ent = forward_info_find(fi, pi.pfds[i].fd); if (do_read(pi.pfds[i].fd, ent->buf, rsp->data_length, sheep_need_retry, req->rq.epoch, MAX_RETRY_COUNT)) { sd_err("remote node might have gone away"); err_ret = SD_RES_NETWORK_ERROR; finish_one_entry_err(fi, i); goto out; } } ret = rsp->result; if (ret != SD_RES_SUCCESS) { sd_err("fail %"PRIx64", %s", req->rq.obj.oid, sd_strerror(ret)); err_ret = ret; } finish_one_entry(fi, i); } out: if (fi->nr_sent > 0) goto again; return err_ret; } static inline void forward_info_init(struct forward_info *fi, size_t nr_to_send) { int i; for (i = 0; i < nr_to_send; i++) fi->ent[i].pfd.fd = -1; fi->nr_sent = 0; } static inline void forward_info_advance(struct forward_info *fi, const struct node_id *nid, struct sockfd *sfd, void *buf) { fi->ent[fi->nr_sent].nid = nid; fi->ent[fi->nr_sent].pfd.fd = sfd->fd; fi->ent[fi->nr_sent].pfd.events = POLLIN; fi->ent[fi->nr_sent].sfd = sfd; fi->ent[fi->nr_sent].buf = buf; fi->nr_sent++; } static int gateway_forward_request(struct request *req) { int i, err_ret = SD_RES_SUCCESS, ret; unsigned wlen; uint64_t oid = req->rq.obj.oid; struct forward_info fi; struct sd_req hdr; const struct sd_node *target_nodes[SD_MAX_NODES]; int nr_copies = get_req_copy_number(req), nr_reqs, nr_to_send = 0; struct req_iter *reqs = NULL; sd_debug("%"PRIx64, oid); gateway_init_fwd_hdr(&hdr, &req->rq); oid_to_nodes(oid, &req->vinfo->vroot, nr_copies, target_nodes); forward_info_init(&fi, nr_copies); reqs = prepare_requests(req, &nr_to_send); if (!reqs) return SD_RES_NETWORK_ERROR; /* * For replication, we send number of available zones copies. * * For erasure, we need at least number of data strips to send to avoid * overflow of target_nodes. 
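 *
 * A worked example (assuming an illustrative 4:2 erasure coded vdi, i.e.
 * 4 data strips and 2 parity strips): ec_policy_to_dp() yields ed = 4,
 * ep = 2 and edp = 6, so prepare_erasure_requests() builds 6 requests
 * for a write but only 4 (the data strips) for a read, and each strip is
 * SD_EC_DATA_STRIPE_SIZE / 4 bytes long.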
*/ nr_reqs = nr_to_send; if (nr_to_send > nr_copies) { uint8_t policy = req->rq.obj.copy_policy ?: get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid)); int ds; /* Only for erasure code, nr_to_send might > nr_copies */ ec_policy_to_dp(policy, &ds, NULL); if (nr_copies < ds) { sd_err("There isn't enough copies(%d) to send out (%d)", nr_copies, nr_to_send); err_ret = SD_RES_SYSTEM_ERROR; goto out; } nr_to_send = ds; } for (i = 0; i < nr_to_send; i++) { struct sockfd *sfd; const struct node_id *nid; nid = &target_nodes[i]->nid; sfd = sockfd_cache_get(nid); if (!sfd) { err_ret = SD_RES_NETWORK_ERROR; break; } hdr.data_length = reqs[i].dlen; wlen = reqs[i].wlen; hdr.obj.offset = reqs[i].off; hdr.obj.ec_index = i; hdr.obj.copy_policy = req->rq.obj.copy_policy; ret = send_req(sfd->fd, &hdr, reqs[i].buf, wlen, sheep_need_retry, req->rq.epoch, MAX_RETRY_COUNT); if (ret) { sockfd_cache_del_node(nid); err_ret = SD_RES_NETWORK_ERROR; sd_debug("fail %d", ret); break; } forward_info_advance(&fi, nid, sfd, reqs[i].buf); } sd_debug("nr_sent %d, err %x", fi.nr_sent, err_ret); if (fi.nr_sent > 0) { ret = wait_forward_request(&fi, req); if (ret != SD_RES_SUCCESS) err_ret = ret; } out: finish_requests(req, reqs, nr_reqs); return err_ret; } int gateway_read_obj(struct request *req) { uint64_t oid = req->rq.obj.oid; if (!bypass_object_cache(req)) return object_cache_handle_request(req); if (is_erasure_oid(oid)) return gateway_forward_request(req); else return gateway_replication_read(req); } int gateway_write_obj(struct request *req) { uint64_t oid = req->rq.obj.oid; if (oid_is_readonly(oid)) return SD_RES_READONLY; if (!bypass_object_cache(req)) return object_cache_handle_request(req); return gateway_forward_request(req); } static int gateway_handle_cow(struct request *req) { uint64_t oid = req->rq.obj.oid; size_t len = get_objsize(oid); struct sd_req hdr, *req_hdr = &req->rq; char *buf = xvalloc(len); int ret; if (req->rq.data_length != len) { /* Partial write, need read the copy first */ sd_init_req(&hdr, SD_OP_READ_OBJ); hdr.obj.oid = req_hdr->obj.cow_oid; hdr.data_length = len; hdr.obj.offset = 0; ret = exec_local_req(&hdr, buf); if (ret != SD_RES_SUCCESS) goto out; } memcpy(buf + req_hdr->obj.offset, req->data, req_hdr->data_length); sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); hdr.flags = SD_FLAG_CMD_WRITE; hdr.obj.oid = oid; hdr.data_length = len; hdr.obj.offset = 0; ret = exec_local_req(&hdr, buf); out: free(buf); return ret; } int gateway_create_and_write_obj(struct request *req) { uint64_t oid = req->rq.obj.oid; if (oid_is_readonly(oid)) return SD_RES_READONLY; if (req->rq.flags & SD_FLAG_CMD_COW) return gateway_handle_cow(req); if (!bypass_object_cache(req)) return object_cache_handle_request(req); return gateway_forward_request(req); } int gateway_remove_obj(struct request *req) { return gateway_forward_request(req); } sheepdog-0.8.3/sheep/group.c000066400000000000000000000654461237656255000157560ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "sheep_priv.h" struct node { struct sd_node ent; struct list_node list; }; struct get_vdis_work { struct work work; DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); struct sd_node joined; struct rb_root nroot; }; static struct sd_mutex wait_vdis_lock = SD_MUTEX_INITIALIZER; static struct sd_cond wait_vdis_cond = SD_COND_INITIALIZER; static refcnt_t nr_get_vdis_works; static main_thread(struct vnode_info *) current_vnode_info; static main_thread(struct list_head *) pending_block_list; static main_thread(struct list_head *) pending_notify_list; static int get_zones_nr_from(struct rb_root *nroot) { int nr_zones = 0, j; uint32_t zones[SD_MAX_COPIES]; struct sd_node *n; rb_for_each_entry(n, nroot, rb) { /* * Only count zones that actually store data, pure gateways * don't contribute to the redundancy level. */ if (!n->nr_vnodes) continue; for (j = 0; j < nr_zones; j++) { if (n->zone == zones[j]) break; } if (j == nr_zones) { zones[nr_zones] = n->zone; if (++nr_zones == ARRAY_SIZE(zones)) break; } } return nr_zones; } /* * Grab an additional reference to the passed in vnode info. * * The caller must already hold a reference to vnode_info, this function must * only be used to grab an additional reference from code that wants the * vnode information to outlive the request structure. */ struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info) { refcount_inc(&vnode_info->refcnt); return vnode_info; } /* * Get a reference to the currently active vnode information structure, * this must only be called from the main thread. * This can return NULL if cluster is not started yet. */ main_fn struct vnode_info *get_vnode_info(void) { struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info); if (cur_vinfo == NULL) return NULL; return grab_vnode_info(cur_vinfo); } /* Release a reference to the current vnode information. 
*/ void put_vnode_info(struct vnode_info *vnode_info) { if (vnode_info) { if (refcount_dec(&vnode_info->refcnt) == 0) { rb_destroy(&vnode_info->vroot, struct sd_vnode, rb); rb_destroy(&vnode_info->nroot, struct sd_node, rb); free(vnode_info); } } } static void recalculate_vnodes(struct rb_root *nroot) { int nr_non_gateway_nodes = 0; uint64_t avg_size = 0; struct sd_node *n; float factor; rb_for_each_entry(n, nroot, rb) { if (n->space) { avg_size += n->space; nr_non_gateway_nodes++; } } if (!nr_non_gateway_nodes) return; avg_size /= nr_non_gateway_nodes; rb_for_each_entry(n, nroot, rb) { factor = (float)n->space / (float)avg_size; n->nr_vnodes = rintf(SD_DEFAULT_VNODES * factor); sd_debug("node %s has %d vnodes, free space %" PRIu64, node_to_str(n), n->nr_vnodes, n->space); } } struct vnode_info *alloc_vnode_info(const struct rb_root *nroot) { struct vnode_info *vnode_info; struct sd_node *n; vnode_info = xzalloc(sizeof(*vnode_info)); INIT_RB_ROOT(&vnode_info->vroot); INIT_RB_ROOT(&vnode_info->nroot); rb_for_each_entry(n, nroot, rb) { struct sd_node *new = xmalloc(sizeof(*new)); *new = *n; if (unlikely(rb_insert(&vnode_info->nroot, new, rb, node_cmp))) panic("node hash collision"); vnode_info->nr_nodes++; } recalculate_vnodes(&vnode_info->nroot); nodes_to_vnodes(&vnode_info->nroot, &vnode_info->vroot); vnode_info->nr_zones = get_zones_nr_from(&vnode_info->nroot); refcount_set(&vnode_info->refcnt, 1); return vnode_info; } struct vnode_info *get_vnode_info_epoch(uint32_t epoch, struct vnode_info *cur_vinfo) { struct sd_node nodes[SD_MAX_NODES]; struct rb_root nroot = RB_ROOT; int nr_nodes; nr_nodes = epoch_log_read(epoch, nodes, sizeof(nodes)); if (nr_nodes < 0) { nr_nodes = epoch_log_read_remote(epoch, nodes, sizeof(nodes), NULL, cur_vinfo); if (nr_nodes == 0) return NULL; } for (int i = 0; i < nr_nodes; i++) rb_insert(&nroot, &nodes[i], rb, node_cmp); return alloc_vnode_info(&nroot); } int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp, void *data) { int nr_nodes; struct vnode_info *cur_vinfo = get_vnode_info(); if (cur_vinfo) { nr_nodes = cur_vinfo->nr_nodes; nodes_to_buffer(&cur_vinfo->nroot, data); rsp->data_length = nr_nodes * sizeof(struct sd_node); rsp->node.nr_nodes = nr_nodes; put_vnode_info(cur_vinfo); } else { rsp->node.nr_nodes = 0; } return SD_RES_SUCCESS; } /* Indicator if a cluster operation is currently running. */ static bool cluster_op_running; static struct vdi_op_message *prepare_cluster_msg(struct request *req, size_t *sizep) { struct vdi_op_message *msg; size_t size; if (has_process_main(req->op) && req->rq.flags & SD_FLAG_CMD_WRITE) /* notify data that was received from the sender */ size = sizeof(*msg) + req->rq.data_length; else /* notify data that was set in process_work */ size = sizeof(*msg) + req->rp.data_length; assert(size <= SD_MAX_EVENT_BUF_SIZE); msg = xzalloc(size); memcpy(&msg->req, &req->rq, sizeof(struct sd_req)); memcpy(&msg->rsp, &req->rp, sizeof(struct sd_rsp)); if (has_process_main(req->op) && size > sizeof(*msg)) memcpy(msg->data, req->data, size - sizeof(*msg)); *sizep = size; return msg; } static void cluster_op_done(struct work *work) { struct request *req = container_of(work, struct request, work); struct vdi_op_message *msg; size_t size; int ret; if (req->status == REQUEST_DROPPED) goto drop; sd_debug("%s (%p)", op_name(req->op), req); msg = prepare_cluster_msg(req, &size); ret = sys->cdrv->unblock(msg, size); if (ret != SD_RES_SUCCESS) { /* * Failed to unblock, shoot myself to let other sheep * unblock the event. 
* FIXME: handle it gracefully. */ sd_emerg("Failed to unblock, %s, exiting.", sd_strerror(ret)); exit(1); } free(msg); req->status = REQUEST_DONE; return; drop: list_del(&req->pending_list); req->rp.result = SD_RES_CLUSTER_ERROR; put_request(req); cluster_op_running = false; } /* * Perform a blocked cluster operation if we were the node requesting it * and do not have any other operation pending. * * If this method returns false the caller must call the method again for * the same event once it gets notified again. * * Must run in the main thread as it accesses unlocked state like * sys->pending_list. */ main_fn bool sd_block_handler(const struct sd_node *sender) { struct request *req; if (!node_is_local(sender)) return false; if (cluster_op_running) return false; cluster_op_running = true; req = list_first_entry(main_thread_get(pending_block_list), struct request, pending_list); req->work.fn = do_process_work; req->work.done = cluster_op_done; queue_work(sys->block_wqueue, &req->work); req->status = REQUEST_QUEUED; return true; } /* * Execute a cluster operation by letting the cluster driver send it to all * nodes in the cluster. * * Must run in the main thread as it access unlocked state like * sys->pending_list. */ main_fn void queue_cluster_request(struct request *req) { int ret; sd_debug("%s (%p)", op_name(req->op), req); if (has_process_work(req->op)) { ret = sys->cdrv->block(); if (ret != SD_RES_SUCCESS) { sd_err("failed to broadcast block to cluster, %s", sd_strerror(ret)); goto error; } list_add_tail(&req->pending_list, main_thread_get(pending_block_list)); } else { struct vdi_op_message *msg; size_t size; msg = prepare_cluster_msg(req, &size); msg->rsp.result = SD_RES_SUCCESS; ret = sys->cdrv->notify(msg, size); if (ret != SD_RES_SUCCESS) { sd_err("failed to broadcast notify to cluster, %s", sd_strerror(ret)); goto error; } list_add_tail(&req->pending_list, main_thread_get(pending_notify_list)); free(msg); } req->status = REQUEST_INIT; return; error: req->rp.result = ret; put_request(req); } int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len, time_t *timestamp, struct vnode_info *vinfo) { char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)]; const struct sd_node *node; int ret; rb_for_each_entry(node, &vinfo->nroot, rb) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int nodes_len; if (node_is_local(node)) continue; sd_init_req(&hdr, SD_OP_GET_EPOCH); hdr.data_length = len; hdr.obj.tgt_epoch = epoch; hdr.epoch = sys_epoch(); ret = sheep_exec_req(&node->nid, &hdr, buf); if (ret != SD_RES_SUCCESS) continue; nodes_len = rsp->data_length - sizeof(*timestamp); memcpy((void *)nodes, buf, nodes_len); if (timestamp) memcpy(timestamp, buf + nodes_len, sizeof(*timestamp)); return nodes_len / sizeof(struct sd_node); } /* * If no node has targeted epoch log, return 0 here to at least * allow reading older epoch logs. */ return 0; } static bool cluster_ctime_check(const struct cluster_info *cinfo) { if (cinfo->epoch == 0 || sys->cinfo.epoch == 0) return true; if (cinfo->ctime != sys->cinfo.ctime) { sd_err("joining node ctime doesn't match: %" PRIu64 " vs %" PRIu64, cinfo->ctime, sys->cinfo.ctime); return false; } return true; } /* * Check whether enough node members are gathered. * * Sheepdog can start automatically if and only if all the members in the latest * epoch are gathered. 
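 *
 * For example (illustrative): if the latest epoch recorded members
 * {A, B, C} and only A and B are in the in-memory node list so far, a
 * join from C passes the check (the joining node itself counts), while
 * a join from a brand-new node D fails it because C is still missing.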
*/
static bool enough_nodes_gathered(struct cluster_info *cinfo,
				  const struct sd_node *joining,
				  const struct rb_root *nroot,
				  size_t nr_nodes)
{
	for (int i = 0; i < cinfo->nr_nodes; i++) {
		const struct sd_node *key = cinfo->nodes + i, *n;

		n = rb_search(nroot, key, rb, node_cmp);
		if (n == NULL && !node_eq(key, joining)) {
			sd_debug("%s doesn't join yet", node_to_str(key));
			return false;
		}
	}

	sd_debug("all the nodes are gathered, %d, %zd", cinfo->nr_nodes,
		 nr_nodes);

	return true;
}

/*
 * We have to use memcpy because some cluster drivers, like corosync, can't
 * send the whole cluster_info structure.
 */
static void cluster_info_copy(struct cluster_info *dst,
			      const struct cluster_info *src)
{
	int len = offsetof(struct cluster_info, nodes) +
		src->nr_nodes * sizeof(struct sd_node);
	memcpy(dst, src, len);
}

static enum sd_status cluster_wait_check(const struct sd_node *joining,
					 const struct rb_root *nroot,
					 size_t nr_nodes,
					 struct cluster_info *cinfo)
{
	if (!cluster_ctime_check(cinfo)) {
		sd_debug("joining node is invalid");
		return sys->cinfo.status;
	}

	if (cinfo->epoch > sys->cinfo.epoch) {
		sd_debug("joining node has a larger epoch, %" PRIu32 ", %"
			 PRIu32, cinfo->epoch, sys->cinfo.epoch);
		cluster_info_copy(&sys->cinfo, cinfo);
	}

	/*
	 * If we have all members from the last epoch log in the in-memory
	 * node list, we can set the cluster live now.
	 */
	if (sys->cinfo.epoch > 0 &&
	    enough_nodes_gathered(&sys->cinfo, joining, nroot, nr_nodes))
		return SD_STATUS_OK;

	return sys->cinfo.status;
}

static int get_vdis_from(struct sd_node *node)
{
	struct sd_req hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	struct vdi_state *vs = NULL;
	int i, ret = SD_RES_SUCCESS;
	unsigned int rlen;
	int count;

	if (node_is_local(node))
		goto out;

	rlen = SD_DATA_OBJ_SIZE; /* FIXME */
	vs = xzalloc(rlen);
	sd_init_req(&hdr, SD_OP_GET_VDI_COPIES);
	hdr.data_length = rlen;
	hdr.epoch = sys_epoch();
	ret = sheep_exec_req(&node->nid, &hdr, (char *)vs);
	if (ret != SD_RES_SUCCESS)
		goto out;

	count = rsp->data_length / sizeof(*vs);
	for (i = 0; i < count; i++) {
		atomic_set_bit(vs[i].vid, sys->vdi_inuse);
		add_vdi_state(vs[i].vid, vs[i].nr_copies, vs[i].snapshot,
			      vs[i].copy_policy);
	}
out:
	free(vs);
	return ret;
}

static void do_get_vdis(struct work *work)
{
	struct get_vdis_work *w =
		container_of(work, struct get_vdis_work, work);
	struct sd_node *n;
	int ret;

	if (!node_is_local(&w->joined)) {
		sd_debug("try to get vdi bitmap from %s",
			 node_to_str(&w->joined));
		ret = get_vdis_from(&w->joined);
		if (ret != SD_RES_SUCCESS)
			sd_alert("failed to get vdi bitmap from %s",
				 node_to_str(&w->joined));
		return;
	}

	rb_for_each_entry(n, &w->nroot, rb) {
		/* We should not fetch vdi_bitmap and copy list from myself */
		if (node_is_local(n))
			continue;

		sd_debug("try to get vdi bitmap from %s", node_to_str(n));
		ret = get_vdis_from(n);
		if (ret != SD_RES_SUCCESS) {
			/* try to read from another node */
			sd_alert("failed to get vdi bitmap from %s",
				 node_to_str(n));
			continue;
		}

		/*
		 * TODO: If the target node has a valid vdi bitmap (the node
		 * has already called do_get_vdis against all the nodes), we
		 * can exit this loop here.
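		 *
		 * One possible shape of that optimization (a sketch, not
		 * implemented here): extend the SD_OP_GET_VDI_COPIES reply
		 * with a flag meaning "the replying node has itself finished
		 * gathering vdi state", and break out of the loop on the
		 * first reply with that flag set.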
*/ } } static void get_vdis_done(struct work *work) { struct get_vdis_work *w = container_of(work, struct get_vdis_work, work); sd_mutex_lock(&wait_vdis_lock); refcount_dec(&nr_get_vdis_works); sd_cond_broadcast(&wait_vdis_cond); sd_mutex_unlock(&wait_vdis_lock); rb_destroy(&w->nroot, struct sd_node, rb); free(w); } int inc_and_log_epoch(void) { struct vnode_info *cur_vinfo = get_vnode_info(); if (cur_vinfo) { /* update cluster info to the latest state */ sys->cinfo.nr_nodes = cur_vinfo->nr_nodes; nodes_to_buffer(&cur_vinfo->nroot, sys->cinfo.nodes); put_vnode_info(cur_vinfo); } else sys->cinfo.nr_nodes = 0; uatomic_inc(&sys->cinfo.epoch); return update_epoch_log(sys->cinfo.epoch, sys->cinfo.nodes, sys->cinfo.nr_nodes); } static struct vnode_info *alloc_old_vnode_info(void) { struct rb_root old_root = RB_ROOT; struct vnode_info *old; /* * If the previous cluster has failed node, (For example, 3 good nodes * and 1 failed node), the 'nroot' will present 4 good nodes after * shutdown and restart this 4 nodes cluster, this is incorrect. * We should use old nodes information which is stored in epoch to * rebuild old_vnode_info. */ for (int i = 0; i < sys->cinfo.nr_nodes; i++) { struct sd_node *new = xmalloc(sizeof(*new)); *new = sys->cinfo.nodes[i]; if (rb_insert(&old_root, new, rb, node_cmp)) panic("node hash collision"); } old = alloc_vnode_info(&old_root); rb_destroy(&old_root, struct sd_node, rb); return old; } static void setup_backend_store(const struct cluster_info *cinfo) { int ret; if (cinfo->store[0] == '\0') return; if (!sd_store) { sd_store = find_store_driver((char *)cinfo->store); if (!sd_store) panic("backend store %s not supported", cinfo->store); ret = sd_store->init(); if (ret != SD_RES_SUCCESS) panic("failed to initialize store"); } /* * We need to purge the stale objects for sheep joining back * after crash */ if (xlfind(&sys->this_node, cinfo->nodes, cinfo->nr_nodes, node_cmp) == NULL) { ret = sd_store->purge_obj(); if (ret != SD_RES_SUCCESS) panic("can't remove stale objects"); } } static void get_vdis(const struct rb_root *nroot, const struct sd_node *joined) { struct get_vdis_work *w; w = xmalloc(sizeof(*w)); w->joined = *joined; INIT_RB_ROOT(&w->nroot); rb_copy(nroot, struct sd_node, rb, &w->nroot, node_cmp); refcount_inc(&nr_get_vdis_works); w->work.fn = do_get_vdis; w->work.done = get_vdis_done; queue_work(sys->block_wqueue, &w->work); } void wait_get_vdis_done(void) { sd_debug("waiting for vdi list"); sd_mutex_lock(&wait_vdis_lock); while (refcount_read(&nr_get_vdis_works) > 0) sd_cond_wait(&wait_vdis_cond, &wait_vdis_lock); sd_mutex_unlock(&wait_vdis_lock); sd_debug("vdi list ready"); } static void update_cluster_info(const struct cluster_info *cinfo, const struct sd_node *joined, const struct rb_root *nroot, size_t nr_nodes) { struct vnode_info *old_vnode_info; sd_debug("status = %d, epoch = %d", cinfo->status, cinfo->epoch); if (!sys->gateway_only) setup_backend_store(cinfo); if (node_is_local(joined)) sockfd_cache_add_group(nroot); sockfd_cache_add(&joined->nid); /* * We need use main_thread_get() to obtain current_vnode_info. The * reference count of old_vnode_info is decremented at the last of this * function in order to release old_vnode_info. The counter part * of this dereference is alloc_vnode_info(). 
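 * In other words, alloc_vnode_info() returns a reference with
 * refcnt == 1, which main_thread_set() stores; the put_vnode_info() of
 * old_vnode_info at the end of this function drops the reference the
 * previous main_thread_set() owned.
 */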
*/ old_vnode_info = main_thread_get(current_vnode_info); main_thread_set(current_vnode_info, alloc_vnode_info(nroot)); get_vdis(nroot, joined); if (cinfo->status == SD_STATUS_OK) { if (!is_cluster_formatted()) /* initialize config file */ set_cluster_config(&sys->cinfo); if (nr_nodes != cinfo->nr_nodes) { int ret; if (old_vnode_info) put_vnode_info(old_vnode_info); old_vnode_info = alloc_old_vnode_info(); ret = inc_and_log_epoch(); if (ret != 0) panic("cannot log current epoch %d", sys->cinfo.epoch); start_recovery(main_thread_get(current_vnode_info), old_vnode_info, true); } else if (!was_cluster_shutdowned()) { start_recovery(main_thread_get(current_vnode_info), main_thread_get(current_vnode_info), false); } set_cluster_shutdown(false); } put_vnode_info(old_vnode_info); } /* * Pass on a notification message from the cluster driver. * * Must run in the main thread as it accesses unlocked state like * sys->pending_list. */ main_fn void sd_notify_handler(const struct sd_node *sender, void *data, size_t data_len) { struct vdi_op_message *msg = data; const struct sd_op_template *op = get_sd_op(msg->req.opcode); int ret = msg->rsp.result; struct request *req = NULL; sd_debug("op %s, size: %zu, from: %s", op_name(op), data_len, node_to_str(sender)); if (node_is_local(sender)) { if (has_process_work(op)) req = list_first_entry( main_thread_get(pending_block_list), struct request, pending_list); else req = list_first_entry( main_thread_get(pending_notify_list), struct request, pending_list); list_del(&req->pending_list); } if (ret == SD_RES_SUCCESS && has_process_main(op)) ret = do_process_main(op, &msg->req, &msg->rsp, msg->data); if (req) { msg->rsp.result = ret; if (has_process_main(req->op) && !(req->rq.flags & SD_FLAG_CMD_WRITE)) memcpy(req->data, msg->data, msg->rsp.data_length); memcpy(&req->rp, &msg->rsp, sizeof(req->rp)); put_request(req); } if (has_process_work(op)) cluster_op_running = false; } /* * Accept the joining node and pass the cluster info to it. * * Note that 'nodes' doesn't contain 'joining'. * * Return true if the joining node is accepted. At least one nodes in the * cluster must call this function and succeed in accept of the joining node. */ main_fn bool sd_join_handler(const struct sd_node *joining, const struct rb_root *nroot, size_t nr_nodes, void *opaque) { struct cluster_info *cinfo = opaque; enum sd_status status; /* * If nr_nodes is 0, the joining node is the first member of the cluster * and joins sheepdog successfully without any check. If nr_nodes is * not 0, the joining node has to wait for another node to accept it. 
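 *
 * Example (illustrative): node A bootstrapping an empty cluster sees
 * nr_nodes == 0 and accepts itself immediately.  When node B joins
 * later, B itself returns false here and waits, while A, an existing
 * member, performs the status check below and accepts B.
 */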
*/ if (nr_nodes > 0 && node_is_local(joining)) { sd_debug("wait for another node to accept this node"); return false; } sd_debug("check %s, %d", node_to_str(joining), sys->cinfo.status); if (sys->cinfo.status == SD_STATUS_WAIT) status = cluster_wait_check(joining, nroot, nr_nodes, cinfo); else status = sys->cinfo.status; cluster_info_copy(cinfo, &sys->cinfo); cinfo->status = status; cinfo->proto_ver = SD_SHEEP_PROTO_VER; sd_debug("%s: cluster_status = 0x%x", addr_to_str(joining->nid.addr, joining->nid.port), cinfo->status); return true; } static int send_join_request(void) { struct sd_node *n = &sys->this_node; sd_info("%s going to join the cluster", node_to_str(n)); return sys->cdrv->join(n, &sys->cinfo, sizeof(sys->cinfo)); } static void requeue_cluster_request(void) { struct request *req; struct vdi_op_message *msg; size_t size; list_for_each_entry(req, main_thread_get(pending_notify_list), pending_list) { /* * ->notify() was called and succeeded but after that * this node session-timeouted and sd_notify_handler * wasn't called from notify event handler in cluster * driver. We manually call sd_notify_handler to finish * the request. */ sd_debug("finish pending notify request, op: %s", op_name(req->op)); msg = prepare_cluster_msg(req, &size); sd_notify_handler(&sys->this_node, msg, size); free(msg); } list_for_each_entry(req, main_thread_get(pending_block_list), pending_list) { switch (req->status) { case REQUEST_INIT: /* this request has never been executed, re-queue it */ sd_debug("requeue a block request, op: %s", op_name(req->op)); list_del(&req->pending_list); queue_cluster_request(req); break; case REQUEST_QUEUED: /* * This request is being handled by the 'block' thread * and ->unblock() isn't called yet. We can't call * ->unblock thereafter because other sheep has * unblocked themselves due to cluster driver session * timeout. Mark it as dropped to stop cluster_op_done() * from calling ->unblock. */ sd_debug("drop pending block request, op: %s", op_name(req->op)); req->status = REQUEST_DROPPED; break; case REQUEST_DONE: /* * ->unblock() was called and succeeded but after that * this node session-timeouted and sd_notify_handler * wasn't called from unblock event handler in cluster * driver. We manually call sd_notify_handler to finish * the request. */ sd_debug("finish pending block request, op: %s", op_name(req->op)); msg = prepare_cluster_msg(req, &size); sd_notify_handler(&sys->this_node, msg, size); free(msg); break; default: break; } } } main_fn int sd_reconnect_handler(void) { sys->cinfo.status = SD_STATUS_WAIT; if (sys->cdrv->init(sys->cdrv_option) != 0) return -1; if (send_join_request() != 0) return -1; requeue_cluster_request(); return 0; } static bool cluster_join_check(const struct cluster_info *cinfo) { if (cinfo->proto_ver != SD_SHEEP_PROTO_VER) { sd_err("invalid protocol version: %d, %d", cinfo->proto_ver, SD_SHEEP_PROTO_VER); return false; } if (!cluster_ctime_check(cinfo)) return false; /* * Sheepdog's recovery code assumes every node have the same epoch * history. But we don't check epoch history of joining node because: * 1. inconsist epoch history only happens in the network partition case * for the corosync driver, but corosync driver will panic for such * case to prevent epoch inconsistency. * 2. checking epoch history with joining node is too expensive and is * unneeded for zookeeper driver. * * That said, we don't check epoch history at all. 
*/ return true; } main_fn void sd_accept_handler(const struct sd_node *joined, const struct rb_root *nroot, size_t nr_nodes, const void *opaque) { const struct cluster_info *cinfo = opaque; struct sd_node *n; if (node_is_local(joined) && !cluster_join_check(cinfo)) { sd_err("failed to join Sheepdog"); exit(1); } cluster_info_copy(&sys->cinfo, cinfo); sd_debug("join %s", node_to_str(joined)); rb_for_each_entry(n, nroot, rb) { sd_debug("%s", node_to_str(n)); } if (sys->cinfo.status == SD_STATUS_SHUTDOWN) return; update_cluster_info(cinfo, joined, nroot, nr_nodes); if (node_is_local(joined)) /* this output is used for testing */ sd_debug("join Sheepdog cluster"); } main_fn void sd_leave_handler(const struct sd_node *left, const struct rb_root *nroot, size_t nr_nodes) { struct vnode_info *old_vnode_info; struct sd_node *n; int ret; sd_debug("leave %s", node_to_str(left)); rb_for_each_entry(n, nroot, rb) { sd_debug("%s", node_to_str(n)); } if (sys->cinfo.status == SD_STATUS_SHUTDOWN) return; if (node_is_local(left)) /* Mark leave node as gateway only node */ sys->this_node.nr_vnodes = 0; /* * Using main_thread_get() instead of get_vnode_info() is allowed * because of the same reason of update_cluster_info() */ old_vnode_info = main_thread_get(current_vnode_info); main_thread_set(current_vnode_info, alloc_vnode_info(nroot)); if (sys->cinfo.status == SD_STATUS_OK) { ret = inc_and_log_epoch(); if (ret != 0) panic("cannot log current epoch %d", sys->cinfo.epoch); start_recovery(main_thread_get(current_vnode_info), old_vnode_info, true); } put_vnode_info(old_vnode_info); sockfd_cache_del_node(&left->nid); } static void update_node_size(struct sd_node *node) { struct vnode_info *cur_vinfo = get_vnode_info(); struct sd_node *n = rb_search(&cur_vinfo->nroot, node, rb, node_cmp); if (unlikely(!n)) panic("can't find %s", node_to_str(node)); n->space = node->space; put_vnode_info(cur_vinfo); } static void kick_node_recover(void) { /* * Using main_thread_get() instead of get_vnode_info() is allowed * because of the same reason of update_cluster_info() */ struct vnode_info *old = main_thread_get(current_vnode_info); int ret; main_thread_set(current_vnode_info, alloc_vnode_info(&old->nroot)); ret = inc_and_log_epoch(); if (ret != 0) panic("cannot log current epoch %d", sys->cinfo.epoch); start_recovery(main_thread_get(current_vnode_info), old, true); put_vnode_info(old); } main_fn void sd_update_node_handler(struct sd_node *node) { update_node_size(node); kick_node_recover(); } int create_cluster(int port, int64_t zone, int nr_vnodes, bool explicit_addr) { int ret; if (!sys->cdrv) { sys->cdrv = find_cdrv(DEFAULT_CLUSTER_DRIVER); sd_debug("use %s cluster driver as default", DEFAULT_CLUSTER_DRIVER); } ret = sys->cdrv->init(sys->cdrv_option); if (ret < 0) return -1; if (!explicit_addr) { ret = sys->cdrv->get_local_addr(sys->this_node.nid.addr); if (ret < 0) return -1; } sys->this_node.nid.port = port; sys->this_node.nr_vnodes = nr_vnodes; if (zone == -1) { /* use last 4 bytes as zone id */ uint8_t *b = sys->this_node.nid.addr + 12; sys->this_node.zone = b[0] | b[1] << 8 | b[2] << 16 | b[3] << 24; } else sys->this_node.zone = zone; sd_debug("zone id = %u", sys->this_node.zone); sys->this_node.space = sys->disk_space; sys->cinfo.epoch = get_latest_epoch(); if (sys->cinfo.epoch) { sys->cinfo.nr_nodes = epoch_log_read(sys->cinfo.epoch, sys->cinfo.nodes, sizeof(sys->cinfo.nodes)); if (sys->cinfo.nr_nodes == -1) return -1; } sys->cinfo.status = SD_STATUS_WAIT; main_thread_set(pending_block_list, xzalloc(sizeof(struct 
list_head))); INIT_LIST_HEAD(main_thread_get(pending_block_list)); main_thread_set(pending_notify_list, xzalloc(sizeof(struct list_head))); INIT_LIST_HEAD(main_thread_get(pending_notify_list)); INIT_LIST_HEAD(&sys->local_req_queue); INIT_LIST_HEAD(&sys->req_wait_queue); ret = send_join_request(); if (ret != 0) return -1; return 0; } /* * We will call this function for two reason: * 1) make this node working as a gateway, or * 2) the program is going to shutdown itself. */ int leave_cluster(void) { static bool left; if (left) return 0; left = true; return sys->cdrv->leave(); } sheepdog-0.8.3/sheep/http/000077500000000000000000000000001237656255000154165ustar00rootroot00000000000000sheepdog-0.8.3/sheep/http/http.c000066400000000000000000000226051237656255000165460ustar00rootroot00000000000000/* * Copyright (C) 2013 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* This files implement RESTful interface to sheepdog storage via fastcgi */ #include "http.h" #include "sheep_priv.h" #include "option.h" static const char *http_host = "localhost"; static const char *http_port = "8000"; LIST_HEAD(http_drivers); static LIST_HEAD(http_enabled_drivers); static inline const char *stropcode(enum http_opcode opcode) { static const char *const descs[] = { [HTTP_GET] = "GET", [HTTP_PUT] = "PUT", [HTTP_POST] = "POST", [HTTP_DELETE] = "DELETE", [HTTP_HEAD] = "HEAD", }; if (descs[opcode] == NULL) { static __thread char msg[32]; snprintf(msg, sizeof(msg), "Invalid opcode %d", opcode); return msg; } return descs[opcode]; } static inline const char *strstatus(enum http_status status) { static const char *const descs[] = { [UNKNOWN] = "Unknown", [OK] = "200 OK", [CREATED] = "201 Created", [ACCEPTED] = "202 Accepted", [NO_CONTENT] = "204 No Content", [PARTIAL_CONTENT] = "206 Partial Content", [BAD_REQUEST] = "400 Bad Request", [UNAUTHORIZED] = "401 Unauthorized", [NOT_FOUND] = "404 Not Found", [METHOD_NOT_ALLOWED] = "405 Method Not Allowed", [CONFLICT] = "409 Conflict", [REQUEST_RANGE_NOT_SATISFIABLE] = "416 Requested Range Not Satisfiable", [INTERNAL_SERVER_ERROR] = "500 Internal Server Error", [NOT_IMPLEMENTED] = "501 Not Implemented", [SERVICE_UNAVAILABLE] = "503 Service_Unavailable", }; if (descs[status] == NULL) { static __thread char msg[32]; snprintf(msg, sizeof(msg), "Invalid Status %d", status); return msg; } return descs[status]; } const char *str_http_req(const struct http_request *req) { static __thread char msg[1024]; snprintf(msg, sizeof(msg), "%s %s, status = %s, data_length = %"PRIu64, req->uri, stropcode(req->opcode), strstatus(req->status), req->data_length); return msg; } struct http_work { struct work work; struct http_request *request; }; static inline void http_request_error(struct http_request *req) { int ret = FCGX_GetError(req->fcgx.out); if (ret == 0) return; else if (ret < 0) sd_err("failed, FCGI error %d", ret); else sd_err("failed, %s", strerror(ret)); } int http_request_write(struct http_request *req, const void *buf, int len) { int ret = FCGX_PutStr(buf, len, req->fcgx.out); if (ret < 0) http_request_error(req); return ret; } int http_request_read(struct http_request *req, void *buf, int len) { int ret = FCGX_GetStr(buf, len, req->fcgx.in); if (ret < 0) http_request_error(req); return ret; } int 
http_request_writes(struct http_request *req, const char *str) { int ret = FCGX_PutS(str, req->fcgx.out); if (ret < 0) http_request_error(req); return ret; } __printf(2, 3) int http_request_writef(struct http_request *req, const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); ret = FCGX_VFPrintF(req->fcgx.out, fmt, ap); va_end(ap); if (ret < 0) http_request_error(req); return ret; } static int request_init_operation(struct http_request *req) { char **env = req->fcgx.envp; char *p, *endp; p = FCGX_GetParam("REQUEST_METHOD", env); if (!strcmp(p, "PUT")) { req->opcode = HTTP_PUT; } else if (!strcmp(p, "GET")) { req->opcode = HTTP_GET; } else if (!strcmp(p, "POST")) { req->opcode = HTTP_POST; } else if (!strcmp(p, "DELETE")) { req->opcode = HTTP_DELETE; } else if (!strcmp(p, "HEAD")) { req->opcode = HTTP_HEAD; } else { return BAD_REQUEST; } p = FCGX_GetParam("CONTENT_LENGTH", env); if (p[0] != '\0') { req->data_length = strtoll(p, &endp, 10); if (p == endp) { sd_err("invalid content_length %s", p); return BAD_REQUEST; } } req->uri = FCGX_GetParam("DOCUMENT_URI", env); if (!req->uri) return BAD_REQUEST; p = FCGX_GetParam("HTTP_RANGE", env); if (p && p[0] != '\0') { const char prefix[] = "bytes="; char *left, *right, num[64]; uint64_t max; left = strstr(p, prefix); if (!p) goto invalid_range; right = strchr(left, '-'); strncpy(num, left + sizeof(prefix) - 1, right - left); req->offset = strtoll(num, &endp, 10); if (num == endp) goto invalid_range; strcpy(num, right + 1); /* * In swift spec, the second number of RANGE should be included * which means [num1, num2], but our common means for read and * write data by 'offset' and 'len' is [num1, num2), so we * should add 1 to num2. */ max = strtoll(num, &endp, 10) + 1; if (num == endp) goto invalid_range; if (max <= req->offset) goto invalid_range; req->data_length = max - req->offset; sd_debug("HTTP_RANGE: %"PRIu64" %"PRIu64, req->offset, max); } req->status = UNKNOWN; return OK; invalid_range: sd_err("invalid range %s", p); return REQUEST_RANGE_NOT_SATISFIABLE; } static int http_init_request(struct http_request *req) { char *p; for (int i = 0; (p = req->fcgx.envp[i]); ++i) sd_debug("%s", p); return request_init_operation(req); } /* This function does nothing if we have already printed a status code. 
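 *
 * For a 200 GET reply whose body is 13 bytes, the emitted header lines
 * would look like this (illustrative):
 *
 *	Status: 200 OK\r\n
 *	Content-Length: 13\r\n
 *	Content-type: text/plain;\r\n
 *	\r\n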
*/ void http_response_header(struct http_request *req, enum http_status status) { if (req->status != UNKNOWN) return; req->status = status; http_request_writef(req, "Status: %s\r\n", strstatus(status)); if (req->opcode == HTTP_GET || req->opcode == HTTP_HEAD) http_request_writef(req, "Content-Length: %"PRIu64"\r\n", req->data_length); http_request_writes(req, "Content-type: text/plain;\r\n\r\n"); } static void http_end_request(struct http_request *req) { FCGX_Finish_r(&req->fcgx); free(req); } static void http_run_request(struct work *work) { struct http_work *hw = container_of(work, struct http_work, work); struct http_request *req = hw->request; int op = req->opcode; struct http_driver *hdrv; list_for_each_entry(hdrv, &http_enabled_drivers, list) { void (*method)(struct http_request *req) = NULL; switch (op) { case HTTP_HEAD: method = hdrv->head; break; case HTTP_GET: method = hdrv->get; break; case HTTP_PUT: method = hdrv->put; break; case HTTP_POST: method = hdrv->post; break; case HTTP_DELETE: method = hdrv->delete; break; default: break; } if (method != NULL) { method(req); sd_debug("req->status %d", req->status); if (req->status != UNKNOWN) goto out; } } http_response_header(req, METHOD_NOT_ALLOWED); out: http_end_request(req); } static void http_request_done(struct work *work) { struct http_work *hw = container_of(work, struct http_work, work); free(hw); } static void http_queue_request(struct http_request *req) { struct http_work *hw = xmalloc(sizeof(*hw)); hw->work.fn = http_run_request; hw->work.done = http_request_done; hw->request = req; queue_work(sys->http_wqueue, &hw->work); } static inline struct http_request *http_new_request(int sockfd) { struct http_request *req = xzalloc(sizeof(*req)); FCGX_InitRequest(&req->fcgx, sockfd, 0); return req; } static int http_sockfd; static void *http_main_loop(void *ignored) { int err; for (;;) { struct http_request *req = http_new_request(http_sockfd); int ret; ret = FCGX_Accept_r(&req->fcgx); if (ret < 0) { sd_err("accept failed, %d, %d", http_sockfd, ret); goto out; } ret = http_init_request(req); if (ret != OK) { http_response_header(req, ret); http_end_request(req); continue; } http_queue_request(req); } out: err = pthread_detach(pthread_self()); if (err) sd_err("%s", strerror(err)); pthread_exit(NULL); } static int http_opt_host_parser(const char *s) { http_host = s; return 0; } static int http_opt_port_parser(const char *s) { http_port = s; return 0; } static int http_opt_default_parser(const char *s) { struct http_driver *hdrv; hdrv = find_hdrv(&http_enabled_drivers, s); if (hdrv != NULL) { sd_err("%s driver is already enabled", hdrv->name); return -1; } hdrv = find_hdrv(&http_drivers, s); if (hdrv == NULL) { sd_err("'%s' is not a valid driver name", s); return -1; } if (hdrv->init(get_hdrv_option(hdrv, s)) < 0) { sd_err("failed to initialize %s driver", hdrv->name); return -1; } list_move_tail(&hdrv->list, &http_enabled_drivers); return 0; } static struct option_parser http_opt_parsers[] = { { "host=", http_opt_host_parser }, { "port=", http_opt_port_parser }, { "", http_opt_default_parser }, { NULL, NULL }, }; int http_init(const char *options) { pthread_t t; int err; char *s, address[HOST_NAME_MAX + 8]; s = strdup(options); if (s == NULL) { sd_emerg("OOM"); return -1; } if (option_parse(s, ",", http_opt_parsers) < 0) return -1; if (list_empty(&http_enabled_drivers)) { http_opt_default_parser("swift"); sd_debug("Use swift as default http driver"); } sys->http_wqueue = create_work_queue("http", WQ_DYNAMIC); if (!sys->http_wqueue) 
return -1; FCGX_Init(); #define LISTEN_QUEUE_DEPTH 1024 /* No rationale */ snprintf(address, sizeof(address), "%s:%s", http_host, http_port); http_sockfd = FCGX_OpenSocket(address, LISTEN_QUEUE_DEPTH); if (http_sockfd < 0) { sd_err("open socket failed, address %s", address); return -1; } sd_info("http service listen at %s", address); err = pthread_create(&t, NULL, http_main_loop, NULL); if (err) { sd_err("%s", strerror(err)); return -1; } return 0; } sheepdog-0.8.3/sheep/http/http.h000066400000000000000000000111741237656255000165520ustar00rootroot00000000000000/* * Copyright (C) 2013 MORITA Kazutaka * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SHEEP_HTTP_H__ #define __SHEEP_HTTP_H__ #include #include "sheepdog_proto.h" #include "sheep.h" enum http_opcode { HTTP_GET = 1, HTTP_PUT, HTTP_POST, HTTP_DELETE, HTTP_HEAD, }; enum http_status { UNKNOWN = 0, OK, /* 200 */ CREATED, /* 201 */ ACCEPTED, /* 202 */ NO_CONTENT, /* 204 */ PARTIAL_CONTENT, /* 206 */ BAD_REQUEST, /* 400 */ UNAUTHORIZED, /* 401 */ NOT_FOUND, /* 404 */ METHOD_NOT_ALLOWED, /* 405 */ CONFLICT, /* 409 */ REQUEST_RANGE_NOT_SATISFIABLE, /* 416 */ INTERNAL_SERVER_ERROR, /* 500 */ NOT_IMPLEMENTED, /* 501 */ SERVICE_UNAVAILABLE, /* 503 */ }; struct http_request { FCGX_Request fcgx; char *uri; enum http_opcode opcode; enum http_status status; uint64_t data_length; uint64_t offset; }; struct http_driver { const char *name; /* Returns zero on success, -1 on error. */ int (*init)(const char *option); void (*head)(struct http_request *req); void (*get)(struct http_request *req); void (*put)(struct http_request *req); void (*post)(struct http_request *req); void (*delete)(struct http_request *req); struct list_node list; }; extern struct list_head http_drivers; #define hdrv_register(driver) \ static void __attribute__((constructor)) register_ ## driver(void) \ { \ list_add(&driver.list, &http_drivers); \ } static inline struct http_driver *find_hdrv(struct list_head *drivers, const char *name) { struct http_driver *hdrv; int len; list_for_each_entry(hdrv, drivers, list) { len = strlen(hdrv->name); if (strncmp(hdrv->name, name, len) == 0 && (name[len] == ':' || name[len] == '\0')) return hdrv; } return NULL; } static inline const char *get_hdrv_option(const struct http_driver *hdrv, const char *arg) { int len = strlen(hdrv->name); if (arg[len] == ':') return strdup(arg + len + 1); else return NULL; } const char *str_http_req(const struct http_request *req); void http_response_header(struct http_request *req, enum http_status status); int http_request_read(struct http_request *req, void *buf, int len); int http_request_write(struct http_request *req, const void *buf, int len); int http_request_writes(struct http_request *req, const char *str); __printf(2, 3) int http_request_writef(struct http_request *req, const char *fmt, ...); /* For kv.c */ #define SD_MAX_BUCKET_NAME 256 #define SD_MAX_OBJECT_NAME 1024 /* Account operations */ int kv_create_account(const char *account); int kv_read_account_meta(struct http_request *req, const char *account); int kv_update_account(const char *account); int kv_delete_account(struct http_request *req, const char *account); int kv_list_accounts(struct http_request *req, void (*cb)(struct http_request *req, const char *account, void *opaque), void 
*opaque); /* Bucket operations */ int kv_create_bucket(const char *account, const char *bucket); int kv_read_bucket(struct http_request *req, const char *account, const char *bucket); int kv_update_bucket(const char *account, const char *bucket); int kv_delete_bucket(const char *account, const char *bucket); int kv_iterate_bucket(const char *account, void (*cb)(const char *bucket, void *opaque), void *opaque); /* Object operations */ int kv_create_object(struct http_request *req, const char *account, const char *bucket, const char *object); int kv_read_object(struct http_request *req, const char *account, const char *bucket, const char *object); int kv_read_object_meta(struct http_request *req, const char *account, const char *bucket, const char *object); int kv_delete_object(const char *account, const char *bucket, const char *); int kv_iterate_object(const char *account, const char *bucket, void (*cb)(const char *object, void *opaque), void *opaque); /* object_allocator.c */ int oalloc_new_prepare(uint32_t vid, uint64_t *start, uint64_t count); int oalloc_new_finish(uint32_t vid, uint64_t start, uint64_t count); int oalloc_free(uint32_t vid, uint64_t start, uint64_t count); int oalloc_init(uint32_t vid); #endif /* __SHEEP_HTTP_H__ */ sheepdog-0.8.3/sheep/http/kv.c000066400000000000000000000730561237656255000162150ustar00rootroot00000000000000/* * Copyright (C) 2013 MORITA Kazutaka * Copyright (C) 2013 Robin Dong * Copyright (C) 2013 Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* This file implements backend kv functions for object storage. */ #include "sheep_priv.h" #include "http.h" struct kv_bnode { char name[SD_MAX_BUCKET_NAME]; uint64_t object_count; uint64_t bytes_used; uint64_t oid; }; struct onode_extent { uint64_t start; uint64_t count; }; struct kv_onode { union { struct { char name[SD_MAX_OBJECT_NAME]; /* a hash value for etag */ uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)]; uint64_t size; uint64_t mtime; uint32_t data_vid; uint32_t nr_extent; uint64_t oid; uint8_t inlined; }; uint8_t pad[BLOCK_SIZE]; }; union { uint8_t data[SD_DATA_OBJ_SIZE - BLOCK_SIZE]; struct onode_extent o_extent[0]; }; }; typedef void (*bucket_iter_cb)(const char *bucket, void *opaque); struct bucket_iterater_arg { void *opaque; bucket_iter_cb cb; uint64_t bucket_count; uint64_t object_count; uint64_t bytes_used; }; static int kv_create_hyper_volume(const char *name, uint32_t *vdi_id) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; char buf[SD_MAX_VDI_LEN] = {0}; pstrcpy(buf, SD_MAX_VDI_LEN, name); sd_init_req(&hdr, SD_OP_NEW_VDI); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = SD_MAX_VDI_LEN; hdr.vdi.vdi_size = SD_MAX_VDI_SIZE; hdr.vdi.copies = sys->cinfo.nr_copies; hdr.vdi.copy_policy = sys->cinfo.copy_policy; hdr.vdi.store_policy = 1; ret = exec_local_req(&hdr, buf); if (rsp->result != SD_RES_SUCCESS) sd_err("Failed to create VDI %s: %s", name, sd_strerror(rsp->result)); if (vdi_id) *vdi_id = rsp->vdi.vdi_id; return ret; } /* Account operations */ /* * Account can have unlimited buckets, each of which can contain unlimited user * KV objects. * * For a URI such as /$account/$bucket/$object: * * kv_bnode helps us find the desired bucket by $bucket * | * V * $account --> [bucket1, bucket2, bucket3, ...] 
* | * | kv_onode helps us find the desired object by $object * V * [object1, object2, ...] * * We assign a hyper volume for each account to hold the kv_bnodes(bucket index * node), each of which point to a bucket(also a hyper volume), into which we * store kv_onodes, that maps to user kv data objects. */ int kv_create_account(const char *account) { uint32_t vdi_id; return kv_create_hyper_volume(account, &vdi_id); } static void bucket_iterater(void *data, enum btree_node_type type, void *arg) { struct sd_extent *ext; struct bucket_iterater_arg *biarg = arg; struct kv_bnode bnode; uint64_t oid; int ret; if (type == BTREE_EXT) { ext = (struct sd_extent *)data; if (!ext->vdi_id) return; oid = vid_to_data_oid(ext->vdi_id, ext->idx); ret = sd_read_object(oid, (char *)&bnode, sizeof(bnode), 0); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read data object %"PRIx64, oid); return; } if (bnode.name[0] == 0) return; if (biarg->cb) biarg->cb(bnode.name, biarg->opaque); biarg->bucket_count++; biarg->object_count += bnode.object_count; biarg->bytes_used += bnode.bytes_used; } } static int read_account_meta(const char *account, uint64_t *bucket_count, uint64_t *object_count, uint64_t *used) { struct sd_inode *inode = NULL; struct bucket_iterater_arg arg = {}; uint32_t account_vid; uint64_t oid; int ret; ret = sd_lookup_vdi(account, &account_vid); if (ret != SD_RES_SUCCESS) goto out; oid = vid_to_vdi_oid(account_vid); inode = xmalloc(sizeof(*inode)); ret = sd_read_object(oid, (char *)inode, sizeof(struct sd_inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read inode header %"PRIx64, oid); goto out; } traverse_btree(sheep_bnode_reader, inode, bucket_iterater, &arg); *object_count = arg.object_count; *bucket_count = arg.bucket_count; *used = arg.bytes_used; out: free(inode); return ret; } int kv_read_account_meta(struct http_request *req, const char *account) { uint64_t bcount, ocount, used; int ret; ret = read_account_meta(account, &bcount, &ocount, &used); if (ret != SD_RES_SUCCESS) return ret; http_request_writef(req, "X-Account-Container-Count: %"PRIu64"\n", bcount); http_request_writef(req, "X-Account-Object-Count: %"PRIu64"\n", ocount); http_request_writef(req, "X-Account-Bytes-Used: %"PRIu64"\n", used); return ret; } int kv_update_account(const char *account) { /* TODO: update metadata of the account */ return -1; } int kv_delete_account(struct http_request *req, const char *account) { uint64_t bcount, ocount, used; int ret; ret = read_account_meta(account, &bcount, &ocount, &used); if (ret != SD_RES_SUCCESS) return ret; if (bcount) return SD_RES_VDI_NOT_EMPTY; ret = sd_delete_vdi(account); if (ret != SD_RES_SUCCESS) sd_err("Failed to delete vdi %s", account); return ret; } /* Bucket operations */ /* * We use two VDIs to implement bucket abstraction: one stores 'struct kv_onode' * which is an index node for user data and the other actually stores kv data, * which use oalloc.c to manage free space. * * The first vdi is named as "$account/$bucket" and the second vdi as * "$account/$bucket/allocator". * * For example: bucket "fruit" with account 'coly' has two objects "banana" * and "apple" * * Account: coly * +-----------------------+ * | kv_bnode: fruit | ... | <--- account_vid * +-----------------------+ * | +--------------------- kv_onode ---------------------+ * | | | * \ v v * \ +---------------------------------------------------------+ * bucket_vdi \---> |coly/fruit | ... 
| kv_onode: banana | kv_onode: apple | * +---------------------------------------------------------+ * | | * oalloc.c manages allocation and deallocation | | * v v * +---------------------------+---+-----------------+ * data_vid |coly/fruit/allocator |...| data | * +---------------------------+---+-----------------+ */ static int bnode_do_create(struct kv_bnode *bnode, struct sd_inode *inode, uint32_t idx) { uint32_t vid = inode->vdi_id; uint64_t oid = vid_to_data_oid(vid, idx); int ret; bnode->oid = oid; ret = sd_write_object(oid, (char *)bnode, sizeof(*bnode), 0, true); if (ret != SD_RES_SUCCESS) { sd_err("failed to create object, %" PRIx64, oid); goto out; } INODE_SET_VID(inode, idx, vid); ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx, vid, vid, 0, false, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update inode, %" PRIx64, vid_to_vdi_oid(vid)); goto out; } out: return ret; } static int bnode_create(struct kv_bnode *bnode, uint32_t account_vid) { struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); uint32_t tmp_vid, idx; uint64_t hval, i; int ret; ret = sd_read_object(vid_to_vdi_oid(account_vid), (char *)inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read %" PRIx32 " %s", account_vid, sd_strerror(ret)); goto out; } hval = sd_hash(bnode->name, strlen(bnode->name)); for (i = 0; i < MAX_DATA_OBJS; i++) { idx = (hval + i) % MAX_DATA_OBJS; tmp_vid = INODE_GET_VID(inode, idx); if (tmp_vid) continue; else break; } if (i == MAX_DATA_OBJS) { ret = SD_RES_NO_SPACE; goto out; } ret = bnode_do_create(bnode, inode, idx); out: free(inode); return ret; } static int bucket_create(const char *account, uint32_t account_vid, const char *bucket) { char onode_name[SD_MAX_VDI_LEN]; char alloc_name[SD_MAX_VDI_LEN]; struct kv_bnode bnode; uint32_t vid; int ret; snprintf(onode_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket); ret = kv_create_hyper_volume(onode_name, &vid); if (ret != SD_RES_SUCCESS) { sd_err("Failed to create bucket %s onode vid", bucket); return ret; } snprintf(alloc_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account, bucket); ret = kv_create_hyper_volume(alloc_name, &vid); if (ret != SD_RES_SUCCESS) { sd_err("Failed to create bucket %s data vid", bucket); sd_delete_vdi(onode_name); return ret; } ret = oalloc_init(vid); if (ret != SD_RES_SUCCESS) { sd_err("Failed to init allocator for bucket %s", bucket); goto err; } pstrcpy(bnode.name, sizeof(bnode.name), bucket); bnode.bytes_used = 0; bnode.object_count = 0; ret = bnode_create(&bnode, account_vid); if (ret != SD_RES_SUCCESS) goto err; return SD_RES_SUCCESS; err: sd_delete_vdi(onode_name); sd_delete_vdi(alloc_name); return ret; } static int bnode_lookup(struct kv_bnode *bnode, uint32_t vid, const char *name) { uint64_t hval, i; int ret; hval = sd_hash(name, strlen(name)); for (i = 0; i < MAX_DATA_OBJS; i++) { uint32_t idx = (hval + i) % MAX_DATA_OBJS; uint64_t oid = vid_to_data_oid(vid, idx); ret = sd_read_object(oid, (char *)bnode, sizeof(*bnode), 0); if (ret != SD_RES_SUCCESS) goto out; if (strcmp(bnode->name, name) == 0) break; } if (i == MAX_DATA_OBJS) ret = SD_RES_NO_OBJ; out: return ret; } /* * For object create/delete, we can't easily maintain the bnode consistent by * playing around the operations order. * * We should inform the user the deletion failure if bnode_update() fails even * though we might delete the onode successfully. Then subsequent 'delete' for * the same object won't skew up the bnode metadata. 
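 *
 * Concretely (an illustrative example): if a PUT stores a 4096 byte
 * object but the following bnode_update() fails, reporting the failure
 * lets the client retry, so object_count and bytes_used don't silently
 * drift from what the bucket really holds.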
* The true fix for the inconsistency (for whatever reasons it happens) is a * check request that does a server side consistency check. This is left for a * future patch. * * An alternative fix is to drop the redundant bytes_used and object_count * data from the bnode, so that a "HEAD" operation just iterates all * the objects. This can't scale when a bucket holds a huge number of objects. */ static int bnode_update(const char *account, const char *bucket, uint64_t used, bool create) { uint32_t account_vid; struct kv_bnode bnode; int ret; ret = sd_lookup_vdi(account, &account_vid); if (ret != SD_RES_SUCCESS) { sd_err("Failed to find account %s", account); return ret; } ret = bnode_lookup(&bnode, account_vid, bucket); if (ret != SD_RES_SUCCESS) return ret; if (create) { bnode.object_count++; bnode.bytes_used += used; } else { bnode.object_count--; bnode.bytes_used -= used; } ret = sd_write_object(bnode.oid, (char *)&bnode, sizeof(bnode), 0, 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to update bnode for %s", bucket); return ret; } return SD_RES_SUCCESS; } static int bucket_delete(const char *account, uint32_t avid, const char *bucket) { struct kv_bnode bnode; char onode_name[SD_MAX_VDI_LEN]; char alloc_name[SD_MAX_VDI_LEN]; int ret; snprintf(onode_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket); snprintf(alloc_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account, bucket); ret = bnode_lookup(&bnode, avid, bucket); if (ret != SD_RES_SUCCESS) return ret; if (bnode.object_count > 0) return SD_RES_VDI_NOT_EMPTY; ret = sd_discard_object(bnode.oid); if (ret != SD_RES_SUCCESS) { sd_err("failed to discard bnode for %s", bucket); return ret; } sd_delete_vdi(onode_name); sd_delete_vdi(alloc_name); return SD_RES_SUCCESS; } typedef void (*object_iter_cb)(const char *object, void *opaque); struct object_iterater_arg { void *opaque; object_iter_cb cb; uint32_t count; }; static void object_iterater(void *data, enum btree_node_type type, void *arg) { struct sd_extent *ext; struct object_iterater_arg *oiarg = arg; struct kv_onode *onode = NULL; uint64_t oid; int ret; if (type == BTREE_EXT) { ext = (struct sd_extent *)data; if (!ext->vdi_id) goto out; onode = xmalloc(SD_DATA_OBJ_SIZE); oid = vid_to_data_oid(ext->vdi_id, ext->idx); ret = sd_read_object(oid, (char *)onode, SD_DATA_OBJ_SIZE, 0); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read data object %"PRIx64, oid); goto out; } if (onode->name[0] == '\0') goto out; if (oiarg->cb) oiarg->cb(onode->name, oiarg->opaque); oiarg->count++; } out: free(onode); } static int bucket_iterate_object(uint32_t bucket_vid, object_iter_cb cb, void *opaque) { struct object_iterater_arg arg = {opaque, cb, 0}; struct sd_inode *inode; int ret; inode = xmalloc(sizeof(*inode)); ret = sd_read_object(vid_to_vdi_oid(bucket_vid), (char *)inode, sizeof(struct sd_inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read inode %s", sd_strerror(ret)); goto out; } traverse_btree(sheep_bnode_reader, inode, object_iterater, &arg); out: free(inode); return ret; } int kv_create_bucket(const char *account, const char *bucket) { uint32_t account_vid, vid; char vdi_name[SD_MAX_VDI_LEN]; int ret; ret = sd_lookup_vdi(account, &account_vid); if (ret != SD_RES_SUCCESS) { sd_err("Failed to find account %s", account); return ret; } sys->cdrv->lock(account_vid); snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket); ret = sd_lookup_vdi(vdi_name, &vid); if (ret == SD_RES_SUCCESS) { sd_err("bucket %s already exists", bucket); ret = SD_RES_VDI_EXIST; goto out; } if (ret != SD_RES_NO_VDI) goto out; ret =
bucket_create(account, account_vid, bucket); out: sys->cdrv->unlock(account_vid); return ret; } int kv_read_bucket(struct http_request *req, const char *account, const char *bucket) { uint32_t account_vid; struct kv_bnode bnode; int ret; ret = sd_lookup_vdi(account, &account_vid); if (ret != SD_RES_SUCCESS) { sd_err("Failed to find account %s", account); return ret; } ret = bnode_lookup(&bnode, account_vid, bucket); if (ret != SD_RES_SUCCESS) goto out; http_request_writef(req, "X-Container-Object-Count: %"PRIu64"\n", bnode.object_count); http_request_writef(req, "X-Container-Bytes-Used: %"PRIu64"\n", bnode.bytes_used); out: return ret; } int kv_update_bucket(const char *account, const char *bucket) { /* TODO: update metadata of the bucket */ return -1; } /* Return SD_RES_NO_VDI if the bucket does not exist */ int kv_delete_bucket(const char *account, const char *bucket) { uint32_t account_vid, vid; char vdi_name[SD_MAX_VDI_LEN]; int ret; ret = sd_lookup_vdi(account, &account_vid); if (ret != SD_RES_SUCCESS) { sd_err("Failed to find account %s", account); return ret; } sys->cdrv->lock(account_vid); snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket); ret = sd_lookup_vdi(vdi_name, &vid); if (ret != SD_RES_SUCCESS) goto out; ret = bucket_delete(account, account_vid, bucket); out: sys->cdrv->unlock(account_vid); return ret; } int kv_iterate_bucket(const char *account, bucket_iter_cb cb, void *opaque) { struct sd_inode account_inode; struct bucket_iterater_arg arg = {opaque, cb, 0, 0, 0}; uint32_t account_vid; uint64_t oid; int ret; ret = sd_lookup_vdi(account, &account_vid); if (ret != SD_RES_SUCCESS) { sd_err("Failed to find account %s", account); return ret; } oid = vid_to_vdi_oid(account_vid); sys->cdrv->lock(account_vid); ret = sd_read_object(oid, (char *)&account_inode, sizeof(struct sd_inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read account inode header %s", account); goto out; } traverse_btree(sheep_bnode_reader, &account_inode, bucket_iterater, &arg); out: sys->cdrv->unlock(account_vid); return ret; } /* Object operations */ #define KV_ONODE_INLINE_SIZE (SD_DATA_OBJ_SIZE - BLOCK_SIZE) static int vdi_read_write(uint32_t vid, char *data, size_t length, off_t offset, bool is_read) { struct sd_req hdr; uint32_t idx = offset / SD_DATA_OBJ_SIZE; uint64_t done = 0; struct request_iocb *iocb; int ret; iocb = local_req_init(); if (!iocb) return SD_RES_SYSTEM_ERROR; offset %= SD_DATA_OBJ_SIZE; while (done < length) { size_t len = min(length - done, SD_DATA_OBJ_SIZE - offset); if (is_read) { sd_init_req(&hdr, SD_OP_READ_OBJ); } else { sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); hdr.flags = SD_FLAG_CMD_WRITE; } hdr.data_length = len; hdr.obj.oid = vid_to_data_oid(vid, idx); hdr.obj.offset = offset; ret = exec_local_req_async(&hdr, data, iocb); if (ret != SD_RES_SUCCESS) sd_err("failed to read/write object %" PRIx64 ", %s", hdr.obj.oid, sd_strerror(ret)); offset += len; if (offset == SD_DATA_OBJ_SIZE) { offset = 0; idx++; } done += len; data += len; } return local_req_wait(iocb); } #define MAX_RW_BUFFER (SD_DATA_OBJ_SIZE * 25) /* No rationale yet */ static int onode_populate_extents(struct kv_onode *onode, struct http_request *req) { ssize_t size; uint64_t start = 0, count, done = 0, total, offset; int ret; char *data_buf = NULL; uint32_t data_vid = onode->data_vid; uint64_t write_buffer_size = MIN(MAX_RW_BUFFER, req->data_length); count = DIV_ROUND_UP(req->data_length, SD_DATA_OBJ_SIZE); sys->cdrv->lock(data_vid); ret = oalloc_new_prepare(data_vid, &start, count);
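/*
 * Note: oalloc_new_prepare() above only reserves the range
 * [start, start + count) in the allocator's free list; the reserved slots
 * become visible in the inode bitmap only after the data has been written
 * and oalloc_new_finish() is called below.
 */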
sys->cdrv->unlock(data_vid); if (ret != SD_RES_SUCCESS) { sd_err("oalloc_new_prepare failed for %s, %s", onode->name, sd_strerror(ret)); goto out; } data_buf = xmalloc(write_buffer_size); offset = start * SD_DATA_OBJ_SIZE; total = req->data_length; while (done < total) { size = http_request_read(req, data_buf, write_buffer_size); ret = vdi_read_write(data_vid, data_buf, size, offset, false); if (ret != SD_RES_SUCCESS) { sd_err("Failed to write data object for %s, %s", onode->name, sd_strerror(ret)); goto out; } done += size; offset += size; } sys->cdrv->lock(data_vid); ret = oalloc_new_finish(data_vid, start, count); sys->cdrv->unlock(data_vid); if (ret != SD_RES_SUCCESS) { sd_err("oalloc_new_finish failed for %s, %s", onode->name, sd_strerror(ret)); goto out; } onode->o_extent[0].start = start; onode->o_extent[0].count = count; onode->nr_extent = 1; out: free(data_buf); return ret; } static uint64_t get_seconds(void) { struct timeval tv; uint64_t seconds; gettimeofday(&tv, NULL); seconds = (uint64_t)tv.tv_sec; return seconds; } static int onode_populate_data(struct kv_onode *onode, struct http_request *req) { ssize_t size; int ret = SD_RES_SUCCESS; if (req->data_length <= KV_ONODE_INLINE_SIZE) { onode->inlined = 1; size = http_request_read(req, onode->data, sizeof(onode->data)); if (size < 0 || req->data_length != size) { sd_err("Failed to read from web server for %s", onode->name); ret = SD_RES_SYSTEM_ERROR; goto out; } } else { ret = onode_populate_extents(onode, req); if (ret != SD_RES_SUCCESS) goto out; } onode->mtime = get_seconds(); onode->size = req->data_length; out: return ret; } static int onode_do_create(struct kv_onode *onode, struct sd_inode *inode, uint32_t idx, bool create) { uint32_t vid = inode->vdi_id; uint64_t oid = vid_to_data_oid(vid, idx), len; int ret; onode->oid = oid; if (onode->inlined) len = onode->size; else len = sizeof(struct onode_extent) * onode->nr_extent; ret = sd_write_object(oid, (char *)onode, BLOCK_SIZE + len, 0, create); if (ret != SD_RES_SUCCESS) { sd_err("failed to create object, %" PRIx64, oid); goto out; } if (!create) goto out; INODE_SET_VID(inode, idx, vid); ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx, vid, vid, 0, false, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update inode, %" PRIx64, vid_to_vdi_oid(vid)); goto out; } out: return ret; } static int onode_create(struct kv_onode *onode, uint32_t bucket_vid) { struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); uint32_t tmp_vid, idx; uint64_t hval, i; int ret; bool create = true; sys->cdrv->lock(bucket_vid); ret = sd_read_object(vid_to_vdi_oid(bucket_vid), (char *)inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read %" PRIx32 " %s", bucket_vid, sd_strerror(ret)); goto out; } hval = sd_hash(onode->name, strlen(onode->name)); for (i = 0; i < MAX_DATA_OBJS; i++) { idx = (hval + i) % MAX_DATA_OBJS; tmp_vid = INODE_GET_VID(inode, idx); if (tmp_vid) { uint64_t oid = vid_to_data_oid(bucket_vid, idx); char name[SD_MAX_OBJECT_NAME] = { }; ret = sd_read_object(oid, name, sizeof(name), 0); if (ret != SD_RES_SUCCESS) goto out; if (name[0] == 0) { create = false; goto create; } } else break; } if (i == MAX_DATA_OBJS) { ret = SD_RES_NO_SPACE; goto out; } create: ret = onode_do_create(onode, inode, idx, create); out: free(inode); sys->cdrv->unlock(bucket_vid); return ret; } static int onode_free_data(struct kv_onode *onode) { uint32_t data_vid = onode->data_vid; int ret; sys->cdrv->lock(data_vid); ret = oalloc_free(data_vid, onode->o_extent[0].start, 
onode->o_extent[0].count); sys->cdrv->unlock(data_vid); if (ret != SD_RES_SUCCESS) sd_err("failed to free %s", onode->name); return ret; } static int onode_read_extents(struct kv_onode *onode, struct http_request *req) { struct onode_extent *ext; uint64_t size, total, total_size, offset, done = 0, i, ext_len; uint64_t off = req->offset, len = req->data_length; int ret; char *data_buf = NULL; uint64_t read_buffer_size = MIN(MAX_RW_BUFFER, onode->size); data_buf = xmalloc(read_buffer_size); total_size = len; for (i = 0; i < onode->nr_extent; i++) { ext = onode->o_extent + i; ext_len = ext->count * SD_DATA_OBJ_SIZE; if (off >= ext_len) { off -= ext_len; continue; } total = min(ext_len - off, total_size); offset = ext->start * SD_DATA_OBJ_SIZE + off; off = 0; done = 0; while (done < total) { size = MIN(total - done, read_buffer_size); ret = vdi_read_write(onode->data_vid, data_buf, size, offset, true); sd_debug("vdi_read_write size: %"PRIx64", offset: %" PRIx64, size, offset); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read for vid %"PRIx32, onode->data_vid); goto out; } http_request_write(req, data_buf, size); done += size; offset += size; total_size -= size; } } out: free(data_buf); return ret; } /* * Check if object by name exists in a bucket and init 'onode' if it exists. * * Return SD_RES_SUCCESS if found, SD_RES_NO_OBJ if not found. * * We check adjacent objects one by one once we get a start index by hashing * name. Unallocated slot marks the end of the check window. * * For e.g, if we are going to check if fish in the following bucket, assume * fish hashes to 'sheep', so we compare the name one by one from 'sheep' to * 'fish'. '\0' indicates that object was deleted before checking. * * [ sheep, dog, wolve, '\0', fish, {unallocated}, tiger, ] */ static int onode_lookup(struct kv_onode *onode, uint32_t ovid, const char *name) { struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); uint32_t tmp_vid, idx; uint64_t hval, i; int ret; sys->cdrv->lock(ovid); ret = sd_read_object(vid_to_vdi_oid(ovid), (char *)inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read %" PRIx32 " %s", ovid, sd_strerror(ret)); goto out; } hval = sd_hash(name, strlen(name)); for (i = 0; i < MAX_DATA_OBJS; i++) { idx = (hval + i) % MAX_DATA_OBJS; tmp_vid = INODE_GET_VID(inode, idx); if (tmp_vid) { uint64_t oid = vid_to_data_oid(ovid, idx); ret = sd_read_object(oid, (char *)onode, sizeof(*onode), 0); if (ret != SD_RES_SUCCESS) goto out; if (strcmp(onode->name, name) == 0) break; } else { ret = SD_RES_NO_OBJ; break; } } if (i == MAX_DATA_OBJS) { ret = SD_RES_NO_OBJ; goto out; } out: free(inode); sys->cdrv->unlock(ovid); return ret; } static int onode_read_data(struct kv_onode *onode, struct http_request *req) { int ret; uint64_t off = 0, len = onode->size; if (req->offset || req->data_length) { off = req->offset; len = req->data_length; if ((off + len - 1) > onode->size) { if (onode->size > off) len = onode->size - off; else len = 0; } } req->data_length = len; if (!len) return SD_RES_INVALID_PARMS; http_response_header(req, OK); if (!onode->inlined) return onode_read_extents(onode, req); ret = http_request_write(req, onode->data + off, len); if (ret != len) return SD_RES_SYSTEM_ERROR; return SD_RES_SUCCESS; } /* * We free the data and meta data in following sequence: * * 1. zero onode * - we can't discard it because onode_lookup() need it to find if some object * exists or not by checking adjacent objects * 2. 
discard data * * If (1) succeeds, we consider it a successful deletion of the user object. * If (2) fails, data objects become orphan(s). * * XXX: GC the orphans */ static int onode_delete(struct kv_onode *onode) { char name[SD_MAX_OBJECT_NAME] = {}; int ret; ret = sd_write_object(onode->oid, name, sizeof(name), 0, 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to zero onode for %s", onode->name); return ret; } ret = onode_free_data(onode); if (ret != SD_RES_SUCCESS) sd_err("failed to free data for %s", onode->name); return SD_RES_SUCCESS; } /* * user object name -> struct kv_onode -> sheepdog objects -> user data * * onode is an index node that maps a name to the sheepdog objects which hold * the user data, similar to a UNIX inode. We use simple hashing for the * [name, onode] mapping. */ int kv_create_object(struct http_request *req, const char *account, const char *bucket, const char *name) { char vdi_name[SD_MAX_VDI_LEN]; struct kv_onode *onode; uint32_t bucket_vid, data_vid; int ret; snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket); ret = sd_lookup_vdi(vdi_name, &bucket_vid); if (ret != SD_RES_SUCCESS) return ret; onode = xzalloc(sizeof(*onode)); ret = onode_lookup(onode, bucket_vid, name); if (ret == SD_RES_SUCCESS) { /* For overwrite, we delete old object and then create */ ret = kv_delete_object(account, bucket, name); if (ret != SD_RES_SUCCESS) { sd_err("Failed to delete existing object %s", name); goto out; } } else if (ret != SD_RES_NO_OBJ) goto out; snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account, bucket); ret = sd_lookup_vdi(vdi_name, &data_vid); if (ret != SD_RES_SUCCESS) goto out; memset(onode, 0, sizeof(*onode)); pstrcpy(onode->name, sizeof(onode->name), name); onode->data_vid = data_vid; ret = onode_populate_data(onode, req); if (ret != SD_RES_SUCCESS) { sd_err("failed to write data for %s", name); goto out; } ret = onode_create(onode, bucket_vid); if (ret != SD_RES_SUCCESS) { sd_err("failed to create onode for %s", name); onode_free_data(onode); goto out; } ret = bnode_update(account, bucket, req->data_length, true); if (ret != SD_RES_SUCCESS) { sd_err("failed to update bucket for %s", name); onode_delete(onode); goto out; } out: free(onode); return ret; } int kv_read_object(struct http_request *req, const char *account, const char *bucket, const char *name) { struct kv_onode *onode = NULL; char vdi_name[SD_MAX_VDI_LEN]; uint32_t bucket_vid; int ret; snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket); ret = sd_lookup_vdi(vdi_name, &bucket_vid); if (ret != SD_RES_SUCCESS) return ret; onode = xzalloc(sizeof(*onode)); ret = onode_lookup(onode, bucket_vid, name); if (ret != SD_RES_SUCCESS) goto out; ret = onode_read_data(onode, req); if (ret != SD_RES_SUCCESS) sd_err("failed to read data for %s", name); out: free(onode); return ret; } int kv_delete_object(const char *account, const char *bucket, const char *name) { char vdi_name[SD_MAX_VDI_LEN]; uint32_t bucket_vid; struct kv_onode *onode = NULL; int ret; snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket); ret = sd_lookup_vdi(vdi_name, &bucket_vid); if (ret != SD_RES_SUCCESS) return ret; onode = xzalloc(sizeof(*onode)); ret = onode_lookup(onode, bucket_vid, name); if (ret != SD_RES_SUCCESS) goto out; ret = onode_delete(onode); if (ret != SD_RES_SUCCESS) { sd_err("failed to delete onode for %s", name); goto out; } ret = bnode_update(account, bucket, onode->size, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update bnode for %s", name); goto out; } out: free(onode); return ret; } int
kv_iterate_object(const char *account, const char *bucket, object_iter_cb cb, void *opaque) { char vdi_name[SD_MAX_VDI_LEN]; uint32_t bucket_vid; int ret; snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket); ret = sd_lookup_vdi(vdi_name, &bucket_vid); if (ret != SD_RES_SUCCESS) return ret; sys->cdrv->lock(bucket_vid); ret = bucket_iterate_object(bucket_vid, cb, opaque); sys->cdrv->unlock(bucket_vid); return ret; } static char *http_time(uint64_t time_sec) { static __thread char time_str[128]; strftime(time_str, sizeof(time_str), "%a, %d %b %Y %H:%M:%S GMT", gmtime((time_t *)&time_sec)); return time_str; } int kv_read_object_meta(struct http_request *req, const char *account, const char *bucket, const char *name) { struct kv_onode *onode = NULL; char vdi_name[SD_MAX_VDI_LEN]; uint32_t bucket_vid; int ret; snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket); ret = sd_lookup_vdi(vdi_name, &bucket_vid); if (ret != SD_RES_SUCCESS) return ret; onode = xzalloc(sizeof(*onode)); ret = onode_lookup(onode, bucket_vid, name); if (ret != SD_RES_SUCCESS) goto out; req->data_length = onode->size; http_request_writef(req, "Last-Modified: %s\n", http_time(onode->mtime)); out: free(onode); return ret; } sheepdog-0.8.3/sheep/http/oalloc.c000066400000000000000000000176651237656255000170520ustar00rootroot00000000000000/* * Copyright (C) 2013 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" #include "http.h" /* * Meta Object tracks the free information of data vdi for the object allocation * in a free list. Free list is a redundant structure for bitmap for faster * allocation. * * +-------------------------------+ * | | * | sorted list v------v * +--------------------------------+-----------------------+ +--------+ * | Header | fd1 | fd2 | ... | fdN | .... object data .... | <-- | bitmap | * +--------------------------------+-----------------------+ +--------- * |<-- 4M -->| * * Best-fit algorithm for allocation and merge and sort the free list at * deallocation. One simple sorted list is effecient enough for extent based * invariable user object. 
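 *
 * A worked example (illustrative numbers): a fresh allocator starts with a
 * single descriptor {start = 1, count = MAX_DATA_OBJS - 1}. After
 * oalloc_new_prepare(vid, &s, 10) returns s == 1, the descriptor becomes
 * {start = 11, count = MAX_DATA_OBJS - 11}; a later oalloc_free(vid, 1, 10)
 * merges the range back, restoring {start = 1, count = MAX_DATA_OBJS - 1}.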
* * XXX: Add allocation group for scalability and solve the meta size limitation */ struct header { uint64_t used; uint64_t nr_free; }; struct free_desc { uint64_t start; uint64_t count; }; static inline uint32_t oalloc_meta_length(struct header *hd) { return sizeof(struct header) + sizeof(struct free_desc) * hd->nr_free; } #define HEADER_TO_FREE_DESC(hd) ((struct free_desc *) \ ((char *)hd + sizeof(struct header))) #define MAX_FREE_DESC ((SD_DATA_OBJ_SIZE - sizeof(struct header)) / \ sizeof(struct free_desc)) /* * Initialize the data vdi * * @vid: the vdi where the allocator resides */ int oalloc_init(uint32_t vid) { struct strbuf buf = STRBUF_INIT; struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); struct header hd = { .nr_free = 1, }; struct free_desc fd = { .start = 1, /* Use first object as the meta object */ .count = MAX_DATA_OBJS - 1, }; int ret; strbuf_add(&buf, &hd, sizeof(hd)); strbuf_add(&buf, &fd, sizeof(fd)); ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read inode, %" PRIx32", %s", vid, sd_strerror(ret)); goto out; } ret = sd_write_object(vid_to_data_oid(vid, 0), buf.buf, buf.len, 0, true); if (ret != SD_RES_SUCCESS) { sd_err("failed to create meta object for %" PRIx32", %s", vid, sd_strerror(ret)); goto out; } INODE_SET_VID(inode, 0, vid); ret = sd_inode_write_vid(sheep_bnode_writer, inode, 0, vid, vid, 0, false, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update inode, %" PRIx32", %s", vid, sd_strerror(ret)); goto out; } out: strbuf_release(&buf); free(inode); return ret; } /* * Allocate the objects and upate the free list. * * Callers are expected to call oalloc_new_finish() to update the inode bitmap * after filling up the data. * * @vid: the vdi where the allocator resides * @start: start index of the objects to allocate * @count: number of the objects to allocate */ int oalloc_new_prepare(uint32_t vid, uint64_t *start, uint64_t count) { char *meta = xvalloc(SD_DATA_OBJ_SIZE); struct header *hd; struct free_desc *fd; uint64_t oid = vid_to_data_oid(vid, 0), i; int ret; ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read meta %" PRIx64 ", %s", oid, sd_strerror(ret)); goto out; } hd = (struct header *)meta; fd = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1; sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free); for (i = 0; i < hd->nr_free; i++, fd--) { sd_debug("start %"PRIu64", count %"PRIu64, fd->start, fd->count); if (fd->count > count) break; } if (i == hd->nr_free) { ret = SD_RES_NO_SPACE; goto out; } *start = fd->start; fd->start += count; fd->count -= count; hd->used += count; /* Update the meta object */ ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false); if (ret != SD_RES_SUCCESS) sd_err("failed to update meta %"PRIx64 ", %s", oid, sd_strerror(ret)); out: free(meta); return ret; } /* * Update the inode map of the vid * * @vid: the vdi where the allocator resides * @start: start index of the objects to update * @count: number of the objects to update */ int oalloc_new_finish(uint32_t vid, uint64_t start, uint64_t count) { struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); int ret; ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode, sizeof(*inode), 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read inode, %" PRIx64 ", %s", vid_to_vdi_oid(vid), sd_strerror(ret)); goto out; } for (uint64_t i = 0; i < count; i++) INODE_SET_VID(inode, start + i, vid); ret = 
sd_inode_write(sheep_bnode_writer, inode, 0, false, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update inode, %" PRIx64", %s", vid_to_vdi_oid(vid), sd_strerror(ret)); goto out; } out: free(inode); return ret; } static int free_desc_cmp(struct free_desc *a, struct free_desc *b) { return -intcmp(a->start, b->start); } static inline int update_and_merge_free_desc(char *meta, uint64_t start, uint64_t count, uint32_t vid) { struct header *hd = (struct header *)meta; struct free_desc *tail, *fd = HEADER_TO_FREE_DESC(hd); uint64_t i, j; /* Try our best to merge it in place, or append it to tail */ for (i = 0; i < hd->nr_free; i++) { if (start + count == fd->start) { fd->start = start; fd->count += count; break; } else if(fd->start + fd->count == start) { fd->count +=count; break; } fd++; } if (i == hd->nr_free) { if (hd->nr_free >= MAX_FREE_DESC) return SD_RES_NO_SPACE; tail = (struct free_desc *)(meta + oalloc_meta_length(hd)); tail->start = start; tail->count = count; hd->nr_free++; } hd->used -= count; xqsort(HEADER_TO_FREE_DESC(hd), hd->nr_free, free_desc_cmp); /* Merge as hard as we can */ j = hd->nr_free - 1; tail = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1; for (i = 0; i < j; i++, tail--) { struct free_desc *front = tail - 1; sd_debug("start %"PRIu64", count %"PRIu64, tail->start, tail->count); if (tail->start + tail->count > front->start) sd_emerg("bad free descriptor found at %"PRIx32, vid); if (tail->start + tail->count == front->start) { front->start = tail->start; front->count += tail->count; memmove(tail, tail + 1, sizeof(*tail) * i); hd->nr_free--; } } return SD_RES_SUCCESS; } /* * Discard the allocted objects and update the free list of the allocator * * Caller should check the return value since it might fail. * * @vid: the vdi where the allocator resides * @start: start index of the objects to free * @count: number of the objects to free */ int oalloc_free(uint32_t vid, uint64_t start, uint64_t count) { char *meta = xvalloc(SD_DATA_OBJ_SIZE); struct header *hd; uint64_t oid = vid_to_data_oid(vid, 0), i; int ret; ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0); if (ret != SD_RES_SUCCESS) { sd_err("failed to read meta %" PRIx64 ", %s", oid, sd_strerror(ret)); goto out; } ret = update_and_merge_free_desc(meta, start, count, vid); if (ret != SD_RES_SUCCESS) goto out; /* XXX use aio to speed up discard of objects */ for (i = 0; i < count; i++) { struct sd_req hdr; sd_init_req(&hdr, SD_OP_DISCARD_OBJ); hdr.obj.oid = vid_to_data_oid(vid, start + i); ret = exec_local_req(&hdr, NULL); if (ret != SD_RES_SUCCESS) goto out; } hd = (struct header *)meta; ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false); if (ret != SD_RES_SUCCESS) { sd_err("failed to update meta %"PRIx64 ", %s", oid, sd_strerror(ret)); goto out; } sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free); out: free(meta); return ret; } sheepdog-0.8.3/sheep/http/s3.c000066400000000000000000000122321237656255000161070ustar00rootroot00000000000000/* * Copyright (C) 2013 MORITA Kazutaka * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "http.h" #define MAX_BUCKET_LISTING 1000 static void s3_write_err_response(struct http_request *req, const char *code, const char *desc) { http_request_writef(req, "\r\n" "\r\n" "%s\r\n%s\r\n" "\r\n", code, desc); } /* Operations on the Service */ static void s3_get_service_cb(const char *bucket, void *opaque) { } static void s3_get_service(struct http_request *req) { bool print_header = true; kv_iterate_bucket("s3", s3_get_service_cb, &print_header); http_request_writes(req, "\r\n"); } /* Operations on Buckets */ static void s3_head_bucket(struct http_request *req, const char *bucket) { http_response_header(req, NOT_IMPLEMENTED); } static void s3_get_bucket_cb(const char *object, void *opaque) { } static void s3_get_bucket(struct http_request *req, const char *bucket) { bool print_header = true; kv_iterate_object("s3", bucket, s3_get_bucket_cb, &print_header); switch (req->status) { case OK: http_request_writes(req, "\r\n"); break; case NOT_FOUND: s3_write_err_response(req, "NoSuchBucket", "The specified bucket does not exist"); break; default: break; } } static void s3_put_bucket(struct http_request *req, const char *bucket) { kv_create_bucket("s3", bucket); if (req->status == ACCEPTED) s3_write_err_response(req, "BucketAlreadyExists", "The requested bucket name is not available"); } static void s3_post_bucket(struct http_request *req, const char *bucket) { http_response_header(req, NOT_IMPLEMENTED); } static void s3_delete_bucket(struct http_request *req, const char *bucket) { kv_delete_bucket("s3", bucket); switch (req->status) { case NOT_FOUND: s3_write_err_response(req, "NoSuchBucket", "The specified bucket does not exist"); break; case CONFLICT: s3_write_err_response(req, "BucketNotEmpty", "The bucket you tried to delete is not empty"); break; default: break; } } /* Operations on Objects */ static void s3_head_object(struct http_request *req, const char *bucket, const char *object) { http_response_header(req, NOT_IMPLEMENTED); } static void s3_get_object(struct http_request *req, const char *bucket, const char *object) { kv_read_object(req, "s3", bucket, object); if (req->status == NOT_FOUND) s3_write_err_response(req, "NoSuchKey", "The resource you requested does not exist"); } static void s3_put_object(struct http_request *req, const char *bucket, const char *object) { kv_create_object(req, "s3", bucket, object); if (req->status == NOT_FOUND) s3_write_err_response(req, "NoSuchBucket", "The specified bucket does not exist"); } static void s3_post_object(struct http_request *req, const char *bucket, const char *object) { http_response_header(req, NOT_IMPLEMENTED); } static void s3_delete_object(struct http_request *req, const char *bucket, const char *object) { kv_delete_object("s3", bucket, object); if (req->status == NOT_FOUND) s3_write_err_response(req, "NoSuchKey", "The resource you requested does not exist"); } /* S3 driver interfaces */ static int s3_init(const char *option) { return 0; } static void s3_handle_request(struct http_request *req, void (*s_handler)(struct http_request *req), void (*b_handler)(struct http_request *req, const char *bucket), void (*o_handler)(struct http_request *req, const char *bucket, const char *object)) { char *args[2] = {}; char *bucket, *object; split_path(req->uri, ARRAY_SIZE(args), args); bucket = args[0]; object = args[1]; sd_info("%s", str_http_req(req)); if (bucket == NULL) { if (s_handler) { sd_info("service operation"); s_handler(req); } } else if (object == NULL) { sd_info("bucket operation, %s", bucket); b_handler(req, 
bucket); } else { sd_info("object operation, %s, %s", bucket, object); o_handler(req, bucket, object); } sd_info("%s", str_http_req(req)); free(bucket); free(object); } static void s3_head(struct http_request *req) { s3_handle_request(req, NULL, s3_head_bucket, s3_head_object); } static void s3_get(struct http_request *req) { s3_handle_request(req, s3_get_service, s3_get_bucket, s3_get_object); } static void s3_put(struct http_request *req) { s3_handle_request(req, NULL, s3_put_bucket, s3_put_object); } static void s3_post(struct http_request *req) { s3_handle_request(req, NULL, s3_post_bucket, s3_post_object); } static void s3_delete(struct http_request *req) { s3_handle_request(req, NULL, s3_delete_bucket, s3_delete_object); } static struct http_driver hdrv_s3 = { .name = "s3", .init = s3_init, .head = s3_head, .get = s3_get, .put = s3_put, .post = s3_post, .delete = s3_delete, }; hdrv_register(hdrv_s3); sheepdog-0.8.3/sheep/http/swift.c000066400000000000000000000213441237656255000167220ustar00rootroot00000000000000/* * Copyright (C) 2013 MORITA Kazutaka * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "strbuf.h" #include "http.h" /* Operations on Accounts */ static void swift_head_account(struct http_request *req, const char *account) { int ret; ret = kv_read_account_meta(req, account); if (ret == SD_RES_SUCCESS) http_response_header(req, NO_CONTENT); else http_response_header(req, UNAUTHORIZED); } static void swift_get_account_cb(const char *bucket, void *opaque) { struct strbuf *buf = (struct strbuf *)opaque; strbuf_addf(buf, "%s\n", bucket); } static void swift_get_account(struct http_request *req, const char *account) { struct strbuf buf = STRBUF_INIT; int ret; ret = kv_iterate_bucket(account, swift_get_account_cb, &buf); switch (ret) { case SD_RES_SUCCESS: req->data_length = buf.len; http_response_header(req, OK); http_request_write(req, buf.buf, buf.len); break; case SD_RES_NO_VDI: http_response_header(req, NOT_FOUND); break; default: http_response_header(req, INTERNAL_SERVER_ERROR); break; } strbuf_release(&buf); } static void swift_put_account(struct http_request *req, const char *account) { int ret; ret = kv_create_account(account); if (ret == SD_RES_SUCCESS) http_response_header(req, CREATED); else if (ret == SD_RES_VDI_EXIST) http_response_header(req, ACCEPTED); else http_response_header(req, INTERNAL_SERVER_ERROR); } static void swift_post_account(struct http_request *req, const char *account) { http_response_header(req, NOT_IMPLEMENTED); } static void swift_delete_account(struct http_request *req, const char *account) { int ret; ret = kv_delete_account(req, account); switch (ret) { case SD_RES_SUCCESS: http_response_header(req, NO_CONTENT); break; case SD_RES_NO_VDI: case SD_RES_NO_OBJ: http_response_header(req, NOT_FOUND); break; case SD_RES_VDI_NOT_EMPTY: http_response_header(req, CONFLICT); break; default: http_response_header(req, INTERNAL_SERVER_ERROR); break; } } /* Operations on Containers */ static void swift_head_container(struct http_request *req, const char *account, const char *container) { int ret; ret = kv_read_bucket(req, account, container); switch (ret) { case SD_RES_SUCCESS: http_response_header(req, NO_CONTENT); break; case SD_RES_NO_VDI: case SD_RES_NO_OBJ: http_response_header(req, NOT_FOUND); 
break; default: http_response_header(req, INTERNAL_SERVER_ERROR); break; } } static void swift_get_container_cb(const char *object, void *opaque) { struct strbuf *buf = (struct strbuf *)opaque; strbuf_addf(buf, "%s\n", object); } static void swift_get_container(struct http_request *req, const char *account, const char *container) { struct strbuf buf = STRBUF_INIT; int ret; ret = kv_iterate_object(account, container, swift_get_container_cb, &buf); switch (ret) { case SD_RES_SUCCESS: req->data_length = buf.len; http_response_header(req, OK); http_request_write(req, buf.buf, buf.len); break; case SD_RES_NO_VDI: http_response_header(req, NOT_FOUND); break; default: http_response_header(req, INTERNAL_SERVER_ERROR); break; } strbuf_release(&buf); } static void swift_put_container(struct http_request *req, const char *account, const char *container) { int ret; ret = kv_create_bucket(account, container); switch (ret) { case SD_RES_SUCCESS: http_response_header(req, CREATED); break; case SD_RES_VDI_EXIST: http_response_header(req, ACCEPTED); break; default: http_response_header(req, INTERNAL_SERVER_ERROR); break; } } static void swift_post_container(struct http_request *req, const char *account, const char *container) { http_response_header(req, NOT_IMPLEMENTED); } static void swift_delete_container(struct http_request *req, const char *account, const char *container) { int ret; ret = kv_delete_bucket(account, container); switch (ret) { case SD_RES_SUCCESS: http_response_header(req, NO_CONTENT); break; case SD_RES_NO_VDI: case SD_RES_NO_OBJ: http_response_header(req, NOT_FOUND); break; case SD_RES_VDI_NOT_EMPTY: http_response_header(req, CONFLICT); break; default: http_response_header(req, INTERNAL_SERVER_ERROR); break; } } /* Operations on Objects */ static void swift_head_object(struct http_request *req, const char *account, const char *container, const char *object) { int ret; ret = kv_read_object_meta(req, account, container, object); switch (ret) { case SD_RES_SUCCESS: http_response_header(req, OK); break; case SD_RES_NO_VDI: case SD_RES_NO_OBJ: http_response_header(req, NOT_FOUND); break; default: http_response_header(req, INTERNAL_SERVER_ERROR); break; } } static void swift_get_object(struct http_request *req, const char *account, const char *container, const char *object) { int ret; ret = kv_read_object(req, account, container, object); switch (ret) { case SD_RES_SUCCESS: break; case SD_RES_NO_VDI: case SD_RES_NO_OBJ: http_response_header(req, NOT_FOUND); break; case SD_RES_INVALID_PARMS: http_response_header(req, REQUEST_RANGE_NOT_SATISFIABLE); break; default: http_response_header(req, INTERNAL_SERVER_ERROR); break; } } static void swift_put_object(struct http_request *req, const char *account, const char *container, const char *object) { int ret; ret = kv_create_object(req, account, container, object); switch (ret) { case SD_RES_SUCCESS: http_response_header(req, CREATED); break; case SD_RES_NO_VDI: http_response_header(req, NOT_FOUND); break; case SD_RES_NO_SPACE: http_response_header(req, SERVICE_UNAVAILABLE); default: http_response_header(req, INTERNAL_SERVER_ERROR); break; } } static void swift_post_object(struct http_request *req, const char *account, const char *container, const char *object) { http_response_header(req, NOT_IMPLEMENTED); } static void swift_delete_object(struct http_request *req, const char *account, const char *container, const char *object) { int ret; ret = kv_delete_object(account, container, object); switch (ret) { case SD_RES_SUCCESS: http_response_header(req, 
NO_CONTENT); break; case SD_RES_NO_VDI: case SD_RES_NO_OBJ: http_response_header(req, NOT_FOUND); break; default: http_response_header(req, INTERNAL_SERVER_ERROR); break; } } /* Swift driver interfaces */ static int swift_init(const char *option) { return 0; } static void swift_handle_request(struct http_request *req, void (*a_handler)(struct http_request *req, const char *account), void (*c_handler)(struct http_request *req, const char *account, const char *container), void (*o_handler)(struct http_request *req, const char *account, const char *container, const char *object)) { char *args[4] = {}; char *version, *account, *container, *object; split_path(req->uri, ARRAY_SIZE(args), args); version = args[0]; account = args[1]; container = args[2]; object = args[3]; sd_info("%s", str_http_req(req)); if (account == NULL) { sd_info("invalid uri: %s", req->uri); http_response_header(req, NOT_FOUND); } else if (container == NULL) { sd_info("account operation, %s", account); a_handler(req, account); } else if (object == NULL) { sd_info("container operation, %s, %s", account, container); c_handler(req, account, container); } else { sd_info("object operation, %s, %s, %s", account, container, object); o_handler(req, account, container, object); } sd_info("%s", str_http_req(req)); free(version); free(account); free(container); free(object); } static void swift_head(struct http_request *req) { swift_handle_request(req, swift_head_account, swift_head_container, swift_head_object); } static void swift_get(struct http_request *req) { swift_handle_request(req, swift_get_account, swift_get_container, swift_get_object); } static void swift_put(struct http_request *req) { swift_handle_request(req, swift_put_account, swift_put_container, swift_put_object); } static void swift_post(struct http_request *req) { swift_handle_request(req, swift_post_account, swift_post_container, swift_post_object); } static void swift_delete(struct http_request *req) { swift_handle_request(req, swift_delete_account, swift_delete_container, swift_delete_object); } static struct http_driver hdrv_swift = { .name = "swift", .init = swift_init, .head = swift_head, .get = swift_get, .put = swift_put, .post = swift_post, .delete = swift_delete, }; hdrv_register(hdrv_swift); sheepdog-0.8.3/sheep/journal.c000066400000000000000000000236211237656255000162610ustar00rootroot00000000000000/* * Copyright (C) 2012 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" struct journal_file { int fd; off_t pos; int commit_fd; uatomic_bool in_commit; }; /* * CAUTION: This definition of struct journal_descriptor must be same * to the definition in tests/dynamorio/journaling/journaling.c. We * have to update the definition in the DR client definition if we * update the below definition because there's no technique for * keeping the consistency automatically. 
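 *
 * Layout arithmetic: the descriptor fields below add up to
 * 4 + 2 + 2 + 8 + 8 + 8 + 1 + 475 = 508 bytes (JOURNAL_DESC_SIZE), and the
 * 4-byte end marker (JOURNAL_MARKER_SIZE) rounds the metadata up to 512
 * bytes, exactly one sector, which keeps journal entries sector-aligned
 * for direct IO.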
*/ struct journal_descriptor { uint32_t magic; uint16_t flag; uint16_t reserved; uint64_t oid; uint64_t offset; uint64_t size; uint8_t create; uint8_t pad[475]; } __packed; /* JOURNAL_DESC + JOURNAL_MARKER must be 512 algined for DIO */ #define JOURNAL_DESC_MAGIC 0xfee1900d #define JOURNAL_DESC_SIZE 508 #define JOURNAL_MARKER_SIZE 4 /* Use marker to detect partial write */ #define JOURNAL_META_SIZE (JOURNAL_DESC_SIZE + JOURNAL_MARKER_SIZE) #define JOURNAL_END_MARKER 0xdeadbeef #define JF_STORE 0 #define JF_REMOVE_OBJ 2 static const char *jfile_name[2] = { "journal_file0", "journal_file1", }; static int jfile_fds[2]; static size_t jfile_size; static struct journal_file jfile; static struct sd_mutex jfile_lock = SD_MUTEX_INITIALIZER; static struct work_queue *commit_wq; static int create_journal_file(const char *root, const char *name) { int fd, flags = O_DSYNC | O_RDWR | O_TRUNC | O_CREAT | O_DIRECT; char path[PATH_MAX]; snprintf(path, sizeof(path), "%s/%s", root, name); fd = open(path, flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); if (fd < 0) { sd_err("open %s %m", name); return -1; } if (prealloc(fd, jfile_size) < 0) { sd_err("prealloc %s %m", name); return -1; } return fd; } /* We should have two valid FDs, otherwise something goes wrong */ static int get_old_new_jfile(const char *p, int *old, int *new) { int fd1, fd2; int flags = O_RDONLY; char path[PATH_MAX]; struct stat st1, st2; snprintf(path, sizeof(path), "%s/%s", p, jfile_name[0]); fd1 = open(path, flags); if (fd1 < 0) { if (errno == ENOENT) return 0; sd_err("open1 %m"); return -1; } snprintf(path, sizeof(path), "%s/%s", p, jfile_name[1]); fd2 = open(path, flags); if (fd2 < 0) { sd_err("open2 %m"); close(fd1); return -1; } if (fstat(fd1, &st1) < 0 || fstat(fd2, &st2) < 0) { sd_err("stat %m"); goto out; } if (st1.st_mtime < st2.st_mtime) { *old = fd1; *new = fd2; } else { *old = fd2; *new = fd1; } return 0; out: close(fd1); close(fd2); return -1; } static bool journal_entry_full_write(struct journal_descriptor *jd) { char *end = (char *)jd + round_up(jd->size, SECTOR_SIZE) + JOURNAL_META_SIZE; uint32_t marker = *(((uint32_t *)end) - 1); if (marker != JOURNAL_END_MARKER) return false; return true; } static int replay_journal_entry(struct journal_descriptor *jd) { char path[PATH_MAX]; ssize_t size; int fd, flags = O_WRONLY, ret = 0; void *buf = NULL; char *p = (char *)jd; snprintf(path, PATH_MAX, "%s/%016"PRIx64, md_get_object_dir(jd->oid), jd->oid); if (jd->flag == JF_REMOVE_OBJ) { sd_info("%s (remove)", path); unlink(path); return 0; } if (jd->flag != JF_STORE) panic("flag is not JF_STORE, the journaling file is broken." 
" please remove the journaling file and restart sheep daemon"); sd_info("%s, size %" PRIu64 ", off %" PRIu64 ", %d", path, jd->size, jd->offset, jd->create); if (jd->create) flags |= O_CREAT; fd = open(path, flags, sd_def_fmode); if (fd < 0) { sd_err("open %m"); return -1; } if (jd->create) { ret = prealloc(fd, get_objsize(jd->oid)); if (ret < 0) goto out; } buf = xmalloc(jd->size); p += JOURNAL_DESC_SIZE; memcpy(buf, p, jd->size); size = xpwrite(fd, buf, jd->size, jd->offset); if (size != jd->size) { sd_err("write %zd, size %" PRIu64 ", errno %m", size, jd->size); ret = -1; goto out; } out: free(buf); close(fd); return ret; } static int do_recover(int fd) { struct journal_descriptor *jd; void *map; char *p, *end; struct stat st; if (fstat(fd, &st) < 0) { sd_err("fstat %m"); return -1; } if (!st.st_size) { /* * An empty journal file can be produced when sheep crashes * between ftruncate() and prealloc() of commit_data(). * Such a file should be ignored simply. */ close(fd); return 0; } map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); close(fd); if (map == MAP_FAILED) { sd_err("%m"); return -1; } end = (char *)map + st.st_size; for (p = map; p < end;) { jd = (struct journal_descriptor *)p; if (jd->magic != JOURNAL_DESC_MAGIC) { /* Empty area */ p += SECTOR_SIZE; continue; } /* We skip partial write because it is not acked back to VM */ if (!journal_entry_full_write(jd)) goto skip; if (replay_journal_entry(jd) < 0) return -1; skip: p += JOURNAL_META_SIZE + round_up(jd->size, SECTOR_SIZE); } munmap(map, st.st_size); /* Do a final sync() to assure data is reached to the disk */ sync(); return 0; } /* * We recover the journal file in order of wall time in the corner case that * sheep crashes while in the middle of journal committing. For most of cases, * we actually only recover one jfile, the other would be empty. This process * is fast with buffered IO that only take several secends at most. */ static void check_recover_journal_file(const char *p) { int old = 0, new = 0; if (get_old_new_jfile(p, &old, &new) < 0) return; /* No journal file found */ if (old == 0) return; if (do_recover(old) < 0) panic("recoverying from journal file (old) failed"); if (do_recover(new) < 0) panic("recoverying from journal file (new) failed"); } int journal_file_init(const char *path, size_t size, bool skip) { int fd; if (!skip) check_recover_journal_file(path); jfile_size = size / 2; fd = create_journal_file(path, jfile_name[0]); if (fd < 0) return -1; jfile.fd = jfile_fds[0] = fd; fd = create_journal_file(path, jfile_name[1]); jfile_fds[1] = fd; commit_wq = create_ordered_work_queue("journal commit"); if (!commit_wq) { sd_err("error at creating a workqueue for journal data commit"); return -1; } return 0; } void clean_journal_file(const char *p) { int ret; char path[PATH_MAX]; sync(); snprintf(path, sizeof(path), "%s/%s", p, jfile_name[0]); ret = unlink(path); if (ret < 0) sd_err("unlink(%s): %m", path); snprintf(path, sizeof(path), "%s/%s", p, jfile_name[1]); ret = unlink(path); if (ret < 0) sd_err("unlink(%s): %m", path); } static inline bool jfile_enough_space(size_t size) { return (jfile.pos + size) < jfile_size; } static struct sd_mutex journal_commit_mutex = SD_MUTEX_INITIALIZER; /* * We rely on the kernel's page cache to cache data objects to 1) boost read * perfmance 2) simplify read path so that data commiting is simply a * sync() operation and We do it in a dedicated thread to avoid blocking * the writer by switch back and forth between two journal files. 
*/ static void journal_commit_data_work(struct work *work) { sync(); if (unlikely(xftruncate(jfile.commit_fd, 0) < 0)) panic("truncate %m"); if (unlikely(prealloc(jfile.commit_fd, jfile_size) < 0)) panic("prealloc %m"); sd_mutex_unlock(&journal_commit_mutex); } static void journal_commit_data_done(struct work *work) { free(work); } static void switch_journal_file(void) { int old = jfile.fd; struct work *w; if (sd_mutex_trylock(&journal_commit_mutex) == EBUSY) { sd_err("journal file is still committing, you might need" " to enlarge the jfile size"); sd_mutex_lock(&journal_commit_mutex); } if (old == jfile_fds[0]) jfile.fd = jfile_fds[1]; else jfile.fd = jfile_fds[0]; jfile.commit_fd = old; jfile.pos = 0; w = xzalloc(sizeof(*w)); w->fn = journal_commit_data_work; w->done = journal_commit_data_done; queue_work(commit_wq, w); } static int journal_file_write(struct journal_descriptor *jd, const char *buf) { uint32_t marker = JOURNAL_END_MARKER; int ret = SD_RES_SUCCESS; uint64_t size = jd->size; ssize_t written, rusize = round_up(size, SECTOR_SIZE), wsize = JOURNAL_META_SIZE + rusize; off_t woff; char *wbuffer, *p; sd_mutex_lock(&jfile_lock); if (!jfile_enough_space(wsize)) switch_journal_file(); woff = jfile.pos; jfile.pos += wsize; sd_mutex_unlock(&jfile_lock); p = wbuffer = xvalloc(wsize); memcpy(p, jd, JOURNAL_DESC_SIZE); p += JOURNAL_DESC_SIZE; memcpy(p, buf, size); p += size; if (size < rusize) { memset(p, 0, rusize - size); p += rusize - size; } memcpy(p, &marker, JOURNAL_MARKER_SIZE); /* * Concurrent writes to the same FD are okay because we don't have any * critical sections that need a lock inside the kernel write path, since * we a) bypass the page cache, b) don't modify the i_size of this inode. * * Feel free to correct me if I am wrong. */ written = xpwrite(jfile.fd, wbuffer, wsize, woff); if (unlikely(written != wsize)) { sd_err("failed, written %zd, len %zd", written, wsize); /* FIXME: teach the journal file to handle EIO gracefully */ ret = SD_RES_EIO; goto out; } out: free(wbuffer); return ret; } int journal_write_store(uint64_t oid, const char *buf, size_t size, off_t offset, bool create) { struct journal_descriptor jd = { .magic = JOURNAL_DESC_MAGIC, .flag = JF_STORE, .offset = offset, .size = size, .create = create, .oid = oid, }; return journal_file_write(&jd, buf); } int journal_remove_object(uint64_t oid) { struct journal_descriptor jd = { .magic = JOURNAL_DESC_MAGIC, .flag = JF_REMOVE_OBJ, .size = 0, .oid = oid, }; return journal_file_write(&jd, NULL); } static __attribute__((used)) void journal_c_build_bug_ons(void) { /* never called, only for checking BUILD_BUG_ON()s */ BUILD_BUG_ON(sizeof(struct journal_descriptor) != JOURNAL_DESC_SIZE); } sheepdog-0.8.3/sheep/md.c000066400000000000000000000422331237656255000152070ustar00rootroot00000000000000/* * Copyright (C) 2013 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" #define MD_VDISK_SIZE ((uint64_t)1*1024*1024*1024) /* 1G */ #define NONE_EXIST_PATH "/all/disks/are/broken/,ps/əʌo7/!"
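/*
 * Example (illustrative numbers): a disk with 100G of usable space is
 * represented by DIV_ROUND_UP(space, MD_VDISK_SIZE) == 100 vdisks. Each
 * vdisk's hash is derived from the disk path via sd_hash()/sd_hash_next(),
 * and an object lands on the vdisk with the smallest hash not less than
 * sd_hash_oid(oid) (see hval_to_vdisk() below), i.e. consistent hashing
 * across the multi-disk array.
 */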
struct disk { struct rb_node rb; char path[PATH_MAX]; uint64_t space; }; struct vdisk { struct rb_node rb; struct disk *disk; uint64_t hash; }; struct md { struct rb_root vroot; struct rb_root root; struct sd_rw_lock lock; uint64_t space; uint32_t nr_disks; }; static struct md md = { .vroot = RB_ROOT, .root = RB_ROOT, .lock = SD_RW_LOCK_INITIALIZER, }; static inline int nr_online_disks(void) { int nr; sd_read_lock(&md.lock); nr = md.nr_disks; sd_rw_unlock(&md.lock); return nr; } static inline int vdisk_number(const struct disk *disk) { return DIV_ROUND_UP(disk->space, MD_VDISK_SIZE); } static int disk_cmp(const struct disk *d1, const struct disk *d2) { return strcmp(d1->path, d2->path); } static int vdisk_cmp(const struct vdisk *d1, const struct vdisk *d2) { return intcmp(d1->hash, d2->hash); } static struct vdisk *vdisk_insert(struct vdisk *new) { return rb_insert(&md.vroot, new, rb, vdisk_cmp); } /* If v1_hash < hval <= v2_hash, then oid is resident in v2 */ static struct vdisk *hval_to_vdisk(uint64_t hval) { struct vdisk dummy = { .hash = hval }; return rb_nsearch(&md.vroot, &dummy, rb, vdisk_cmp); } static struct vdisk *oid_to_vdisk(uint64_t oid) { return hval_to_vdisk(sd_hash_oid(oid)); } static void create_vdisks(struct disk *disk) { uint64_t hval = sd_hash(disk->path, strlen(disk->path)); int nr = vdisk_number(disk); for (int i = 0; i < nr; i++) { struct vdisk *v = xmalloc(sizeof(*v)); hval = sd_hash_next(hval); v->hash = hval; v->disk = disk; if (unlikely(vdisk_insert(v))) panic("vdisk hash collison"); } } static inline void vdisk_free(struct vdisk *v) { rb_erase(&v->rb, &md.vroot); free(v); } static void remove_vdisks(const struct disk *disk) { uint64_t hval = sd_hash(disk->path, strlen(disk->path)); int nr = vdisk_number(disk); for (int i = 0; i < nr; i++) { struct vdisk *v; hval = sd_hash_next(hval); v = hval_to_vdisk(hval); assert(v->hash == hval); vdisk_free(v); } } static inline void trim_last_slash(char *path) { assert(path[0]); while (path[strlen(path) - 1] == '/') path[strlen(path) - 1] = '\0'; } static struct disk *path_to_disk(const char *path) { struct disk key = {}; pstrcpy(key.path, sizeof(key.path), path); trim_last_slash(key.path); return rb_search(&md.root, &key, rb, disk_cmp); } static int get_total_object_size(uint64_t oid, const char *wd, uint32_t epoch, uint8_t ec_index, struct vnode_info *vinfo, void *total) { uint64_t *t = total; struct stat s; char path[PATH_MAX]; snprintf(path, PATH_MAX, "%s/%016" PRIx64, wd, oid); if (stat(path, &s) == 0) *t += s.st_blocks * SECTOR_SIZE; else *t += get_store_objsize(oid); return SD_RES_SUCCESS; } static int64_t find_string_integer(const char *str, const char *delimiter) { char *pos = strstr(str, delimiter), *p; int64_t ret; ret = strtoll(pos + 1, &p, 10); if (ret == LLONG_MAX || p == pos + 1) { sd_err("%s strtoul failed, delimiter %s, %m", str, delimiter); return -1; } return ret; } /* If cleanup is true, temporary objects will be removed */ static int for_each_object_in_path(const char *path, int (*func)(uint64_t, const char *, uint32_t, uint8_t, struct vnode_info *, void *), bool cleanup, struct vnode_info *vinfo, void *arg) { DIR *dir; struct dirent *d; uint64_t oid; int ret = SD_RES_SUCCESS; dir = opendir(path); if (unlikely(!dir)) { sd_err("failed to open %s, %m", path); return SD_RES_EIO; } while ((d = readdir(dir))) { uint32_t epoch = 0; uint8_t ec_index = SD_MAX_COPIES; /* skip ".", ".." 
and ".stale" */ if (unlikely(!strncmp(d->d_name, ".", 1))) continue; sd_debug("%s, %s", path, d->d_name); oid = strtoull(d->d_name, NULL, 16); if (oid == 0 || oid == ULLONG_MAX) continue; /* don't call callback against temporary objects */ if (is_tmp_dentry(d->d_name)) { if (cleanup) { sd_debug("remove tmp object %s", d->d_name); unlink(d->d_name); } continue; } if (is_stale_dentry(d->d_name)) { epoch = find_string_integer(d->d_name, "."); if (epoch < 0) continue; } if (is_ec_dentry(d->d_name)) { ec_index = find_string_integer(d->d_name, "_"); if (ec_index < 0) continue; } ret = func(oid, path, epoch, ec_index, vinfo, arg); if (ret != SD_RES_SUCCESS) break; } closedir(dir); return ret; } static uint64_t get_path_free_size(const char *path, uint64_t *used) { struct statvfs fs; uint64_t size; if (statvfs(path, &fs) < 0) { sd_err("get disk %s space failed %m", path); return 0; } size = (int64_t)fs.f_frsize * fs.f_bavail; if (!used) goto out; if (for_each_object_in_path(path, get_total_object_size, false, NULL, used) != SD_RES_SUCCESS) return 0; out: return size; } /* * If path is broken during initilization or not support xattr return 0. We can * safely use 0 to represent failure case because 0 space path can be * considered as broken path. */ static uint64_t init_path_space(const char *path, bool purge) { uint64_t size; char stale[PATH_MAX]; if (!is_xattr_enabled(path)) { sd_warn("multi-disk support need xattr feature for path: %s", path); goto broken_path; } if (purge && purge_directory(path) < 0) sd_err("failed to purge %s", path); snprintf(stale, PATH_MAX, "%s/.stale", path); if (xmkdir(stale, sd_def_dmode) < 0) { sd_err("can't mkdir for %s, %m", stale); goto broken_path; } #define MDNAME "user.md.size" #define MDSIZE sizeof(uint64_t) if (getxattr(path, MDNAME, &size, MDSIZE) < 0) { if (errno == ENODATA) { goto create; } else { sd_err("%s, %m", path); goto broken_path; } } return size; create: size = get_path_free_size(path, NULL); if (!size) goto broken_path; if (setxattr(path, MDNAME, &size, MDSIZE, 0) < 0) { sd_err("%s, %m", path); goto broken_path; } return size; broken_path: return 0; } /* We don't need lock at init stage */ bool md_add_disk(const char *path, bool purge) { struct disk *new; if (path_to_disk(path)) { sd_err("duplicate path %s", path); return false; } if (xmkdir(path, sd_def_dmode) < 0) { sd_err("can't mkdir for %s, %m", path); return false; } new = xmalloc(sizeof(*new)); pstrcpy(new->path, PATH_MAX, path); trim_last_slash(new->path); new->space = init_path_space(new->path, purge); if (!new->space) { free(new); return false; } create_vdisks(new); rb_insert(&md.root, new, rb, disk_cmp); md.space += new->space; md.nr_disks++; sd_info("%s, vdisk nr %d, total disk %d", new->path, vdisk_number(new), md.nr_disks); return true; } static inline void md_remove_disk(struct disk *disk) { sd_info("%s from multi-disk array", disk->path); rb_erase(&disk->rb, &md.root); md.nr_disks--; remove_vdisks(disk); free(disk); } uint64_t md_init_space(void) { return md.space; } static const char *md_get_object_dir_nolock(uint64_t oid) { const struct vdisk *vd; if (unlikely(md.nr_disks == 0)) return NONE_EXIST_PATH; /* To generate EIO */ vd = oid_to_vdisk(oid); return vd->disk->path; } const char *md_get_object_dir(uint64_t oid) { const char *p; sd_read_lock(&md.lock); p = md_get_object_dir_nolock(oid); sd_rw_unlock(&md.lock); return p; } struct process_path_arg { const char *path; struct vnode_info *vinfo; int (*func)(uint64_t oid, const char *, uint32_t, uint8_t, struct vnode_info *, void 
*arg); bool cleanup; void *opaque; int result; }; static void *thread_process_path(void *arg) { int ret = SD_RES_SUCCESS; struct process_path_arg *parg = (struct process_path_arg *)arg; ret = for_each_object_in_path(parg->path, parg->func, parg->cleanup, parg->vinfo, parg->opaque); if (ret != SD_RES_SUCCESS) parg->result = ret; return arg; } main_fn int for_each_object_in_wd(int (*func)(uint64_t oid, const char *path, uint32_t epoch, uint8_t ec_index, struct vnode_info *vinfo, void *arg), bool cleanup, void *arg) { int ret = SD_RES_SUCCESS; const struct disk *disk; struct process_path_arg *thread_args, *path_arg; struct vnode_info *vinfo; void *ret_arg; pthread_t *thread_array; int nr_thread = 0, idx = 0; sd_read_lock(&md.lock); rb_for_each_entry(disk, &md.root, rb) { nr_thread++; } thread_args = xmalloc(nr_thread * sizeof(struct process_path_arg)); thread_array = xmalloc(nr_thread * sizeof(pthread_t)); vinfo = get_vnode_info(); rb_for_each_entry(disk, &md.root, rb) { thread_args[idx].path = disk->path; thread_args[idx].vinfo = vinfo; thread_args[idx].func = func; thread_args[idx].cleanup = cleanup; thread_args[idx].opaque = arg; thread_args[idx].result = SD_RES_SUCCESS; ret = pthread_create(thread_array + idx, NULL, thread_process_path, (void *)(thread_args + idx)); if (ret) { /* * If we can't create enough threads to process * files, the data-consistent will be broken if * we continued. */ panic("Failed to create thread for path %s", disk->path); } idx++; } sd_debug("Create %d threads for all path", nr_thread); /* wait for all threads to exit */ for (idx = 0; idx < nr_thread; idx++) { ret = pthread_join(thread_array[idx], &ret_arg); if (ret) sd_err("Failed to join thread"); if (ret_arg) { path_arg = (struct process_path_arg *)ret_arg; if (path_arg->result != SD_RES_SUCCESS) sd_err("%s, %s", path_arg->path, sd_strerror(path_arg->result)); } } put_vnode_info(vinfo); sd_rw_unlock(&md.lock); free(thread_args); free(thread_array); return ret; } int for_each_object_in_stale(int (*func)(uint64_t oid, const char *path, uint32_t epoch, uint8_t, struct vnode_info *, void *arg), void *arg) { int ret = SD_RES_SUCCESS; char path[PATH_MAX]; const struct disk *disk; sd_read_lock(&md.lock); rb_for_each_entry(disk, &md.root, rb) { snprintf(path, sizeof(path), "%s/.stale", disk->path); ret = for_each_object_in_path(path, func, false, NULL, arg); if (ret != SD_RES_SUCCESS) break; } sd_rw_unlock(&md.lock); return ret; } int for_each_obj_path(int (*func)(const char *path)) { int ret = SD_RES_SUCCESS; const struct disk *disk; sd_read_lock(&md.lock); rb_for_each_entry(disk, &md.root, rb) { ret = func(disk->path); if (ret != SD_RES_SUCCESS) break; } sd_rw_unlock(&md.lock); return ret; } struct md_work { struct work work; char path[PATH_MAX]; }; static inline void kick_recover(void) { struct vnode_info *vinfo = get_vnode_info(); start_recovery(vinfo, vinfo, false); put_vnode_info(vinfo); } static void md_do_recover(struct work *work) { struct md_work *mw = container_of(work, struct md_work, work); struct disk *disk; int nr = 0; sd_write_lock(&md.lock); disk = path_to_disk(mw->path); if (!disk) /* Just ignore the duplicate EIO of the same path */ goto out; md_remove_disk(disk); nr = md.nr_disks; out: sd_rw_unlock(&md.lock); if (nr > 0) kick_recover(); free(mw); } int md_handle_eio(const char *fault_path) { struct md_work *mw; if (nr_online_disks() == 0) return SD_RES_EIO; mw = xzalloc(sizeof(*mw)); mw->work.done = md_do_recover; pstrcpy(mw->path, PATH_MAX, fault_path); queue_work(sys->md_wqueue, &mw->work); /* 
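The faulty disk is unplugged and recovery is kicked asynchronously by md_do_recover() in the main thread.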
Fool the requester to retry */ return SD_RES_NETWORK_ERROR; } static inline bool md_access(const char *path) { if (access(path, R_OK | W_OK) < 0) { if (unlikely(errno != ENOENT)) sd_err("failed to check %s, %m", path); return false; } return true; } static int get_old_new_path(uint64_t oid, uint32_t epoch, uint8_t ec_index, const char *path, char *old, char *new) { if (!epoch) { if (!is_erasure_oid(oid)) { snprintf(old, PATH_MAX, "%s/%016" PRIx64, path, oid); snprintf(new, PATH_MAX, "%s/%016" PRIx64, md_get_object_dir_nolock(oid), oid); } else { snprintf(old, PATH_MAX, "%s/%016" PRIx64"_%d", path, oid, ec_index); snprintf(new, PATH_MAX, "%s/%016" PRIx64"_%d", md_get_object_dir_nolock(oid), oid, ec_index); } } else { if (!is_erasure_oid(oid)) { snprintf(old, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32, path, oid, epoch); snprintf(new, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32, md_get_object_dir_nolock(oid), oid, epoch); } else { snprintf(old, PATH_MAX, "%s/.stale/%016"PRIx64"_%d.%"PRIu32, path, oid, ec_index, epoch); snprintf(new, PATH_MAX, "%s/.stale/%016"PRIx64"_%d.%"PRIu32, md_get_object_dir_nolock(oid), oid, ec_index ,epoch); } } if (!md_access(old)) return -1; return 0; } static int md_move_object(uint64_t oid, const char *old, const char *new) { struct strbuf buf = STRBUF_INIT; int fd, ret = -1; size_t sz = get_store_objsize(oid); fd = open(old, O_RDONLY); if (fd < 0) { sd_err("failed to open %s", old); goto out; } ret = strbuf_read(&buf, fd, sz); if (ret != sz) { sd_err("failed to read %s, size %zu, %d, %m", old, sz, ret); ret = -1; goto out_close; } if (atomic_create_and_write(new, buf.buf, buf.len, false) < 0) { sd_err("failed to create %s", new); ret = -1; goto out_close; } unlink(old); ret = 0; out_close: close(fd); out: strbuf_release(&buf); return ret; } static int md_check_and_move(uint64_t oid, uint32_t epoch, uint8_t ec_index, const char *path) { char old[PATH_MAX], new[PATH_MAX]; if (get_old_new_path(oid, epoch, ec_index, path, old, new) < 0) return SD_RES_EIO; /* * Recovery thread and main thread might try to recover the same object. * Either one succeeds, the other will fail and proceed and end up * trying to move the object to where it is already in place, in this * case we simply return. */ if (!strcmp(old, new)) return SD_RES_SUCCESS; /* We can't use rename(2) accross device */ if (md_move_object(oid, old, new) < 0) { sd_err("move old %s to new %s failed", old, new); return SD_RES_EIO; } sd_debug("from %s to %s", old, new); return SD_RES_SUCCESS; } static int scan_wd(uint64_t oid, uint32_t epoch, uint8_t ec_index) { int ret = SD_RES_EIO; const struct disk *disk; sd_read_lock(&md.lock); rb_for_each_entry(disk, &md.root, rb) { ret = md_check_and_move(oid, epoch, ec_index, disk->path); if (ret == SD_RES_SUCCESS) break; } sd_rw_unlock(&md.lock); return ret; } bool md_exist(uint64_t oid, uint8_t ec_index) { char path[PATH_MAX]; get_store_path(oid, ec_index, path); if (md_access(path)) return true; /* * We have to iterate the WD because we don't have epoch-like history * track to locate the objects for multiple disk failure. Simply do * hard iteration simplify the code a lot. 
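* Note that this is only a slow path: it runs after the fast md_access() * check above fails, and its cost is bounded by the number of disks.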
*/ if (scan_wd(oid, 0, ec_index) == SD_RES_SUCCESS) return true; return false; } int md_get_stale_path(uint64_t oid, uint32_t epoch, uint8_t ec_index, char *path) { if (unlikely(!epoch)) panic("invalid 0 epoch"); if (is_erasure_oid(oid)) { if (unlikely(ec_index >= SD_MAX_COPIES)) panic("invalid ec index %d", ec_index); snprintf(path, PATH_MAX, "%s/.stale/%016"PRIx64"_%d.%"PRIu32, md_get_object_dir(oid), oid, ec_index, epoch); } else snprintf(path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32, md_get_object_dir(oid), oid, epoch); if (md_access(path)) return SD_RES_SUCCESS; if (scan_wd(oid, epoch, ec_index) == SD_RES_SUCCESS) return SD_RES_SUCCESS; return SD_RES_NO_OBJ; } uint32_t md_get_info(struct sd_md_info *info) { uint32_t ret = sizeof(*info); const struct disk *disk; int i = 0; memset(info, 0, ret); sd_read_lock(&md.lock); rb_for_each_entry(disk, &md.root, rb) { info->disk[i].idx = i; pstrcpy(info->disk[i].path, PATH_MAX, disk->path); /* FIXME: better handling failure case. */ info->disk[i].free = get_path_free_size(info->disk[i].path, &info->disk[i].used); i++; } info->nr = md.nr_disks; sd_rw_unlock(&md.lock); return ret; } static inline void md_del_disk(const char *path) { struct disk *disk = path_to_disk(path); if (!disk) { sd_err("invalid path %s", path); return; } md_remove_disk(disk); } static int do_plug_unplug(char *disks, bool plug) { const char *path; int old_nr, ret = SD_RES_UNKNOWN; sd_write_lock(&md.lock); old_nr = md.nr_disks; path = strtok(disks, ","); do { if (plug) { if (!md_add_disk(path, true)) sd_err("failed to add %s", path); } else { md_del_disk(path); } } while ((path = strtok(NULL, ","))); /* If no disks change, bail out */ if (old_nr == md.nr_disks) goto out; ret = SD_RES_SUCCESS; out: sd_rw_unlock(&md.lock); if (ret == SD_RES_SUCCESS) kick_recover(); return ret; } int md_plug_disks(char *disks) { return do_plug_unplug(disks, true); } int md_unplug_disks(char *disks) { return do_plug_unplug(disks, false); } uint64_t md_get_size(uint64_t *used) { uint64_t fsize = 0; const struct disk *disk; *used = 0; sd_read_lock(&md.lock); rb_for_each_entry(disk, &md.root, rb) { fsize += get_path_free_size(disk->path, used); } sd_rw_unlock(&md.lock); return fsize + *used; } sheepdog-0.8.3/sheep/migrate.c000066400000000000000000000253261237656255000162430ustar00rootroot00000000000000/* * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "sheep_priv.h" /* sheepdog 0.4.0 */ struct node_id_v0 { uint8_t addr[16]; uint16_t port; }; struct sd_node_v0 { struct node_id_v0 nid; uint16_t nr_vnodes; uint32_t zone; }; struct sheepdog_config_v0 { uint64_t ctime; uint16_t flags; uint8_t copies; uint8_t store[STORE_LEN]; }; /* sheepdog 0.5.1 */ struct node_id_v1 { uint8_t addr[16]; uint16_t port; }; struct sd_node_v1 { struct node_id_v1 nid; uint16_t nr_vnodes; uint32_t zone; uint64_t space; }; struct sheepdog_config_v1 { uint64_t ctime; uint16_t flags; uint8_t copies; uint8_t store[STORE_LEN]; uint8_t __pad[3]; uint16_t version; uint64_t space; }; /* sheepdog 0.6.0 */ struct node_id_v2 { uint8_t addr[16]; uint16_t port; uint8_t io_addr[16]; uint16_t io_port; uint8_t pad[4]; }; struct sd_node_v2 { struct node_id_v2 nid; uint16_t nr_vnodes; uint32_t zone; uint64_t space; }; /* sheepdog_config_v2 is the same as v1 */ #define sheepdog_config_v2 sheepdog_config_v1 static size_t get_file_size(const char *path) { struct stat stbuf; int ret; ret = stat(path, &stbuf); if (ret < 0) { sd_err("failed to stat %s, %m", path); return -1; } return stbuf.st_size; } static void for_each_epoch(int (*func)(uint32_t epoch)) { DIR *dir; struct dirent *d; dir = opendir(epoch_path); if (!dir) panic("failed to open %s: %m", epoch_path); while ((d = readdir(dir))) { uint32_t e; char *p; e = strtol(d->d_name, &p, 10); if (d->d_name == p) continue; if (strlen(d->d_name) != 8) continue; if (func(e) != 0) return; } closedir(dir); } /* copy file from 'fname' to 'fname.suffix' */ static int backup_file(char *fname, char *suffix) { char dst_file[PATH_MAX]; int fd = -1, ret = -1, len; void *buf = NULL; snprintf(dst_file, sizeof(dst_file), "%s.%s", fname, suffix); fd = open(fname, O_RDONLY); if (fd < 0) { if (errno != ENOENT) { sd_err("failed to open %s, %m", fname); ret = -1; } else ret = 0; goto out; } len = get_file_size(fname); if (len < 0) goto out; buf = xmalloc(len); ret = xread(fd, buf, len); if (ret != len) { sd_err("failed to read %s, %d %m", fname, ret); ret = -1; goto out; } close(fd); fd = open(dst_file, O_CREAT | O_WRONLY | O_DSYNC, 0644); if (fd < 0) { sd_err("failed to create %s, %m", dst_file); ret = -1; goto out; } ret = xwrite(fd, buf, len); if (ret != len) { sd_err("failed to write to %s, %d %m", dst_file, ret); ret = -1; } out: if (fd >= 0) close(fd); free(buf); return ret; } static int backup_epoch(uint32_t epoch) { char path[PATH_MAX]; char suffix[256]; struct timeval tv; struct tm tm; gettimeofday(&tv, NULL); localtime_r(&tv.tv_sec, &tm); strftime(suffix, sizeof(suffix), "%Y-%m-%d_%H%M%S", &tm); snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch); return backup_file(path, suffix); } /* backup config and epoch info */ static int backup_store(void) { char suffix[256]; struct timeval tv; struct tm tm; int ret; gettimeofday(&tv, NULL); localtime_r(&tv.tv_sec, &tm); strftime(suffix, sizeof(suffix), "%Y-%m-%d_%H%M%S", &tm); ret = backup_file(config_path, suffix); if (ret < 0) return ret; for_each_epoch(backup_epoch); return 0; } static int update_epoch_from_v0_to_v1(uint32_t epoch) { char path[PATH_MAX]; struct sd_node_v0 nodes_v0[SD_MAX_NODES]; struct sd_node_v1 nodes_v1[SD_MAX_NODES]; size_t nr_nodes; time_t *t; int len, fd, ret; snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch); fd = open(path, O_RDWR | O_DSYNC); if (fd < 0) { if (errno == ENOENT) return 0; sd_err("failed to open epoch %"PRIu32" log", epoch); return -1; } ret = xread(fd, nodes_v0, sizeof(nodes_v0)); if (ret < 0) { sd_err("failed to read epoch %"PRIu32" 
log", epoch); close(fd); return ret; } nr_nodes = ret / sizeof(nodes_v0[0]); for (int i = 0; i < nr_nodes; i++) { memcpy(&nodes_v1[i].nid, &nodes_v0[i].nid, sizeof(struct node_id_v1)); nodes_v1[i].nr_vnodes = nodes_v0[i].nr_vnodes; nodes_v1[i].zone = nodes_v0[i].zone; nodes_v1[i].space = 0; } len = sizeof(nodes_v1[0]) * nr_nodes; ret = xpwrite(fd, nodes_v1, len, 0); if (ret != len) { sd_err("failed to write epoch %"PRIu32" log", epoch); close(fd); return -1; } t = (time_t *)&nodes_v0[nr_nodes]; ret = xpwrite(fd, t, sizeof(*t), len); if (ret != sizeof(*t)) { sd_err("failed to write time to epoch %" PRIu32 " log", epoch); close(fd); return -1; } close(fd); return 0; } static int migrate_from_v0_to_v1(void) { int ret, fd; struct sheepdog_config_v1 config; fd = open(config_path, O_RDWR); if (fd < 0) { sd_err("failed to open config file, %m"); return -1; } memset(&config, 0, sizeof(config)); ret = xread(fd, &config, sizeof(config)); if (ret < 0) { sd_err("failed to read config file, %m"); close(fd); return ret; } config.version = 1; ret = xpwrite(fd, &config, sizeof(config), 0); if (ret != sizeof(config)) { sd_err("failed to write config data, %m"); close(fd); return -1; } /* 0.5.1 could wrongly extend the config file, so truncate it here */ ret = xftruncate(fd, sizeof(config)); if (ret != 0) { sd_err("failed to truncate config data, %m"); close(fd); return -1; } close(fd); /* * If the config file contains a space field, the store layout * is compatible with v1. In this case, what we need to do is * only adding version number to the config file. */ if (config.space > 0) return 0; /* upgrade epoch log */ for_each_epoch(update_epoch_from_v0_to_v1); return ret; } static int update_epoch_from_v1_to_v2(uint32_t epoch) { char path[PATH_MAX]; struct sd_node_v1 nodes_v1[SD_MAX_NODES]; struct sd_node_v2 nodes_v2[SD_MAX_NODES]; size_t nr_nodes; time_t *t; int len, fd, ret; snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch); fd = open(path, O_RDWR | O_DSYNC); if (fd < 0) { if (errno == ENOENT) return 0; sd_err("failed to open epoch %"PRIu32" log", epoch); return -1; } /* * sheepdog 0.5.6 was released without incrementing the config version. 
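* Such a store already carries the newer epoch log layout while its config * version still claims v1, so checking the version number alone is not * enough.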
* We detect it by 1) checking the size of epoch file, and 2) checking * the value of sd_node.nid.port */ if ((get_file_size(path) - sizeof(time_t)) % sizeof(nodes_v1[0]) != 0) { sd_debug("%s is not a v1 format", path); close(fd); return 0; } ret = xread(fd, nodes_v1, sizeof(nodes_v1)); if (ret < 0) { sd_err("failed to read epoch %"PRIu32" log", epoch); close(fd); return ret; } nr_nodes = ret / sizeof(nodes_v1[0]); for (int i = 0; i < nr_nodes; i++) { if (nodes_v1[i].nid.port == 0) { sd_debug("%s is not a v1 format", path); return 0; } memset(&nodes_v2[i].nid, 0, sizeof(nodes_v2[i].nid)); memcpy(nodes_v2[i].nid.addr, nodes_v1[i].nid.addr, sizeof(nodes_v2[i].nid.addr)); nodes_v2[i].nid.port = nodes_v1[i].nid.port; nodes_v2[i].nr_vnodes = nodes_v1[i].nr_vnodes; nodes_v2[i].zone = nodes_v1[i].zone; nodes_v2[i].space = nodes_v1[i].space; } len = sizeof(nodes_v2[0]) * nr_nodes; ret = xpwrite(fd, nodes_v2, len, 0); if (ret != len) { sd_err("failed to write epoch %"PRIu32" log", epoch); close(fd); return -1; } t = (time_t *)&nodes_v1[nr_nodes]; ret = xpwrite(fd, t, sizeof(*t), len); if (ret != sizeof(*t)) { sd_err("failed to write time to epoch %" PRIu32 " log", epoch); close(fd); return -1; } close(fd); return 0; } static int migrate_from_v1_to_v2(void) { int fd, ret; uint16_t version = 2; char store[STORE_LEN] = "plain"; /* we have only the plain driver */ fd = open(config_path, O_WRONLY | O_DSYNC); if (fd < 0) { sd_err("failed to open config file, %m"); return -1; } ret = xpwrite(fd, &version, sizeof(version), offsetof(struct sheepdog_config_v2, version)); if (ret != sizeof(version)) { sd_err("failed to write config data, %m"); close(fd); return -1; } ret = xpwrite(fd, store, sizeof(store), offsetof(struct sheepdog_config_v2, store)); if (ret != sizeof(store)) { sd_err("failed to write config data, %m"); close(fd); return -1; } close(fd); /* upgrade epoch log */ for_each_epoch(update_epoch_from_v1_to_v2); return ret; } static int migrate_from_v2_to_v3(void) { sd_err("not implemented"); return -1; } #define OLD_ECNAME "user.ec.index" static int convert_ecidx_xattr2path(uint64_t oid, const char *wd, uint32_t epoch, uint8_t ec_index, struct vnode_info *info, void *arg) { int ret = 0; uint8_t idx; char path[PATH_MAX + 1], new_path[PATH_MAX + 1]; bool is_stale = *(bool *)arg; if (is_stale) snprintf(path, PATH_MAX, "%s/%016"PRIx64".%u", wd, oid, epoch); else snprintf(path, PATH_MAX, "%s/%016"PRIx64, wd, oid); if (getxattr(path, OLD_ECNAME, &idx, sizeof(uint8_t)) < 0) { sd_info("object: %s doesn't have its ec index in xattr: %m", path); goto out; } if (is_stale) snprintf(new_path, PATH_MAX, "%s/%016"PRIx64"_%u.%u", wd, oid, idx, epoch); else snprintf(new_path, PATH_MAX, "%s/%016"PRIx64"_%u", wd, oid, idx); if (rename(path, new_path) < 0) { sd_emerg("rename from %s to %s failed: %m", path, new_path); ret = -1; goto out; } if (removexattr(new_path, OLD_ECNAME) < 0) { sd_emerg("remove xattr %s from path %s failed: %m", OLD_ECNAME, new_path); ret = -1; } out: return ret; } static int migrate_from_v3_to_v4(void) { bool is_stale = true; int ret; ret = for_each_object_in_stale(convert_ecidx_xattr2path, (void *)&is_stale); if (ret < 0) { sd_emerg("converting store format of stale object directory" "failed"); return ret; } is_stale = false; ret = for_each_object_in_wd(convert_ecidx_xattr2path, false, (void *)&is_stale); if (ret < 0) { sd_emerg("converting store format of object directory failed"); return ret; } sd_info("converting store format v3 to v4 is ended successfully"); return 0; } static int 
(*migrate[])(void) = { migrate_from_v0_to_v1, /* from 0.4.0 or 0.5.0 to 0.5.1 */ migrate_from_v1_to_v2, /* from 0.5.x to 0.6.0 */ migrate_from_v2_to_v3, /* from 0.6.x or 0.7.x to 0.8.x */ /* * from v0.8.0 to v0.8.x (0 < x), for solving incompatibility * produced by the commit 79706e07a068 */ migrate_from_v3_to_v4, }; int sd_migrate_store(int from, int to) { int ver, ret; assert(to <= ARRAY_SIZE(migrate)); ret = backup_store(); if (ret != 0) { sd_err("failed to backup the old store"); return ret; } for (ver = from; ver < to; ver++) { ret = migrate[ver](); if (ret < 0) return ret; } /* success */ return 0; } sheepdog-0.8.3/sheep/object_cache.c000066400000000000000000001024051237656255000171760ustar00rootroot00000000000000/* * Copyright (C) 2012 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" /* * Object Cache ID * * 0 - 31 (32 bits): data object space * 32 - 51 (20 bits): reserved * 52 - 59 (8 bits): object flag space * 60 - 63 (4 bits): object type identifier space */ #define CACHE_VDI_SHIFT 63 /* if the entry is identified as VDI object */ #define CACHE_CREATE_SHIFT 59 /* If the entry should be created at backend */ #define CACHE_VDI_BIT (UINT64_C(1) << CACHE_VDI_SHIFT) #define CACHE_CREATE_BIT (UINT64_C(1) << CACHE_CREATE_SHIFT) #define CACHE_INDEX_MASK (CACHE_CREATE_BIT) #define CACHE_OBJECT_SIZE (SD_DATA_OBJ_SIZE / 1024 / 1024) /* M */ /* Kick the background pusher if dirty_count is greater than this */ #define MAX_DIRTY_OBJECT_COUNT 10 /* Just a random number, no rationale */ struct global_cache { uint32_t capacity; /* The real capacity of object cache of this node */ uatomic_bool in_reclaim; /* If the reclaimer is working */ }; struct object_cache_entry { uint64_t idx; /* Index of this entry */ refcnt_t refcnt; /* Reference count of this entry */ uint64_t bmap; /* Each bit represents one dirty block in object */ struct object_cache *oc; /* Object cache this entry belongs to */ struct rb_node node; /* For lru tree of object cache */ struct list_node dirty_list; /* For dirty list of object cache */ struct list_node lru_list; /* For lru list of object cache */ struct sd_rw_lock lock; /* Entry lock */ }; struct object_cache { uint32_t vid; /* The VID of this VDI */ uint32_t push_count; /* How many push threads queued in push phase. */ uint32_t dirty_count; /* How many dirty objects in this cache */ uint32_t total_count; /* Count of objects, both dirty and clean */ struct hlist_node hash; /* VDI is linked to the global hash lists */ struct rb_root lru_tree; /* For faster object search */ struct list_head lru_head; /* Per VDI LRU list for reclaimer */ struct list_head dirty_head; /* Dirty objects linked to this list */ int push_efd; /* Used to synchronize between pusher and push threads */ struct sd_mutex push_mutex; /* mutex for pushing cache */ struct sd_rw_lock lock; /* Cache lock */ }; struct push_work { struct work work; struct object_cache_entry *entry; struct object_cache *oc; }; static struct global_cache gcache; static char object_cache_dir[PATH_MAX]; static int def_open_flags = O_RDWR; #define HASH_BITS 5 #define HASH_SIZE (1 << HASH_BITS) static struct sd_rw_lock hashtable_lock[HASH_SIZE] = { [0 ...
HASH_SIZE - 1] = SD_RW_LOCK_INITIALIZER }; static struct hlist_head cache_hashtable[HASH_SIZE]; static int object_cache_push(struct object_cache *oc); static inline bool entry_is_dirty(const struct object_cache_entry *entry) { return !!entry->bmap; } static inline int hash(uint64_t vid) { return hash_64(vid, HASH_BITS); } /* We should always use this helper to get entry idx */ static inline uint64_t entry_idx(const struct object_cache_entry *entry) { return entry->idx & ~CACHE_INDEX_MASK; } static int object_cache_cmp(const struct object_cache_entry *a, const struct object_cache_entry *b) { return intcmp(entry_idx(a), entry_idx(b)); } static inline uint64_t object_cache_oid_to_idx(uint64_t oid) { uint64_t idx = data_oid_to_idx(oid); if (is_vdi_obj(oid)) idx |= 1ULL << CACHE_VDI_SHIFT; else if (is_vdi_btree_obj(oid)) idx |= VDI_BTREE_BIT; return idx; } static inline bool idx_has_vdi_bit(uint64_t idx) { return !!(idx & CACHE_VDI_BIT); } static inline size_t get_cache_block_size(uint64_t oid) { size_t bsize = DIV_ROUND_UP(get_objsize(oid), sizeof(uint64_t) * BITS_PER_BYTE); return round_up(bsize, BLOCK_SIZE); /* To be FS friendly */ } static uint64_t calc_object_bmap(uint64_t oid, size_t len, off_t offset) { int start, end, nr; uint64_t bmap = 0; size_t bsize = get_cache_block_size(oid); start = offset / bsize; end = DIV_ROUND_UP(len + offset, bsize); nr = end - start; while (nr--) set_bit_64(start + nr, &bmap); return bmap; } static inline void get_cache_entry(struct object_cache_entry *entry) { refcount_inc(&entry->refcnt); } static inline void put_cache_entry(struct object_cache_entry *entry) { refcount_dec(&entry->refcnt); } static inline bool entry_in_use(struct object_cache_entry *entry) { return refcount_read(&entry->refcnt) > 0; } /* * Mutual exclusion protection strategy: * * reader and writer: no need to protect since it is okay to read * unacked stale data. * reader, writer and pusher: cache lock and entry lock and refcnt. * reader, writer and reclaimer: cache lock and entry refcnt. * pusher and reclaimer: cache lock and entry refcnt. * * entry->bmap is protected mostly by the entry lock, sometimes by the cache * lock. The dirty list is protected by the cache lock.
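* * Note: write_cache_object() acquires the entry lock before the cache lock; * keep that ordering when adding new code, or deadlock can result.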
*/ static inline void read_lock_cache(struct object_cache *oc) { sd_read_lock(&oc->lock); } static inline void write_lock_cache(struct object_cache *oc) { sd_write_lock(&oc->lock); } static inline void unlock_cache(struct object_cache *oc) { sd_rw_unlock(&oc->lock); } static inline void read_lock_entry(struct object_cache_entry *entry) { sd_read_lock(&entry->lock); } static inline void write_lock_entry(struct object_cache_entry *entry) { sd_write_lock(&entry->lock); } static inline void unlock_entry(struct object_cache_entry *entry) { sd_rw_unlock(&entry->lock); } static struct object_cache_entry * lru_tree_insert(struct rb_root *root, struct object_cache_entry *new) { return rb_insert(root, new, node, object_cache_cmp); } static struct object_cache_entry *lru_tree_search(struct rb_root *root, uint64_t idx) { struct object_cache_entry key = { .idx = idx }; return rb_search(root, &key, node, object_cache_cmp); } static void do_background_push(struct work *work) { struct push_work *pw = container_of(work, struct push_work, work); struct object_cache *oc = pw->oc; if (sd_mutex_trylock(&oc->push_mutex) == EBUSY) return; object_cache_push(oc); sd_mutex_unlock(&oc->push_mutex); } static void background_push_done(struct work *work) { struct push_work *pw = container_of(work, struct push_work, work); free(pw); } static void kick_background_pusher(struct object_cache *oc) { struct push_work *pw; pw = xzalloc(sizeof(struct push_work)); pw->oc = oc; pw->work.fn = do_background_push; pw->work.done = background_push_done; queue_work(sys->oc_push_wqueue, &pw->work); } static void del_from_dirty_list(struct object_cache_entry *entry) { struct object_cache *oc = entry->oc; list_del(&entry->dirty_list); uatomic_dec(&oc->dirty_count); } static void add_to_dirty_list(struct object_cache_entry *entry) { struct object_cache *oc = entry->oc; list_add_tail(&entry->dirty_list, &oc->dirty_head); /* FIXME read sys->status atomically */ if (uatomic_add_return(&oc->dirty_count, 1) > MAX_DIRTY_OBJECT_COUNT && sys->cinfo.status == SD_STATUS_OK) kick_background_pusher(oc); } static inline void free_cache_entry(struct object_cache_entry *entry) { struct object_cache *oc = entry->oc; rb_erase(&entry->node, &oc->lru_tree); list_del(&entry->lru_list); oc->total_count--; if (list_linked(&entry->dirty_list)) del_from_dirty_list(entry); sd_destroy_rw_lock(&entry->lock); free(entry); } static uint64_t idx_to_oid(uint32_t vid, uint64_t idx) { if (idx_has_vdi_bit(idx)) return vid_to_vdi_oid(vid); else return vid_to_data_oid(vid, idx); } static int remove_cache_object(struct object_cache *oc, uint64_t idx) { int ret = SD_RES_SUCCESS; char path[PATH_MAX]; snprintf(path, sizeof(path), "%s/%06"PRIx32"/%016"PRIx64, object_cache_dir, oc->vid, idx); sd_debug("%"PRIx64, idx_to_oid(oc->vid, idx)); if (unlikely(unlink(path) < 0)) { sd_err("failed to remove cached object %m"); if (errno == ENOENT) return SD_RES_SUCCESS; ret = SD_RES_EIO; goto out; } out: return ret; } static int read_cache_object_noupdate(uint32_t vid, uint64_t idx, void *buf, size_t count, off_t offset) { size_t size; int fd, flags = def_open_flags, ret = SD_RES_SUCCESS; char p[PATH_MAX]; snprintf(p, sizeof(p), "%s/%06"PRIx32"/%016"PRIx64, object_cache_dir, vid, idx); if (sys->object_cache_directio && !idx_has_vdi_bit(idx)) { assert(is_aligned_to_pagesize(buf)); flags |= O_DIRECT; } fd = open(p, flags, sd_def_fmode); if (unlikely(fd < 0)) { sd_err("%m"); ret = SD_RES_EIO; goto out; } size = xpread(fd, buf, count, offset); if (unlikely(size != count)) { sd_err("size %zu, 
count:%zu, offset %jd %m", size, count, (intmax_t)offset); ret = SD_RES_EIO; goto out_close; } out_close: close(fd); out: return ret; } static int write_cache_object_noupdate(uint32_t vid, uint64_t idx, void *buf, size_t count, off_t offset) { size_t size; int fd, flags = def_open_flags, ret = SD_RES_SUCCESS; char p[PATH_MAX]; snprintf(p, sizeof(p), "%s/%06"PRIx32"/%016"PRIx64, object_cache_dir, vid, idx); if (sys->object_cache_directio && !idx_has_vdi_bit(idx)) { assert(is_aligned_to_pagesize(buf)); flags |= O_DIRECT; } fd = open(p, flags, sd_def_fmode); if (unlikely(fd < 0)) { sd_err("%m"); ret = SD_RES_EIO; goto out; } size = xpwrite(fd, buf, count, offset); if (unlikely(size != count)) { sd_err("size %zu, count:%zu, offset %jd %m", size, count, (intmax_t)offset); ret = SD_RES_EIO; goto out_close; } out_close: close(fd); out: return ret; } static int read_cache_object(struct object_cache_entry *entry, void *buf, size_t count, off_t offset) { uint32_t vid = entry->oc->vid; uint64_t idx = entry_idx(entry); struct object_cache *oc = entry->oc; int ret; ret = read_cache_object_noupdate(vid, idx, buf, count, offset); if (ret == SD_RES_SUCCESS) { write_lock_cache(oc); list_move_tail(&entry->lru_list, &oc->lru_head); unlock_cache(oc); } return ret; } static int write_cache_object(struct object_cache_entry *entry, void *buf, size_t count, off_t offset, bool create, bool writeback) { uint32_t vid = entry->oc->vid; uint64_t idx = entry_idx(entry); uint64_t oid = idx_to_oid(vid, idx); struct object_cache *oc = entry->oc; struct sd_req hdr; int ret; write_lock_entry(entry); ret = write_cache_object_noupdate(vid, idx, buf, count, offset); if (ret != SD_RES_SUCCESS) { unlock_entry(entry); return ret; } write_lock_cache(oc); if (writeback) { entry->bmap |= calc_object_bmap(oid, count, offset); if (!list_linked(&entry->dirty_list)) add_to_dirty_list(entry); } list_move_tail(&entry->lru_list, &oc->lru_head); unlock_cache(oc); unlock_entry(entry); if (writeback) goto out; if (create) sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); else sd_init_req(&hdr, SD_OP_WRITE_OBJ); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = count; hdr.obj.oid = oid; hdr.obj.offset = offset; ret = exec_local_req(&hdr, buf); if (ret != SD_RES_SUCCESS) { sd_err("failed to write object %" PRIx64 ", %s", oid, sd_strerror(ret)); return ret; } out: return ret; } static int push_cache_object(uint32_t vid, uint64_t idx, uint64_t bmap, bool create) { struct sd_req hdr; void *buf; off_t offset; uint64_t oid = idx_to_oid(vid, idx); size_t data_length, bsize = get_cache_block_size(oid); int ret = SD_RES_NO_MEM; int first_bit, last_bit; if (!bmap) { sd_debug("WARN: nothing to flush %"PRIx64, oid); return SD_RES_SUCCESS; } first_bit = ffsll(bmap) - 1; last_bit = fls64(bmap) - 1; sd_debug("%"PRIx64" bmap(%zd):0x%"PRIx64", first_bit:%d, last_bit:%d", oid, bsize, bmap, first_bit, last_bit); offset = first_bit * bsize; data_length = min((last_bit - first_bit + 1) * bsize, get_objsize(oid) - (size_t)offset); buf = xvalloc(data_length); ret = read_cache_object_noupdate(vid, idx, buf, data_length, offset); if (ret != SD_RES_SUCCESS) goto out; if (create) sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); else sd_init_req(&hdr, SD_OP_WRITE_OBJ); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = data_length; hdr.obj.oid = oid; hdr.obj.offset = offset; ret = exec_local_req(&hdr, buf); if (ret != SD_RES_SUCCESS) sd_err("failed to push object %" PRIx64 ", %s", oid, sd_strerror(ret)); out: free(buf); return ret; } /* * The reclaim algorithm is similar to 
Linux kernel's page cache: * - only tries to reclaim 'clean' object, which doesn't has any dirty updates, * in a LRU list. * - skip the object when it is in R/W operation. * - skip the dirty object if it is not in push(writeback) phase. * - wait on the dirty object if it is in push phase. */ /* * 90% is targeted for a large cache quota such as 200G, then we have 20G * buffer which is large enough to prevent cache overrun. */ #define HIGH_WATERMARK (sys->object_cache_size * 9 / 10) static void do_reclaim_object(struct object_cache *oc) { struct object_cache_entry *entry; uint64_t oid; uint32_t cap; write_lock_cache(oc); list_for_each_entry(entry, &oc->lru_head, lru_list) { oid = idx_to_oid(oc->vid, entry_idx(entry)); if (entry_in_use(entry)) { sd_debug("%"PRIx64" is in use, skip...", oid); continue; } /* * The shared snapshot objects won't be released after being * pulled and if sheep restarts, the remaining snapshot objects * will be marked as dirty. So for these kind of objects, we * can reclaim them safely. */ if (entry_is_dirty(entry) && !oid_is_readonly(oid)) { sd_debug("%"PRIx64" is dirty, skip...", oid); continue; } if (remove_cache_object(oc, entry_idx(entry)) != SD_RES_SUCCESS) continue; free_cache_entry(entry); cap = uatomic_sub_return(&gcache.capacity, CACHE_OBJECT_SIZE); sd_debug("%"PRIx64" reclaimed. capacity:%"PRId32, oid, cap); if (cap <= HIGH_WATERMARK) break; } unlock_cache(oc); } struct reclaim_work { struct work work; int delay; }; static void do_reclaim(struct work *work) { struct reclaim_work *rw = container_of(work, struct reclaim_work, work); struct object_cache *cache; struct hlist_node *node; int i, j; if (rw->delay) sleep(rw->delay); /* We choose a random victim to avoid reclaim the same one every time */ j = random(); for (i = 0; i < HASH_SIZE; i++) { int idx = (i + j) % HASH_SIZE; struct hlist_head *head = cache_hashtable + idx; sd_read_lock(&hashtable_lock[idx]); hlist_for_each_entry(cache, node, head, hash) { uint32_t cap; do_reclaim_object(cache); cap = uatomic_read(&gcache.capacity); if (cap <= HIGH_WATERMARK) { sd_rw_unlock(&hashtable_lock[idx]); sd_debug("complete, capacity %"PRIu32, cap); return; } } sd_rw_unlock(&hashtable_lock[idx]); } sd_debug("finished"); } static void reclaim_done(struct work *work) { struct reclaim_work *rw = container_of(work, struct reclaim_work, work); uatomic_set_false(&gcache.in_reclaim); free(rw); } static int create_dir_for(uint32_t vid) { int ret = 0; char p[PATH_MAX]; snprintf(p, sizeof(p), "%s/%06"PRIx32, object_cache_dir, vid); if (xmkdir(p, sd_def_dmode) < 0) { sd_err("%s, %m", p); ret = -1; } return ret; } static struct object_cache *find_object_cache(uint32_t vid, bool create) { int h = hash(vid); struct hlist_head *head = cache_hashtable + h; struct object_cache *cache = NULL; struct hlist_node *node; if (create) sd_write_lock(&hashtable_lock[h]); else sd_read_lock(&hashtable_lock[h]); if (hlist_empty(head)) goto not_found; hlist_for_each_entry(cache, node, head, hash) { if (cache->vid == vid) goto out; } not_found: if (create) { cache = xzalloc(sizeof(*cache)); cache->vid = vid; INIT_RB_ROOT(&cache->lru_tree); create_dir_for(vid); cache->push_efd = eventfd(0, 0); INIT_LIST_HEAD(&cache->dirty_head); INIT_LIST_HEAD(&cache->lru_head); sd_init_rw_lock(&cache->lock); hlist_add_head(&cache->hash, head); sd_init_mutex(&cache->push_mutex); } else { cache = NULL; } out: sd_rw_unlock(&hashtable_lock[h]); return cache; } static void object_cache_try_to_reclaim(int delay) { struct reclaim_work *rw; if (!sys->object_cache_size) 
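/* object_cache_size == 0 means no cache quota is configured: never reclaim */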
return; if (uatomic_read(&gcache.capacity) < HIGH_WATERMARK) return; if (!uatomic_set_true(&gcache.in_reclaim)) /* the cache is already in reclaim, */ return; rw = xzalloc(sizeof(struct reclaim_work)); rw->delay = delay; rw->work.fn = do_reclaim; rw->work.done = reclaim_done; queue_work(sys->oc_reclaim_wqueue, &rw->work); } static inline struct object_cache_entry * alloc_cache_entry(struct object_cache *oc, uint64_t idx) { struct object_cache_entry *entry; entry = xzalloc(sizeof(*entry)); entry->oc = oc; entry->idx = idx; sd_init_rw_lock(&entry->lock); INIT_LIST_NODE(&entry->dirty_list); INIT_LIST_NODE(&entry->lru_list); return entry; } static void add_to_lru_cache(struct object_cache *oc, uint64_t idx, bool create) { struct object_cache_entry *entry = alloc_cache_entry(oc, idx); sd_debug("oid %"PRIx64" added", idx_to_oid(oc->vid, idx)); write_lock_cache(oc); if (unlikely(lru_tree_insert(&oc->lru_tree, entry))) panic("the object already exist"); uatomic_add(&gcache.capacity, CACHE_OBJECT_SIZE); list_add_tail(&entry->lru_list, &oc->lru_head); oc->total_count++; if (create) { /* Cache lock assure it is not raced with pusher */ entry->bmap = UINT64_MAX; entry->idx |= CACHE_CREATE_BIT; add_to_dirty_list(entry); } unlock_cache(oc); } static inline int lookup_path(char *path) { int ret = SD_RES_SUCCESS; if (access(path, R_OK | W_OK) < 0) { if (unlikely(errno != ENOENT)) { sd_debug("%m"); ret = SD_RES_EIO; } else { ret = SD_RES_NO_CACHE; } } return ret; } static int object_cache_lookup(struct object_cache *oc, uint64_t idx, bool create, bool writeback) { int fd, ret, flags = def_open_flags; char path[PATH_MAX]; snprintf(path, sizeof(path), "%s/%06"PRIx32"/%016"PRIx64, object_cache_dir, oc->vid, idx); if (!create) return lookup_path(path); flags |= O_CREAT | O_TRUNC; fd = open(path, flags, sd_def_fmode); if (unlikely(fd < 0)) { sd_debug("%s, %m", path); ret = SD_RES_EIO; goto out; } ret = prealloc(fd, get_objsize(idx_to_oid(oc->vid, idx))); if (unlikely(ret < 0)) { ret = SD_RES_EIO; goto out_close; } add_to_lru_cache(oc, idx, writeback); object_cache_try_to_reclaim(0); out_close: close(fd); out: return ret; } static int create_cache_object(struct object_cache *oc, uint64_t idx, void *buffer, size_t buf_size) { int flags = def_open_flags | O_CREAT | O_EXCL, fd; int ret = SD_RES_OID_EXIST; char path[PATH_MAX], tmp_path[PATH_MAX]; snprintf(tmp_path, sizeof(tmp_path), "%s/%06"PRIx32"/%016"PRIx64".tmp", object_cache_dir, oc->vid, idx); fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (likely(errno == EEXIST)) { sd_debug("%016"PRIx64" already created", idx); goto out; } sd_debug("%m"); ret = SD_RES_EIO; goto out; } ret = xwrite(fd, buffer, buf_size); if (unlikely(ret != buf_size)) { ret = SD_RES_EIO; sd_err("failed, vid %"PRIx32", idx %"PRIx64, oc->vid, idx); goto out_close; } /* This is intended to take care of partial write due to crash */ snprintf(path, sizeof(path), "%s/%06"PRIx32"/%016"PRIx64, object_cache_dir, oc->vid, idx); ret = link(tmp_path, path); if (unlikely(ret < 0)) { if (errno == EEXIST) { ret = SD_RES_OID_EXIST; goto out_close; } sd_debug("failed to link %s to %s: %m", tmp_path, path); /* FIXME: teach object cache handle EIO gracefully */ ret = SD_RES_EIO; goto out_close; } ret = SD_RES_SUCCESS; sd_debug("%016"PRIx64" size %zu", idx, buf_size); out_close: close(fd); unlink(tmp_path); out: return ret; } /* Fetch the object, cache it in the clean state */ static int object_cache_pull(struct object_cache *oc, uint64_t idx) { struct sd_req hdr; int ret = SD_RES_NO_MEM; uint64_t 
oid = idx_to_oid(oc->vid, idx); uint32_t data_length = get_objsize(oid); void *buf; buf = xvalloc(data_length); sd_init_req(&hdr, SD_OP_READ_OBJ); hdr.data_length = data_length; hdr.obj.oid = oid; hdr.obj.offset = 0; ret = exec_local_req(&hdr, buf); if (ret != SD_RES_SUCCESS) goto err; sd_debug("oid %"PRIx64" pulled successfully", oid); ret = create_cache_object(oc, idx, buf, data_length); /* * We try to delay reclaim objects to avoid object ping-pong * because the pulled object is clean and likely to be reclaimed * in a cache over high watermark. We can't simply pass without * waking up reclaimer because the cache is easy to be filled * full with a read storm. */ switch (ret) { case SD_RES_SUCCESS: add_to_lru_cache(oc, idx, false); object_cache_try_to_reclaim(1); break; case SD_RES_OID_EXIST: ret = SD_RES_SUCCESS; break; default: break; } err: free(buf); return ret; } static void do_push_object(struct work *work) { struct push_work *pw = container_of(work, struct push_work, work); struct object_cache_entry *entry = pw->entry; struct object_cache *oc = entry->oc; uint64_t oid = idx_to_oid(oc->vid, entry_idx(entry)); sd_debug("%"PRIx64, oid); read_lock_entry(entry); /* * We might happen to push readonly object in following scenario * 1. sheep pulled some read-only objects * 2. sheep crashed * 3. sheep restarted and marked all the objects in cache dirty blindly */ if (oid_is_readonly(idx_to_oid(oc->vid, entry_idx(entry)))) goto clean; if (unlikely(push_cache_object(oc->vid, entry_idx(entry), entry->bmap, !!(entry->idx & CACHE_CREATE_BIT)) != SD_RES_SUCCESS)) panic("push failed but should never fail"); clean: if (uatomic_sub_return(&oc->push_count, 1) == 0) eventfd_xwrite(oc->push_efd, 1); entry->idx &= ~CACHE_CREATE_BIT; entry->bmap = 0; unlock_entry(entry); sd_debug("%"PRIx64" done", oid); put_cache_entry(entry); } static void push_object_done(struct work *work) { struct push_work *pw = container_of(work, struct push_work, work); free(pw); } /* * Push back all the dirty objects before the FLUSH request to sheep replicated * storage synchronously. * * 1. Don't grab cache lock tight so we can serve RW requests while pushing. * It is okay for allow subsequent RW after FLUSH because we only need to * garantee the dirty objects before FLUSH to be pushed. * 2. Use threaded AIO to boost push performance, such as fsync(2) from VM. 
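* * The pusher preloads push_count with dirty_count and sleeps on push_efd; * the last push worker to finish wakes it up in do_push_object().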
*/ static int object_cache_push(struct object_cache *oc) { struct object_cache_entry *entry; write_lock_cache(oc); if (list_empty(&oc->dirty_head)) { unlock_cache(oc); return SD_RES_SUCCESS; } uatomic_set(&oc->push_count, uatomic_read(&oc->dirty_count)); list_for_each_entry(entry, &oc->dirty_head, dirty_list) { struct push_work *pw; get_cache_entry(entry); pw = xzalloc(sizeof(struct push_work)); pw->work.fn = do_push_object; pw->work.done = push_object_done; pw->entry = entry; queue_work(sys->oc_push_wqueue, &pw->work); del_from_dirty_list(entry); } unlock_cache(oc); eventfd_xread(oc->push_efd); sd_debug("%"PRIx32" completed", oc->vid); return SD_RES_SUCCESS; } bool object_is_cached(uint64_t oid) { uint32_t vid = oid_to_vid(oid); uint64_t idx = object_cache_oid_to_idx(oid); struct object_cache *cache; cache = find_object_cache(vid, false); if (!cache) return false; return (object_cache_lookup(cache, idx, 0, false) == SD_RES_SUCCESS); } void object_cache_delete(uint32_t vid) { struct object_cache *cache; int h = hash(vid); struct object_cache_entry *entry; char path[PATH_MAX]; cache = find_object_cache(vid, false); if (!cache) return; /* Firstly we free memeory */ sd_write_lock(&hashtable_lock[h]); hlist_del(&cache->hash); sd_rw_unlock(&hashtable_lock[h]); write_lock_cache(cache); list_for_each_entry(entry, &cache->lru_head, lru_list) { free_cache_entry(entry); uatomic_sub(&gcache.capacity, CACHE_OBJECT_SIZE); } unlock_cache(cache); sd_destroy_rw_lock(&cache->lock); close(cache->push_efd); free(cache); /* Then we free disk */ snprintf(path, sizeof(path), "%s/%06"PRIx32, object_cache_dir, vid); rmdir_r(path); } static struct object_cache_entry * get_cache_entry_from(struct object_cache *cache, uint64_t idx) { struct object_cache_entry *entry; read_lock_cache(cache); entry = lru_tree_search(&cache->lru_tree, idx); if (!entry) { /* The cache entry may be reclaimed, so try again. 
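Returning NULL lets the caller retry the lookup or treat the object as uncached.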
*/ unlock_cache(cache); return NULL; } get_cache_entry(entry); unlock_cache(cache); return entry; } /* This helper increases the refcount */ static struct object_cache_entry *oid_to_entry(uint64_t oid) { uint32_t vid = oid_to_vid(oid); uint64_t idx = object_cache_oid_to_idx(oid); struct object_cache *cache; struct object_cache_entry *entry; cache = find_object_cache(vid, false); entry = get_cache_entry_from(cache, idx); if (!entry) { sd_debug("%" PRIx64 " doesn't exist", oid); return NULL; } return entry; } static int object_cache_flush_and_delete(struct object_cache *oc) { DIR *dir; struct dirent *d; uint32_t vid = oc->vid; uint64_t idx; uint64_t all = UINT64_MAX; int ret = 0; char p[PATH_MAX]; sd_debug("%"PRIx32, vid); snprintf(p, sizeof(p), "%s/%06"PRIx32, object_cache_dir, vid); dir = opendir(p); if (!dir) { sd_debug("%m"); ret = -1; goto out; } while ((d = readdir(dir))) { if (!strncmp(d->d_name, ".", 1)) continue; if (strcmp(d->d_name + 8, ".tmp") == 0) { sd_debug("try to del %s", d->d_name); if (unlinkat(dirfd(dir), d->d_name, 0) < 0) sd_err("%m"); continue; } idx = strtoull(d->d_name, NULL, 16); if (idx == ULLONG_MAX) continue; if (push_cache_object(vid, idx, all, true) != SD_RES_SUCCESS) { ret = -1; goto out_close_dir; } } object_cache_delete(vid); out_close_dir: closedir(dir); out: return ret; } bool bypass_object_cache(const struct request *req) { uint64_t oid = req->rq.obj.oid; if (!sys->enable_object_cache || req->local) return true; /* For vmstate && vdi_attr object, we don't do caching */ if (is_vmstate_obj(oid) || is_vdi_attr_obj(oid) || req->rq.flags & SD_FLAG_CMD_COW) return true; if (req->rq.flags & SD_FLAG_CMD_DIRECT) { uint32_t vid = oid_to_vid(oid); struct object_cache *cache; cache = find_object_cache(vid, false); if (!cache) return true; if (req->rq.flags & SD_FLAG_CMD_WRITE) { object_cache_flush_and_delete(cache); return true; } else { /* For read requet, we can read cache if any */ uint64_t idx = object_cache_oid_to_idx(oid); if (object_cache_lookup(cache, idx, false, false) == 0) return false; else return true; } } return false; } int object_cache_handle_request(struct request *req) { struct sd_req *hdr = &req->rq; uint64_t oid = req->rq.obj.oid; uint32_t vid = oid_to_vid(oid); uint64_t idx = object_cache_oid_to_idx(oid); struct object_cache *cache; struct object_cache_entry *entry; int ret; bool create = false; sd_debug("%08" PRIx64 ", len %" PRIu32 ", off %" PRIu32, idx, hdr->data_length, hdr->obj.offset); cache = find_object_cache(vid, true); if (req->rq.opcode == SD_OP_CREATE_AND_WRITE_OBJ) create = true; retry: ret = object_cache_lookup(cache, idx, create, hdr->flags & SD_FLAG_CMD_CACHE); switch (ret) { case SD_RES_NO_CACHE: ret = object_cache_pull(cache, idx); if (ret != SD_RES_SUCCESS) return ret; break; case SD_RES_EIO: return ret; } entry = get_cache_entry_from(cache, idx); if (!entry) { sd_debug("retry oid %"PRIx64, oid); /* * For the case that object exists but isn't added to object * list yet, we call pthread_yield() to expect other thread can * add object to list ASAP. 
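* This can happen when another thread has already created the cache file * but has not yet inserted the entry into the lru tree.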
*/ pthread_yield(); goto retry; } if (hdr->flags & SD_FLAG_CMD_WRITE) { ret = write_cache_object(entry, req->data, hdr->data_length, hdr->obj.offset, create, hdr->flags & SD_FLAG_CMD_CACHE); if (ret != SD_RES_SUCCESS) goto err; } else { ret = read_cache_object(entry, req->data, hdr->data_length, hdr->obj.offset); if (ret != SD_RES_SUCCESS) goto err; req->rp.data_length = hdr->data_length; } err: put_cache_entry(entry); return ret; } int object_cache_write(uint64_t oid, char *data, unsigned int datalen, uint64_t offset, bool create) { struct object_cache_entry *entry = oid_to_entry(oid); int ret; sd_debug("%" PRIx64, oid); if (!entry) { sd_debug("%" PRIx64 " doesn't exist", oid); return SD_RES_NO_CACHE; } ret = write_cache_object(entry, data, datalen, offset, create, false); put_cache_entry(entry); return ret; } int object_cache_read(uint64_t oid, char *data, unsigned int datalen, uint64_t offset) { struct object_cache_entry *entry = oid_to_entry(oid); int ret; sd_debug("%" PRIx64, oid); if (!entry) { sd_debug("%" PRIx64 " doesn't exist", oid); return SD_RES_NO_CACHE; } ret = read_cache_object(entry, data, datalen, offset); put_cache_entry(entry); return ret; } int object_cache_flush_vdi(uint32_t vid) { struct object_cache *cache; int ret; cache = find_object_cache(vid, false); if (!cache) { sd_debug("%"PRIx32" not found", vid); return SD_RES_SUCCESS; } /* * We have to wait for the last pusher to finish and then push again, so * that dirty bits produced while it was waiting are guaranteed * to be pushed back */ sd_mutex_lock(&cache->push_mutex); ret = object_cache_push(cache); sd_mutex_unlock(&cache->push_mutex); return ret; } int object_cache_flush_and_del(const struct request *req) { uint32_t vid = oid_to_vid(req->rq.obj.oid); struct object_cache *cache; cache = find_object_cache(vid, false); if (cache && object_cache_flush_and_delete(cache) < 0) return SD_RES_EIO; return SD_RES_SUCCESS; } static int load_cache_object(struct object_cache *cache) { DIR *dir; struct dirent *d; uint64_t idx; char path[PATH_MAX]; int ret = 0; snprintf(path, sizeof(path), "%s/%06"PRIx32, object_cache_dir, cache->vid); dir = opendir(path); if (!dir) { sd_debug("%m"); ret = -1; goto out; } while ((d = readdir(dir))) { if (!strncmp(d->d_name, ".", 1)) continue; if (strcmp(d->d_name + 8, ".tmp") == 0) { sd_debug("try to del %s", d->d_name); if (unlinkat(dirfd(dir), d->d_name, 0) < 0) sd_err("%m"); continue; } idx = strtoull(d->d_name, NULL, 16); if (idx == ULLONG_MAX) continue; /* * We don't know VM's cache type after restarting, so we assume * that it is writeback and mark all the objects dirty to avoid * false reclaim. Do not try to reclaim at loading phase because * the cluster isn't fully working.
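* Passing true below sets the full dirty bitmap and the create bit on each * loaded object, so the next flush pushes the whole object back.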
*/ add_to_lru_cache(cache, idx, true); sd_debug("%"PRIx64, idx_to_oid(cache->vid, idx)); } closedir(dir); out: return ret; } static int load_cache(void) { DIR *dir; struct dirent *d; unsigned long vid; char path[PATH_MAX]; int ret = 0; snprintf(path, sizeof(path), "%s", object_cache_dir); dir = opendir(path); if (!dir) { sd_debug("%m"); ret = -1; goto out; } while ((d = readdir(dir))) { if (!strncmp(d->d_name, ".", 1)) continue; vid = strtoul(d->d_name, NULL, 16); if (vid == ULONG_MAX) continue; load_cache_object(find_object_cache(vid, true)); } closedir(dir); out: return ret; } int object_cache_remove(uint64_t oid) { /* Inc the entry refcount to exclude the reclaimer */ struct object_cache_entry *entry = oid_to_entry(oid); struct object_cache *oc; int ret; if (!entry) return SD_RES_NO_OBJ; oc = entry->oc; sd_debug("%" PRIx64, oid); while (refcount_read(&entry->refcnt) > 1) usleep(100000); /* Object might be in push */ write_lock_cache(oc); /* * We assume no other thread will inc the refcount of this entry * before we call write_lock_cache(). object_cache_remove() is called * in the DISCARD context, which means nornamly no other read/write * requests. */ assert(refcount_read(&entry->refcnt) == 1); ret = remove_cache_object(oc, entry_idx(entry)); if (ret != SD_RES_SUCCESS) { unlock_cache(oc); return ret; } free_cache_entry(entry); unlock_cache(oc); uatomic_sub(&gcache.capacity, CACHE_OBJECT_SIZE); return SD_RES_SUCCESS; } int object_cache_init(const char *p) { int ret = 0; struct strbuf buf = STRBUF_INIT; strbuf_addstr(&buf, p); if (xmkdir(buf.buf, sd_def_dmode) < 0) { sd_err("%s %m", buf.buf); ret = -1; goto err; } strbuf_addstr(&buf, "/cache"); if (xmkdir(buf.buf, sd_def_dmode) < 0) { sd_err("%s %m", buf.buf); ret = -1; goto err; } strbuf_copyout(&buf, object_cache_dir, sizeof(object_cache_dir)); uatomic_set(&gcache.capacity, 0); uatomic_set_false(&gcache.in_reclaim); ret = load_cache(); err: strbuf_release(&buf); return ret; } void object_cache_format(void) { struct object_cache *cache; struct hlist_node *node; int i; for (i = 0; i < HASH_SIZE; i++) { struct hlist_head *head = cache_hashtable + i; hlist_for_each_entry(cache, node, head, hash) { object_cache_delete(cache->vid); } } uatomic_set(&gcache.capacity, 0); } int object_cache_get_info(struct object_cache_info *info) { int j = 0; memset(info, 0, sizeof(*info)); info->used = (uint64_t)gcache.capacity * 1024 * 1024; info->size = (uint64_t)sys->object_cache_size * 1024 * 1024; for (int i = 0; i < HASH_SIZE; i++) { struct hlist_head *head = cache_hashtable + i; struct object_cache *cache; struct hlist_node *node; sd_read_lock(&hashtable_lock[i]); hlist_for_each_entry(cache, node, head, hash) { read_lock_cache(cache); info->caches[j].vid = cache->vid; info->caches[j].dirty = cache->dirty_count; info->caches[j].total = cache->total_count; j++; unlock_cache(cache); } sd_rw_unlock(&hashtable_lock[i]); } info->count = j; info->directio = sys->object_cache_directio; return sizeof(*info); } sheepdog-0.8.3/sheep/object_list_cache.c000066400000000000000000000121441237656255000202310ustar00rootroot00000000000000/* * Copyright (C) 2012 Taobao Inc. * * Levin Li * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "sheep_priv.h" struct objlist_cache_entry { uint64_t oid; struct rb_node node; }; struct objlist_cache { int tree_version; int buf_version; int cache_size; uint64_t *buf; struct rb_root root; struct sd_rw_lock lock; }; struct objlist_deletion_work { uint32_t vid; struct work work; }; static struct objlist_cache obj_list_cache = { .tree_version = 1, .root = RB_ROOT, .lock = SD_RW_LOCK_INITIALIZER, }; static int objlist_cache_cmp(const struct objlist_cache_entry *a, const struct objlist_cache_entry *b) { return intcmp(a->oid, b->oid); } static struct objlist_cache_entry *objlist_cache_rb_insert(struct rb_root *root, struct objlist_cache_entry *new) { return rb_insert(root, new, node, objlist_cache_cmp); } static int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid) { struct objlist_cache_entry *entry, key = { .oid = oid }; entry = rb_search(root, &key, node, objlist_cache_cmp); if (!entry) return -1; rb_erase(&entry->node, root); free(entry); return 0; } void objlist_cache_remove(uint64_t oid) { sd_write_lock(&obj_list_cache.lock); if (!objlist_cache_rb_remove(&obj_list_cache.root, oid)) { obj_list_cache.cache_size--; obj_list_cache.tree_version++; } sd_rw_unlock(&obj_list_cache.lock); } int objlist_cache_insert(uint64_t oid) { struct objlist_cache_entry *entry, *p; entry = xzalloc(sizeof(*entry)); entry->oid = oid; rb_init_node(&entry->node); sd_write_lock(&obj_list_cache.lock); p = objlist_cache_rb_insert(&obj_list_cache.root, entry); if (p) free(entry); else { obj_list_cache.cache_size++; obj_list_cache.tree_version++; } sd_rw_unlock(&obj_list_cache.lock); return 0; } int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data) { int nr = 0; struct objlist_cache_entry *entry; /* first try getting the cached buffer with only a read lock held */ sd_read_lock(&obj_list_cache.lock); if (obj_list_cache.tree_version == obj_list_cache.buf_version) goto out; /* if that fails grab a write lock for the usually nessecary update */ sd_rw_unlock(&obj_list_cache.lock); sd_write_lock(&obj_list_cache.lock); if (obj_list_cache.tree_version == obj_list_cache.buf_version) goto out; obj_list_cache.buf_version = obj_list_cache.tree_version; obj_list_cache.buf = xrealloc(obj_list_cache.buf, obj_list_cache.cache_size * sizeof(uint64_t)); rb_for_each_entry(entry, &obj_list_cache.root, node) { obj_list_cache.buf[nr++] = entry->oid; } out: if (hdr->data_length < obj_list_cache.cache_size * sizeof(uint64_t)) { sd_rw_unlock(&obj_list_cache.lock); sd_err("GET_OBJ_LIST buffer too small"); return SD_RES_BUFFER_SMALL; } rsp->data_length = obj_list_cache.cache_size * sizeof(uint64_t); memcpy(data, obj_list_cache.buf, rsp->data_length); sd_rw_unlock(&obj_list_cache.lock); return SD_RES_SUCCESS; } static void objlist_deletion_work(struct work *work) { struct objlist_deletion_work *ow = container_of(work, struct objlist_deletion_work, work); struct objlist_cache_entry *entry; uint32_t vid = ow->vid, entry_vid; /* * Before reclaiming the cache belonging to the VDI just deleted, * we should test whether the VDI is exist, because after some node * deleting it and before the notification is sent to all the node, * another node may issus a VDI creation event and reused the VDI id * again, in which case we should not reclaim the cached entry. 
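* For example: node A deletes VDI X and queues this cleanup, but before the * deletion notification reaches every node, a client creates a new VDI that * is assigned the recycled vid; the vdi_exist() check below then aborts the * cleanup.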
*/ if (vdi_exist(vid)) { sd_debug("VDI (%" PRIx32 ") is still in use, can not be" " deleted", vid); return; } sd_write_lock(&obj_list_cache.lock); rb_for_each_entry(entry, &obj_list_cache.root, node) { entry_vid = oid_to_vid(entry->oid); if (entry_vid != vid) continue; /* VDI objects cannot be removed even after we delete images. */ if (is_vdi_obj(entry->oid)) continue; sd_debug("delete object entry %" PRIx64, entry->oid); rb_erase(&entry->node, &obj_list_cache.root); free(entry); } sd_rw_unlock(&obj_list_cache.lock); } static void objlist_deletion_done(struct work *work) { struct objlist_deletion_work *ow = container_of(work, struct objlist_deletion_work, work); free(ow); } /* * During recovery, some objects may be migrated from one node to a * new one, but we can't remove the object list cache entry in this * case, it may causes recovery failure, so after recovery, we can * not locate the cache entry correctly, causing objlist_cache_remove() * fail to delete it, then we need this function to do the cleanup work * in all nodes. */ int objlist_cache_cleanup(uint32_t vid) { struct objlist_deletion_work *ow; ow = xzalloc(sizeof(*ow)); ow->vid = vid; ow->work.fn = objlist_deletion_work; ow->work.done = objlist_deletion_done; queue_work(sys->deletion_wqueue, &ow->work); return SD_RES_SUCCESS; } sheepdog-0.8.3/sheep/ops.c000066400000000000000000001010151237656255000154020ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" #include "trace/trace.h" enum sd_op_type { SD_OP_TYPE_CLUSTER = 1, /* cluster operations */ SD_OP_TYPE_LOCAL, /* local operations */ SD_OP_TYPE_PEER, /* io operations */ SD_OP_TYPE_GATEWAY, /* gateway operations */ }; struct sd_op_template { const char *name; enum sd_op_type type; /* process request even when cluster is not working */ bool force; /* * Indicates administrative operation to trace. * If true is set, rx_main and tx_main log operations at info level. */ bool is_admin_op; /* * process_work() will be called in a worker thread, and process_main() * will be called in the main thread. * * If type is SD_OP_TYPE_CLUSTER, it is guaranteed that only one node * processes a cluster operation at the same time. We can use this for * for example to implement distributed locking. process_work() * will be called on the local node, and process_main() will be called * on every node. * * If type is SD_OP_TYPE_LOCAL, both process_work() and process_main() * will be called on the local node. * * If type is SD_OP_TYPE_PEER, only process_work() will be called, and it * will be called on the local node. */ int (*process_work)(struct request *req); int (*process_main)(const struct sd_req *req, struct sd_rsp *rsp, void *data); }; /* * The last gathered epoch is the epoch at which all the nodes complete the * recovery and purge the stale objects. 
*/ uint32_t last_gathered_epoch = 1; static int stat_sheep(uint64_t *store_size, uint64_t *store_free, uint32_t epoch) { uint64_t used; if (sys->gateway_only) { *store_size = 0; *store_free = 0; } else { *store_size = md_get_size(&used); *store_free = *store_size - used; } return SD_RES_SUCCESS; } static int cluster_new_vdi(struct request *req) { const struct sd_req *hdr = &req->rq; struct sd_rsp *rsp = &req->rp; uint32_t vid; int ret; struct timeval tv; gettimeofday(&tv, NULL); struct vdi_iocb iocb = { .name = req->data, .data_len = hdr->data_length, .size = hdr->vdi.vdi_size, .base_vid = hdr->vdi.base_vdi_id, .create_snapshot = !!hdr->vdi.snapid, .copy_policy = hdr->vdi.copy_policy, .store_policy = hdr->vdi.store_policy, .nr_copies = hdr->vdi.copies, .time = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000, }; /* Client doesn't specify redundancy scheme (copy = 0) */ if (!hdr->vdi.copies) { iocb.nr_copies = sys->cinfo.nr_copies; iocb.copy_policy = sys->cinfo.copy_policy; } if (iocb.copy_policy) iocb.nr_copies = ec_policy_to_dp(iocb.copy_policy, NULL, NULL); if (hdr->data_length != SD_MAX_VDI_LEN) return SD_RES_INVALID_PARMS; if (iocb.create_snapshot) ret = vdi_snapshot(&iocb, &vid); else ret = vdi_create(&iocb, &vid); rsp->vdi.vdi_id = vid; rsp->vdi.copies = iocb.nr_copies; return ret; } static int post_cluster_new_vdi(const struct sd_req *req, struct sd_rsp *rsp, void *data) { unsigned long nr = rsp->vdi.vdi_id; int ret = rsp->result; sd_debug("done %d %lx", ret, nr); if (ret == SD_RES_SUCCESS) atomic_set_bit(nr, sys->vdi_inuse); return ret; } static int vdi_init_tag(const char **tag, const char *buf, uint32_t len) { if (len == SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN) *tag = buf + SD_MAX_VDI_LEN; else if (len == SD_MAX_VDI_LEN) *tag = NULL; else return -1; return 0; } static int cluster_del_vdi(struct request *req) { const struct sd_req *hdr = &req->rq; uint32_t data_len = hdr->data_length; struct vdi_iocb iocb = { .name = req->data, .data_len = data_len, .snapid = hdr->vdi.snapid, }; if (vdi_init_tag(&iocb.tag, req->data, data_len) < 0) return SD_RES_INVALID_PARMS; return vdi_delete(&iocb, req); } struct cache_deletion_work { uint32_t vid; struct work work; }; static void cache_delete_work(struct work *work) { struct cache_deletion_work *dw = container_of(work, struct cache_deletion_work, work); object_cache_delete(dw->vid); } static void cache_delete_done(struct work *work) { struct cache_deletion_work *dw = container_of(work, struct cache_deletion_work, work); free(dw); } static int post_cluster_del_vdi(const struct sd_req *req, struct sd_rsp *rsp, void *data) { unsigned long vid = rsp->vdi.vdi_id; struct cache_deletion_work *dw; int ret = rsp->result; if (!sys->enable_object_cache) return ret; dw = xzalloc(sizeof(*dw)); dw->vid = vid; dw->work.fn = cache_delete_work; dw->work.done = cache_delete_done; queue_work(sys->deletion_wqueue, &dw->work); return ret; } /* * Look up vid and copy number from vdi name * * This must be a cluster operation. If QEMU reads the vdi object * while sheep snapshots the vdi, sheep can return SD_RES_NO_VDI. To * avoid this problem, SD_OP_GET_INFO must be ordered with * SD_OP_NEW_VDI. 
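A note on the payload layout that vdi_init_tag() above implies: the VDI name occupies the first SD_MAX_VDI_LEN bytes, and an optional tag follows in the next SD_MAX_VDI_TAG_LEN bytes; data_length tells the two cases apart. A hypothetical client-side builder (this helper does not exist in the tree):

// Build the name-only or name-plus-tag payload vdi_init_tag() expects.
// data_length must be SD_MAX_VDI_LEN or SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN.
static uint32_t fill_vdi_payload(char *buf, const char *name, const char *tag)
{
	memset(buf, 0, SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN);
	strncpy(buf, name, SD_MAX_VDI_LEN - 1);
	if (!tag)
		return SD_MAX_VDI_LEN;       // name-only lookup
	strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN - 1);
	return SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;  // name + tag lookup
}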
*/ static int cluster_get_vdi_info(struct request *req) { const struct sd_req *hdr = &req->rq; struct sd_rsp *rsp = &req->rp; uint32_t data_len = hdr->data_length; int ret; struct vdi_info info = {}; struct vdi_iocb iocb = { .name = req->data, .data_len = data_len, .snapid = hdr->vdi.snapid, }; if (vdi_init_tag(&iocb.tag, req->data, data_len) < 0) return SD_RES_INVALID_PARMS; ret = vdi_lookup(&iocb, &info); if (ret != SD_RES_SUCCESS) return ret; rsp->vdi.vdi_id = info.vid; rsp->vdi.copies = get_vdi_copy_number(info.vid); return ret; } static int remove_epoch(uint32_t epoch) { int ret; char path[PATH_MAX]; sd_debug("remove epoch %"PRIu32, epoch); snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch); ret = unlink(path); if (ret && errno != ENOENT) { sd_err("failed to remove %s: %m", path); return SD_RES_EIO; } return SD_RES_SUCCESS; } static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp, void *data) { int i, ret; uint32_t latest_epoch; struct store_driver *driver; char *store_name = data; driver = find_store_driver(data); if (!driver) return SD_RES_NO_STORE; pstrcpy((char *)sys->cinfo.store, sizeof(sys->cinfo.store), store_name); sd_store = driver; latest_epoch = get_latest_epoch(); ret = sd_store->format(); if (ret != SD_RES_SUCCESS) return ret; ret = sd_store->init(); if (ret != SD_RES_SUCCESS) return ret; sys->cinfo.nr_copies = req->cluster.copies; sys->cinfo.copy_policy = req->cluster.copy_policy; sys->cinfo.flags = req->cluster.flags; if (!sys->cinfo.nr_copies) sys->cinfo.nr_copies = SD_DEFAULT_COPIES; sys->cinfo.ctime = req->cluster.ctime; set_cluster_config(&sys->cinfo); for (i = 1; i <= latest_epoch; i++) remove_epoch(i); memset(sys->vdi_inuse, 0, sizeof(sys->vdi_inuse)); clean_vdi_state(); sys->cinfo.epoch = 0; ret = inc_and_log_epoch(); if (ret) return SD_RES_EIO; sys->cinfo.status = SD_STATUS_OK; return SD_RES_SUCCESS; } static int cluster_shutdown(const struct sd_req *req, struct sd_rsp *rsp, void *data) { sys->cinfo.status = SD_STATUS_SHUTDOWN; if (!node_in_recovery()) { unregister_listening_fds(); if (set_cluster_shutdown(true) != SD_RES_SUCCESS) /* * It's okay we failed to set 'shutdown', just start * recovery after restart blindly. 
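Looking back at cluster_make_fs() above: the fields it consumes map one-to-one onto the format request a client sends. A hedged sketch of filling such a request (the real dog command computes ctime and copy_policy elsewhere; the values below are illustrative only):

// Illustrative client side of SD_OP_MAKE_FS, using only the fields
// cluster_make_fs() reads. Not the actual dog implementation.
static void fill_make_fs_req(struct sd_req *hdr, uint64_t ctime)
{
	sd_init_req(hdr, SD_OP_MAKE_FS);
	hdr->flags = SD_FLAG_CMD_WRITE;
	hdr->cluster.copies = 3;        // replicate each object three times
	hdr->cluster.copy_policy = 0;   // 0 = replication; non-zero = erasure
	hdr->cluster.flags = 0;
	hdr->cluster.ctime = ctime;
	hdr->data_length = strlen("plain") + 1;  // payload: store driver name
}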
*/ sd_err("failed to set cluster as shutdown"); } return SD_RES_SUCCESS; } static int cluster_enable_recover(const struct sd_req *req, struct sd_rsp *rsp, void *data) { sys->cinfo.disable_recovery = false; resume_suspended_recovery(); return SD_RES_SUCCESS; } static int cluster_disable_recover(const struct sd_req *req, struct sd_rsp *rsp, void *data) { sys->cinfo.disable_recovery = true; return SD_RES_SUCCESS; } static int cluster_get_vdi_attr(struct request *req) { const struct sd_req *hdr = &req->rq; struct sd_rsp *rsp = &req->rp; uint32_t vid, attrid = 0; struct sheepdog_vdi_attr *vattr; struct vdi_iocb iocb = {}; struct vdi_info info = {}; int ret; vattr = req->data; iocb.name = vattr->name; iocb.tag = vattr->tag; iocb.snapid = hdr->vdi.snapid; ret = vdi_lookup(&iocb, &info); if (ret != SD_RES_SUCCESS) return ret; /* * the current VDI id can change if we take a snapshot, * so we use the hash value of the VDI name as the VDI id */ vid = sd_hash_vdi(vattr->name); ret = get_vdi_attr(req->data, hdr->data_length, vid, &attrid, info.create_time, !!(hdr->flags & SD_FLAG_CMD_CREAT), !!(hdr->flags & SD_FLAG_CMD_EXCL), !!(hdr->flags & SD_FLAG_CMD_DEL)); rsp->vdi.vdi_id = vid; rsp->vdi.attr_id = attrid; rsp->vdi.copies = get_vdi_copy_number(vid); return ret; } static int local_release_vdi(struct request *req) { uint32_t vid = req->rq.vdi.base_vdi_id; int ret; if (!sys->enable_object_cache) return SD_RES_SUCCESS; if (!vid) { sd_info("Some VDI failed to release the object cache. " "Probably you are running old QEMU."); return SD_RES_SUCCESS; } ret = object_cache_flush_vdi(vid); if (ret == SD_RES_SUCCESS) object_cache_delete(vid); return ret; } static int local_get_store_list(struct request *req) { struct strbuf buf = STRBUF_INIT; struct store_driver *driver; list_for_each_entry(driver, &store_drivers, list) { strbuf_addf(&buf, "%s ", driver->name); } req->rp.data_length = strbuf_copyout(&buf, req->data, req->data_length); strbuf_release(&buf); return SD_RES_SUCCESS; } static int local_read_vdis(const struct sd_req *req, struct sd_rsp *rsp, void *data) { return read_vdis(data, req->data_length, &rsp->data_length); } static int local_get_vdi_copies(const struct sd_req *req, struct sd_rsp *rsp, void *data) { rsp->data_length = fill_vdi_state_list(data); return SD_RES_SUCCESS; } static int local_stat_sheep(struct request *req) { struct sd_rsp *rsp = &req->rp; uint32_t epoch = req->rq.epoch; return stat_sheep(&rsp->node.store_size, &rsp->node.store_free, epoch); } static int local_stat_recovery(const struct sd_req *req, struct sd_rsp *rsp, void *data) { get_recovery_state(data); rsp->data_length = sizeof(struct recovery_state); return SD_RES_SUCCESS; } static int local_stat_cluster(struct request *req) { struct sd_rsp *rsp = &req->rp; struct epoch_log *elog; int i, max_elogs; uint32_t epoch; if (req->vinfo == NULL) { sd_debug("cluster is not started up"); goto out; } max_elogs = req->rq.data_length / sizeof(*elog); epoch = get_latest_epoch(); for (i = 0; i < max_elogs; i++) { int nr_nodes; if (epoch <= 0) break; elog = (struct epoch_log *)req->data + i; memset(elog, 0, sizeof(*elog)); /* some filed only need to store in first elog */ if (i == 0) { elog->ctime = sys->cinfo.ctime; elog->disable_recovery = sys->cinfo.disable_recovery; elog->nr_copies = sys->cinfo.nr_copies; elog->copy_policy = sys->cinfo.copy_policy; strncpy(elog->drv_name, (char *)sys->cinfo.store, STORE_LEN); } elog->epoch = epoch; nr_nodes = epoch_log_read_with_timestamp(epoch, elog->nodes, sizeof(elog->nodes), (time_t *)&elog->time); 
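/*
 * The local copy of this epoch log may already have been purged (for
 * example, this sheep joined after that epoch was retired), in which
 * case the read above fails and we fetch the log from a remote node.
 */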
if (nr_nodes == -1) nr_nodes = epoch_log_read_remote(epoch, elog->nodes, sizeof(elog->nodes), (time_t *)&elog->time, req->vinfo); assert(nr_nodes >= 0); assert(nr_nodes <= SD_MAX_NODES); elog->nr_nodes = nr_nodes; rsp->data_length += sizeof(*elog); epoch--; } out: switch (sys->cinfo.status) { case SD_STATUS_OK: return SD_RES_SUCCESS; case SD_STATUS_WAIT: if (sys->cinfo.ctime == 0) return SD_RES_WAIT_FOR_FORMAT; else return SD_RES_WAIT_FOR_JOIN; case SD_STATUS_SHUTDOWN: return SD_RES_SHUTDOWN; default: return SD_RES_SYSTEM_ERROR; } } static int local_get_obj_list(struct request *req) { return get_obj_list(&req->rq, &req->rp, req->data); } static int local_get_epoch(struct request *req) { uint32_t epoch = req->rq.obj.tgt_epoch; int nr_nodes, nodes_len; time_t timestamp; sd_debug("%d", epoch); nr_nodes = epoch_log_read_with_timestamp(epoch, req->data, req->rq.data_length - sizeof(timestamp), ×tamp); if (nr_nodes == -1) return SD_RES_NO_TAG; nodes_len = nr_nodes * sizeof(struct sd_node); memcpy((void *)((char *)req->data + nodes_len), ×tamp, sizeof(timestamp)); req->rp.data_length = nodes_len + sizeof(time_t); return SD_RES_SUCCESS; } static int cluster_force_recover_work(struct request *req) { struct vnode_info *old_vnode_info; uint32_t epoch = sys_epoch(); /* * We should manually recover the cluster when * 1) the master is physically down (different epoch condition). * 2) some nodes are physically down (same epoch condition). * In both case, the nodes(s) stat is WAIT_FOR_JOIN. */ if (sys->cinfo.status != SD_STATUS_WAIT || req->vinfo == NULL) return SD_RES_FORCE_RECOVER; old_vnode_info = get_vnode_info_epoch(epoch, req->vinfo); if (!old_vnode_info) { sd_emerg("cannot get vnode info for epoch %d", epoch); put_vnode_info(old_vnode_info); return SD_RES_FORCE_RECOVER; } if (req->rq.data_length < sizeof(struct sd_node) * old_vnode_info->nr_nodes) { sd_err("too small buffer size, %d", req->rq.data_length); return SD_RES_INVALID_PARMS; } req->rp.epoch = epoch; req->rp.data_length = sizeof(struct sd_node) * old_vnode_info->nr_nodes; nodes_to_buffer(&old_vnode_info->nroot, req->data); put_vnode_info(old_vnode_info); return SD_RES_SUCCESS; } static int cluster_force_recover_main(const struct sd_req *req, struct sd_rsp *rsp, void *data) { struct vnode_info *old_vnode_info, *vnode_info; int ret = SD_RES_SUCCESS; struct sd_node *nodes = data; size_t nr_nodes = rsp->data_length / sizeof(*nodes); struct rb_root nroot = RB_ROOT; if (rsp->epoch != sys->cinfo.epoch) { sd_err("epoch was incremented while cluster_force_recover"); return SD_RES_FORCE_RECOVER; } ret = inc_and_log_epoch(); if (ret) { sd_emerg("cannot update epoch log"); goto err; } if (!is_cluster_formatted()) /* initialize config file */ set_cluster_config(&sys->cinfo); sys->cinfo.status = SD_STATUS_OK; for (int i = 0; i < nr_nodes; i++) rb_insert(&nroot, &nodes[i], rb, node_cmp); vnode_info = get_vnode_info(); old_vnode_info = alloc_vnode_info(&nroot); start_recovery(vnode_info, old_vnode_info, true); put_vnode_info(vnode_info); put_vnode_info(old_vnode_info); return ret; err: panic("failed in force recovery"); } static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp, void *data) { int ret; if (node_in_recovery()) return SD_RES_NODE_IN_RECOVERY; if (sys->gateway_only) return SD_RES_SUCCESS; if (sd_store->cleanup) ret = sd_store->cleanup(); else ret = SD_RES_NO_SUPPORT; return ret; } static int cluster_notify_vdi_add(const struct sd_req *req, struct sd_rsp *rsp, void *data) { if (req->vdi_state.old_vid) /* make the previous 
working vdi a snapshot */ add_vdi_state(req->vdi_state.old_vid, get_vdi_copy_number(req->vdi_state.old_vid), true, req->vdi_state.copy_policy); if (req->vdi_state.set_bitmap) atomic_set_bit(req->vdi_state.new_vid, sys->vdi_inuse); add_vdi_state(req->vdi_state.new_vid, req->vdi_state.copies, false, req->vdi_state.copy_policy); return SD_RES_SUCCESS; } static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp, void *data) { uint32_t vid = *(uint32_t *)data; return objlist_cache_cleanup(vid); } static int cluster_delete_cache(const struct sd_req *req, struct sd_rsp *rsp, void *data) { uint32_t vid = oid_to_vid(req->obj.oid); if (sys->enable_object_cache) object_cache_delete(vid); return SD_RES_SUCCESS; } static int cluster_recovery_completion(const struct sd_req *req, struct sd_rsp *rsp, void *data) { static struct sd_node recovereds[SD_MAX_NODES], *node; static size_t nr_recovereds; static int latest_epoch; struct vnode_info *vnode_info; int i; uint32_t epoch = req->obj.tgt_epoch; node = (struct sd_node *)data; if (latest_epoch > epoch) return SD_RES_SUCCESS; if (latest_epoch < epoch) { sd_debug("new epoch %d", epoch); latest_epoch = epoch; nr_recovereds = 0; } recovereds[nr_recovereds++] = *node; xqsort(recovereds, nr_recovereds, node_cmp); sd_debug("%s is recovered at epoch %d", node_to_str(node), epoch); for (i = 0; i < nr_recovereds; i++) sd_debug("[%x] %s", i, node_to_str(recovereds + i)); if (sys->cinfo.epoch != latest_epoch) return SD_RES_SUCCESS; vnode_info = get_vnode_info(); if (vnode_info->nr_nodes == nr_recovereds) { for (i = 0; i < nr_recovereds; ++i) { if (!rb_search(&vnode_info->nroot, &recovereds[i], rb, node_cmp)) break; } if (i == nr_recovereds) { sd_notice("all nodes are recovered, epoch %d", epoch); last_gathered_epoch = epoch; /* sd_store can be NULL if this node is a gateway */ if (vnode_info->nr_zones >= ec_max_data_strip && sd_store && sd_store->cleanup) sd_store->cleanup(); } } put_vnode_info(vnode_info); return SD_RES_SUCCESS; } static bool node_size_varied(void) { uint64_t new, used, old = sys->this_node.space; double diff; if (sys->gateway_only) return false; new = md_get_size(&used); /* If !old, it is forced-out-gateway. Not supported by current node */ if (!old) { if (new) return true; else return false; } diff = new > old ? (double)(new - old) : (double)(old - new); sd_debug("new %"PRIu64 ", old %"PRIu64", ratio %f", new, old, diff / (double)old); if (diff / (double)old < 0.01) return false; sys->this_node.space = new; set_node_space(new); return true; } static int cluster_reweight(const struct sd_req *req, struct sd_rsp *rsp, void *data) { if (node_size_varied()) return sys->cdrv->update_node(&sys->this_node); return SD_RES_SUCCESS; } static int local_md_info(struct request *request) { struct sd_rsp *rsp = &request->rp; assert(request->rq.data_length == sizeof(struct sd_md_info)); rsp->data_length = md_get_info((struct sd_md_info *)request->data); return rsp->data_length ? 
SD_RES_SUCCESS : SD_RES_UNKNOWN; } static int local_md_plug(const struct sd_req *req, struct sd_rsp *rsp, void *data) { char *disks = (char *)data; return md_plug_disks(disks); } static int local_md_unplug(const struct sd_req *req, struct sd_rsp *rsp, void *data) { char *disks = (char *)data; return md_unplug_disks(disks); } static int local_get_hash(struct request *request) { struct sd_req *req = &request->rq; struct sd_rsp *rsp = &request->rp; if (!sd_store->get_hash) return SD_RES_NO_SUPPORT; return sd_store->get_hash(req->obj.oid, req->obj.tgt_epoch, rsp->hash.digest); } static int local_get_cache_info(struct request *request) { struct sd_rsp *rsp = &request->rp; assert(request->rq.data_length == sizeof(struct object_cache_info)); rsp->data_length = object_cache_get_info((struct object_cache_info *) request->data); return SD_RES_SUCCESS; } static int local_cache_purge(struct request *req) { const struct sd_req *hdr = &req->rq; uint32_t vid = oid_to_vid(req->rq.obj.oid); if (hdr->flags == SD_FLAG_CMD_WRITE) { object_cache_delete(vid); goto out; } object_cache_format(); out: return SD_RES_SUCCESS; } static int local_sd_stat(const struct sd_req *req, struct sd_rsp *rsp, void *data) { memcpy(data, &sys->stat, sizeof(struct sd_stat)); rsp->data_length = sizeof(struct sd_stat); return SD_RES_SUCCESS; } /* Return SD_RES_INVALID_PARMS to ask client not to send flush req again */ static int local_flush_vdi(struct request *req) { int ret = SD_RES_INVALID_PARMS; if (sys->enable_object_cache) { uint32_t vid = oid_to_vid(req->rq.obj.oid); ret = object_cache_flush_vdi(vid); } return ret; } static int local_discard_obj(struct request *req) { uint64_t oid = req->rq.obj.oid; uint32_t vid = oid_to_vid(oid), tmp_vid; int ret = SD_RES_SUCCESS, idx = data_oid_to_idx(oid); struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); sd_debug("%"PRIx64, oid); ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode, sizeof(struct sd_inode), 0); if (ret != SD_RES_SUCCESS) goto out; tmp_vid = INODE_GET_VID(inode, idx); /* if vid in idx is not exist, we don't need to remove it */ if (tmp_vid) { INODE_SET_VID(inode, idx, 0); ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx, vid, 0, 0, false, false); if (ret != SD_RES_SUCCESS) goto out; if (sd_remove_object(oid) != SD_RES_SUCCESS) sd_err("failed to remove %"PRIx64, oid); } /* * Return success even if sd_remove_object fails because we have updated * inode successfully. 
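As an aside, the oid arithmetic this function leans on is worth spelling out once: a data object id resolves to its owning VDI and to a slot index inside that VDI's inode object. A small sketch using only helpers already used in this file (the function itself is illustrative):

// How local_discard_obj() navigates from a data object to its inode slot.
static void describe_discard(uint64_t data_oid)
{
	uint32_t vid = oid_to_vid(data_oid);       // owning VDI
	int idx = data_oid_to_idx(data_oid);       // slot index in the inode
	uint64_t inode_oid = vid_to_vdi_oid(vid);  // inode object to update

	sd_debug("clear slot %d of inode %"PRIx64" for data obj %"PRIx64,
		 idx, inode_oid, data_oid);
}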
*/ out: free(inode); return ret; } static int local_flush_and_del(struct request *req) { if (!sys->enable_object_cache) return SD_RES_SUCCESS; return object_cache_flush_and_del(req); } static int local_trace_enable(const struct sd_req *req, struct sd_rsp *rsp, void *data) { return trace_enable(data); } static int local_trace_disable(const struct sd_req *req, struct sd_rsp *rsp, void *data) { return trace_disable(data); } static int local_trace_status(const struct sd_req *req, struct sd_rsp *rsp, void *data) { rsp->data_length = trace_status(data); return SD_RES_SUCCESS; } static int local_trace_read_buf(struct request *request) { struct sd_req *req = &request->rq; struct sd_rsp *rsp = &request->rp; int ret; ret = trace_buffer_pop(request->data, req->data_length); if (ret == -1) return SD_RES_AGAIN; rsp->data_length = ret; sd_debug("%u", rsp->data_length); return SD_RES_SUCCESS; } static int local_kill_node(const struct sd_req *req, struct sd_rsp *rsp, void *data) { sys->cinfo.status = SD_STATUS_KILLED; unregister_listening_fds(); return SD_RES_SUCCESS; } static int peer_remove_obj(struct request *req) { uint64_t oid = req->rq.obj.oid; uint8_t ec_index = req->rq.obj.ec_index; objlist_cache_remove(oid); return sd_store->remove_object(oid, ec_index); } int peer_read_obj(struct request *req) { struct sd_req *hdr = &req->rq; struct sd_rsp *rsp = &req->rp; int ret; uint32_t epoch = hdr->epoch; struct siocb iocb; if (sys->gateway_only) return SD_RES_NO_OBJ; memset(&iocb, 0, sizeof(iocb)); iocb.epoch = epoch; iocb.buf = req->data; iocb.length = hdr->data_length; iocb.offset = hdr->obj.offset; iocb.ec_index = hdr->obj.ec_index; iocb.copy_policy = hdr->obj.copy_policy; ret = sd_store->read(hdr->obj.oid, &iocb); if (ret != SD_RES_SUCCESS) goto out; rsp->data_length = hdr->data_length; out: return ret; } static int peer_write_obj(struct request *req) { struct sd_req *hdr = &req->rq; struct siocb iocb = { }; uint64_t oid = hdr->obj.oid; iocb.epoch = hdr->epoch; iocb.buf = req->data; iocb.length = hdr->data_length; iocb.offset = hdr->obj.offset; iocb.ec_index = hdr->obj.ec_index; iocb.copy_policy = hdr->obj.copy_policy; return sd_store->write(oid, &iocb); } static int peer_create_and_write_obj(struct request *req) { struct sd_req *hdr = &req->rq; struct siocb iocb = { }; iocb.epoch = hdr->epoch; iocb.buf = req->data; iocb.length = hdr->data_length; iocb.ec_index = hdr->obj.ec_index; iocb.copy_policy = hdr->obj.copy_policy; iocb.offset = hdr->obj.offset; return sd_store->create_and_write(hdr->obj.oid, &iocb); } static int local_get_loglevel(struct request *req) { int32_t current_level; current_level = get_loglevel(); memcpy(req->data, ¤t_level, sizeof(current_level)); req->rp.data_length = sizeof(current_level); sd_info("returning log level: %u", current_level); return SD_RES_SUCCESS; } static int local_set_loglevel(struct request *req) { int32_t new_level = 0; memcpy(&new_level, req->data, sizeof(int32_t)); if (!(LOG_EMERG <= new_level && new_level <= LOG_DEBUG)) { sd_err("invalid log level: %d", new_level); return SD_RES_INVALID_PARMS; } set_loglevel(new_level); return SD_RES_SUCCESS; } static struct sd_op_template sd_ops[] = { /* cluster operations */ [SD_OP_NEW_VDI] = { .name = "NEW_VDI", .type = SD_OP_TYPE_CLUSTER, .is_admin_op = true, .process_work = cluster_new_vdi, .process_main = post_cluster_new_vdi, }, [SD_OP_DEL_VDI] = { .name = "DEL_VDI", .type = SD_OP_TYPE_CLUSTER, .is_admin_op = true, .process_work = cluster_del_vdi, .process_main = post_cluster_del_vdi, }, [SD_OP_MAKE_FS] = { .name = 
"MAKE_FS", .type = SD_OP_TYPE_CLUSTER, .force = true, .is_admin_op = true, .process_main = cluster_make_fs, }, [SD_OP_SHUTDOWN] = { .name = "SHUTDOWN", .type = SD_OP_TYPE_CLUSTER, .force = true, .is_admin_op = true, .process_main = cluster_shutdown, }, [SD_OP_GET_VDI_ATTR] = { .name = "GET_VDI_ATTR", .type = SD_OP_TYPE_CLUSTER, .process_work = cluster_get_vdi_attr, }, [SD_OP_FORCE_RECOVER] = { .name = "FORCE_RECOVER", .type = SD_OP_TYPE_CLUSTER, .force = true, .is_admin_op = true, .process_work = cluster_force_recover_work, .process_main = cluster_force_recover_main, }, [SD_OP_CLEANUP] = { .name = "CLEANUP", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_main = cluster_cleanup, }, [SD_OP_NOTIFY_VDI_DEL] = { .name = "NOTIFY_VDI_DEL", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_main = cluster_notify_vdi_del, }, [SD_OP_NOTIFY_VDI_ADD] = { .name = "NOTIFY_VDI_ADD", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_main = cluster_notify_vdi_add, }, [SD_OP_DELETE_CACHE] = { .name = "DELETE_CACHE", .type = SD_OP_TYPE_CLUSTER, .process_main = cluster_delete_cache, }, [SD_OP_COMPLETE_RECOVERY] = { .name = "COMPLETE_RECOVERY", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_main = cluster_recovery_completion, }, [SD_OP_GET_VDI_INFO] = { .name = "GET_VDI_INFO", .type = SD_OP_TYPE_CLUSTER, .process_work = cluster_get_vdi_info, }, [SD_OP_LOCK_VDI] = { .name = "LOCK_VDI", .type = SD_OP_TYPE_CLUSTER, .process_work = cluster_get_vdi_info, }, [SD_OP_REWEIGHT] = { .name = "REWEIGHT", .type = SD_OP_TYPE_CLUSTER, .is_admin_op = true, .process_main = cluster_reweight, }, [SD_OP_ENABLE_RECOVER] = { .name = "ENABLE_RECOVER", .type = SD_OP_TYPE_CLUSTER, .is_admin_op = true, .process_main = cluster_enable_recover, }, [SD_OP_DISABLE_RECOVER] = { .name = "DISABLE_RECOVER", .type = SD_OP_TYPE_CLUSTER, .is_admin_op = true, .process_main = cluster_disable_recover, }, /* local operations */ [SD_OP_RELEASE_VDI] = { .name = "RELEASE_VDI", .type = SD_OP_TYPE_LOCAL, .process_work = local_release_vdi, }, [SD_OP_GET_STORE_LIST] = { .name = "GET_STORE_LIST", .type = SD_OP_TYPE_LOCAL, .force = true, .process_work = local_get_store_list, }, [SD_OP_READ_VDIS] = { .name = "READ_VDIS", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_read_vdis, }, [SD_OP_GET_VDI_COPIES] = { .name = "GET_VDI_COPIES", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_get_vdi_copies, }, [SD_OP_GET_NODE_LIST] = { .name = "GET_NODE_LIST", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_get_node_list, }, [SD_OP_STAT_SHEEP] = { .name = "STAT_SHEEP", .type = SD_OP_TYPE_LOCAL, .process_work = local_stat_sheep, }, [SD_OP_STAT_RECOVERY] = { .name = "STAT_RECOVERY", .type = SD_OP_TYPE_LOCAL, .process_main = local_stat_recovery, }, [SD_OP_STAT_CLUSTER] = { .name = "STAT_CLUSTER", .type = SD_OP_TYPE_LOCAL, .force = true, .process_work = local_stat_cluster, }, [SD_OP_GET_OBJ_LIST] = { .name = "GET_OBJ_LIST", .type = SD_OP_TYPE_LOCAL, .process_work = local_get_obj_list, }, [SD_OP_GET_EPOCH] = { .name = "GET_EPOCH", .type = SD_OP_TYPE_LOCAL, .process_work = local_get_epoch, }, [SD_OP_FLUSH_VDI] = { .name = "FLUSH_VDI", .type = SD_OP_TYPE_LOCAL, .process_work = local_flush_vdi, }, [SD_OP_DISCARD_OBJ] = { .name = "DISCARD_OBJ", .type = SD_OP_TYPE_LOCAL, .process_work = local_discard_obj, }, [SD_OP_FLUSH_DEL_CACHE] = { .name = "DEL_CACHE", .type = SD_OP_TYPE_LOCAL, .process_work = local_flush_and_del, }, [SD_OP_TRACE_ENABLE] = { .name = "TRACE_ENABLE", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = 
local_trace_enable, }, [SD_OP_TRACE_DISABLE] = { .name = "TRACE_DISABLE", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_trace_disable, }, [SD_OP_TRACE_STATUS] = { .name = "TRACE_STATUS", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_trace_status, }, [SD_OP_TRACE_READ_BUF] = { .name = "TRACE_READ_BUF", .type = SD_OP_TYPE_LOCAL, .force = true, .process_work = local_trace_read_buf, }, [SD_OP_KILL_NODE] = { .name = "KILL_NODE", .type = SD_OP_TYPE_LOCAL, .force = true, .is_admin_op = true, .process_main = local_kill_node, }, [SD_OP_MD_INFO] = { .name = "MD_INFO", .type = SD_OP_TYPE_LOCAL, .process_work = local_md_info, }, [SD_OP_MD_PLUG] = { .name = "MD_PLUG_DISKS", .type = SD_OP_TYPE_LOCAL, .is_admin_op = true, .process_main = local_md_plug, }, [SD_OP_MD_UNPLUG] = { .name = "MD_UNPLUG_DISKS", .type = SD_OP_TYPE_LOCAL, .is_admin_op = true, .process_main = local_md_unplug, }, [SD_OP_GET_HASH] = { .name = "GET_HASH", .type = SD_OP_TYPE_LOCAL, .process_work = local_get_hash, }, [SD_OP_GET_CACHE_INFO] = { .name = "GET_CACHE_INFO", .type = SD_OP_TYPE_LOCAL, .process_work = local_get_cache_info, }, [SD_OP_CACHE_PURGE] = { .name = "CACHE_PURGE", .type = SD_OP_TYPE_LOCAL, .process_work = local_cache_purge, }, [SD_OP_STAT] = { .name = "STAT", .type = SD_OP_TYPE_LOCAL, .process_main = local_sd_stat, }, [SD_OP_GET_LOGLEVEL] = { .name = "GET_LOGLEVEL", .type = SD_OP_TYPE_LOCAL, .force = true, .process_work = local_get_loglevel, }, [SD_OP_SET_LOGLEVEL] = { .name = "SET_LOGLEVEL", .type = SD_OP_TYPE_LOCAL, .force = true, .process_work = local_set_loglevel, }, /* gateway I/O operations */ [SD_OP_CREATE_AND_WRITE_OBJ] = { .name = "CREATE_AND_WRITE_OBJ", .type = SD_OP_TYPE_GATEWAY, .process_work = gateway_create_and_write_obj, }, [SD_OP_READ_OBJ] = { .name = "READ_OBJ", .type = SD_OP_TYPE_GATEWAY, .process_work = gateway_read_obj, }, [SD_OP_WRITE_OBJ] = { .name = "WRITE_OBJ", .type = SD_OP_TYPE_GATEWAY, .process_work = gateway_write_obj, }, [SD_OP_REMOVE_OBJ] = { .name = "REMOVE_OBJ", .type = SD_OP_TYPE_GATEWAY, .process_work = gateway_remove_obj, }, /* peer I/O operations */ [SD_OP_CREATE_AND_WRITE_PEER] = { .name = "CREATE_AND_WRITE_PEER", .type = SD_OP_TYPE_PEER, .process_work = peer_create_and_write_obj, }, [SD_OP_READ_PEER] = { .name = "READ_PEER", .type = SD_OP_TYPE_PEER, .process_work = peer_read_obj, }, [SD_OP_WRITE_PEER] = { .name = "WRITE_PEER", .type = SD_OP_TYPE_PEER, .process_work = peer_write_obj, }, [SD_OP_REMOVE_PEER] = { .name = "REMOVE_PEER", .type = SD_OP_TYPE_PEER, .process_work = peer_remove_obj, }, }; const struct sd_op_template *get_sd_op(uint8_t opcode) { if (sd_ops[opcode].type == 0) return NULL; return sd_ops + opcode; } const char *op_name(const struct sd_op_template *op) { if (op == NULL) return "(invalid opcode)"; return op->name; } bool is_cluster_op(const struct sd_op_template *op) { return op != NULL && op->type == SD_OP_TYPE_CLUSTER; } bool is_local_op(const struct sd_op_template *op) { return op != NULL && op->type == SD_OP_TYPE_LOCAL; } bool is_peer_op(const struct sd_op_template *op) { return op != NULL && op->type == SD_OP_TYPE_PEER; } bool is_gateway_op(const struct sd_op_template *op) { return op != NULL && op->type == SD_OP_TYPE_GATEWAY; } bool is_force_op(const struct sd_op_template *op) { return op != NULL && op->force; } bool is_logging_op(const struct sd_op_template *op) { return op != NULL && op->is_admin_op; } bool has_process_work(const struct sd_op_template *op) { return op != NULL && !!op->process_work; } bool 
has_process_main(const struct sd_op_template *op) { return op != NULL && !!op->process_main; } void do_process_work(struct work *work) { struct request *req = container_of(work, struct request, work); int ret = SD_RES_SUCCESS; sd_debug("%x, %" PRIx64", %"PRIu32, req->rq.opcode, req->rq.obj.oid, req->rq.epoch); if (req->op->process_work) ret = req->op->process_work(req); if (ret != SD_RES_SUCCESS) { sd_debug("failed: %x, %" PRIx64" , %u, %s", req->rq.opcode, req->rq.obj.oid, req->rq.epoch, sd_strerror(ret)); } req->rp.result = ret; } int do_process_main(const struct sd_op_template *op, const struct sd_req *req, struct sd_rsp *rsp, void *data) { return op->process_main(req, rsp, data); } static int map_table[] = { [SD_OP_CREATE_AND_WRITE_OBJ] = SD_OP_CREATE_AND_WRITE_PEER, [SD_OP_READ_OBJ] = SD_OP_READ_PEER, [SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER, [SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER, }; int gateway_to_peer_opcode(int opcode) { assert(opcode < ARRAY_SIZE(map_table)); return map_table[opcode]; } sheepdog-0.8.3/sheep/plain_store.c000066400000000000000000000376301237656255000171330ustar00rootroot00000000000000/* * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "sheep_priv.h" #define sector_algined(x) ({ ((x) & (SECTOR_SIZE - 1)) == 0; }) static inline bool iocb_is_aligned(const struct siocb *iocb) { return sector_algined(iocb->offset) && sector_algined(iocb->length); } static int prepare_iocb(uint64_t oid, const struct siocb *iocb, bool create) { int flags = O_DSYNC | O_RDWR; if (uatomic_is_true(&sys->use_journal) || sys->nosync == true) flags &= ~O_DSYNC; if (sys->backend_dio && iocb_is_aligned(iocb)) { if (!is_aligned_to_pagesize(iocb->buf)) panic("Memory isn't aligned to pagesize %p", iocb->buf); flags |= O_DIRECT; } if (create) flags |= O_CREAT | O_EXCL; return flags; } int get_store_path(uint64_t oid, uint8_t ec_index, char *path) { if (is_erasure_oid(oid)) { if (unlikely(ec_index >= SD_MAX_COPIES)) panic("invalid ec_index %d", ec_index); return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d", md_get_object_dir(oid), oid, ec_index); } return snprintf(path, PATH_MAX, "%s/%016" PRIx64, md_get_object_dir(oid), oid); } static int get_store_tmp_path(uint64_t oid, uint8_t ec_index, char *path) { if (is_erasure_oid(oid)) { if (unlikely(ec_index >= SD_MAX_COPIES)) panic("invalid ec_index %d", ec_index); return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d.tmp", md_get_object_dir(oid), oid, ec_index); } return snprintf(path, PATH_MAX, "%s/%016" PRIx64".tmp", md_get_object_dir(oid), oid); } static int get_store_stale_path(uint64_t oid, uint32_t epoch, uint8_t ec_index, char *path) { return md_get_stale_path(oid, epoch, ec_index, path); } /* * Check if oid is in this nodes (if oid is in the wrong place, it will be moved * to the correct one after this call in a MD setup. */ bool default_exist(uint64_t oid, uint8_t ec_index) { return md_exist(oid, ec_index); } static int err_to_sderr(const char *path, uint64_t oid, int err) { struct stat s; char p[PATH_MAX], *dir; /* Use a temporary buffer since dirname() may modify its argument. 
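This copy is not optional: POSIX allows dirname() to modify the buffer passed to it, so handing it the caller's path directly could corrupt the path for later use. A tiny illustration (the path is made up):

// Why the scratch copy below matters: dirname() may edit its argument.
char buf[] = "/store/obj/000000fd00000001";
char *dir = dirname(buf);   // buf may now read "/store/obj"
// Any later use of buf as the full object path would be wrong, hence
// the pstrcpy() into a private buffer before calling dirname().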
*/ pstrcpy(p, sizeof(p), path); dir = dirname(p); sd_debug("%s", path); switch (err) { case ENOENT: if (stat(dir, &s) < 0) { sd_err("%s corrupted", dir); return md_handle_eio(dir); } sd_debug("object %016" PRIx64 " not found locally", oid); return SD_RES_NO_OBJ; case ENOSPC: /* TODO: stop automatic recovery */ sd_err("diskfull, oid=%"PRIx64, oid); return SD_RES_NO_SPACE; case EMFILE: case ENFILE: case EINTR: case EAGAIN: case EEXIST: sd_err("%m, oid=%"PRIx64, oid); /* make gateway try again */ return SD_RES_NETWORK_ERROR; default: sd_err("oid=%"PRIx64", %m", oid); return md_handle_eio(dir); } } int default_write(uint64_t oid, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } if (uatomic_is_true(&sys->use_journal) && unlikely(journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, false)) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } get_store_path(oid, iocb->ec_index, path); /* * Make sure oid is in the right place because oid might be misplaced * in a wrong place, due to 'shutdown/restart with less/more disks' or * any bugs. We need call err_to_sderr() to return EIO if disk is broken */ if (!default_exist(oid, iocb->ec_index)) return err_to_sderr(path, oid, ENOENT); fd = open(path, flags, sd_def_fmode); if (unlikely(fd < 0)) return err_to_sderr(path, oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (unlikely(size != iocb->length)) { sd_err("failed to write object %"PRIx64", path=%s, offset=%" PRId32", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); goto out; } out: close(fd); return ret; } static int make_stale_dir(const char *path) { char p[PATH_MAX]; snprintf(p, PATH_MAX, "%s/.stale", path); if (xmkdir(p, sd_def_dmode) < 0) { sd_err("%s failed, %m", p); return SD_RES_EIO; } return SD_RES_SUCCESS; } static int purge_dir(const char *path) { if (purge_directory(path) < 0) return SD_RES_EIO; return SD_RES_SUCCESS; } static int purge_stale_dir(const char *path) { char p[PATH_MAX]; snprintf(p, PATH_MAX, "%s/.stale", path); return purge_dir(p); } int default_cleanup(void) { int ret; ret = for_each_obj_path(purge_stale_dir); if (ret != SD_RES_SUCCESS) return ret; return SD_RES_SUCCESS; } static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch) { int ret; struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE); struct siocb iocb = { .epoch = epoch, .buf = inode, .length = SD_INODE_HEADER_SIZE, }; ret = default_read(oid, &iocb); if (ret != SD_RES_SUCCESS) { sd_err("failed to read inode header %" PRIx64 " %" PRId32 "wat %s", oid, epoch, wd); goto out; } add_vdi_state(oid_to_vid(oid), inode->nr_copies, vdi_is_snapshot(inode), inode->copy_policy); atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse); ret = SD_RES_SUCCESS; out: free(inode); return ret; } static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd, uint32_t epoch, uint8_t ec_index, struct vnode_info *vinfo, void *arg) { int ret; objlist_cache_insert(oid); if (is_vdi_obj(oid)) { sd_debug("found the VDI object %" PRIx64" epoch %"PRIu32 " at %s", oid, epoch, wd); ret = init_vdi_state(oid, wd, epoch); if (ret != SD_RES_SUCCESS) return ret; } return SD_RES_SUCCESS; } int default_init(void) { int ret; sd_debug("use plain store driver"); ret = 
for_each_obj_path(make_stale_dir); if (ret != SD_RES_SUCCESS) return ret; for_each_object_in_stale(init_objlist_and_vdi_bitmap, NULL); return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL); } static int default_read_from_path(uint64_t oid, const char *path, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; ssize_t size; /* * Make sure oid is in the right place, because the object might be * misplaced due to 'shutdown/restart with less disks' or any bugs. * We need to call err_to_sderr() to return EIO if the disk is broken. * * For a stale path, get_store_stale_path already does the * default_exist job. */ if (!is_stale_path(path) && !default_exist(oid, iocb->ec_index)) return err_to_sderr(path, oid, ENOENT); fd = open(path, flags); if (fd < 0) return err_to_sderr(path, oid, errno); size = xpread(fd, iocb->buf, iocb->length, iocb->offset); if (unlikely(size != iocb->length)) { sd_err("failed to read object %"PRIx64", path=%s, offset=%" PRId32", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); } close(fd); return ret; } int default_read(uint64_t oid, const struct siocb *iocb) { int ret; char path[PATH_MAX]; get_store_path(oid, iocb->ec_index, path); ret = default_read_from_path(oid, path, iocb); /* * If the request is against an older epoch, try to read from * the stale directory */ if (ret == SD_RES_NO_OBJ && iocb->epoch > 0 && iocb->epoch < sys_epoch()) { get_store_stale_path(oid, iocb->epoch, iocb->ec_index, path); ret = default_read_from_path(oid, path, iocb); } return ret; } /* Preallocate the whole object to get a better filesystem layout. */ int prealloc(int fd, uint32_t size) { int ret = xfallocate(fd, 0, 0, size); if (ret < 0) { if (errno != ENOSYS && errno != EOPNOTSUPP) { sd_err("failed to preallocate space, %m"); return ret; } return xftruncate(fd, size); } return 0; } size_t get_store_objsize(uint64_t oid) { if (is_erasure_oid(oid)) { uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid)); int d; ec_policy_to_dp(policy, &d, NULL); return SD_DATA_OBJ_SIZE / d; } return get_objsize(oid); } int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = prepare_iocb(oid, iocb, true); int ret, fd; uint32_t len = iocb->length; size_t obj_size; sd_debug("%"PRIx64, oid); get_store_path(oid, iocb->ec_index, path); get_store_tmp_path(oid, iocb->ec_index, tmp_path); if (uatomic_is_true(&sys->use_journal) && journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, true) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while the gateway retries a CREATE request, * the recovery process could also recover the object at * the same time. They should try to write the same data, * so it is okay to simply return success here. */ sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } obj_size = get_store_objsize(oid); ret = prealloc(fd, obj_size); if (ret < 0) { ret = err_to_sderr(path, oid, errno); goto out; } ret = xpwrite(fd, iocb->buf, len, iocb->offset); if (ret != len) { sd_err("failed to write object.
%m"); ret = err_to_sderr(path, oid, errno); goto out; } ret = rename(tmp_path, path); if (ret < 0) { sd_err("failed to rename %s to %s: %m", tmp_path, path); ret = err_to_sderr(path, oid, errno); goto out; } ret = SD_RES_SUCCESS; objlist_cache_insert(oid); out: if (ret != SD_RES_SUCCESS) unlink(tmp_path); close(fd); return ret; } int default_link(uint64_t oid, uint32_t tgt_epoch) { char path[PATH_MAX], stale_path[PATH_MAX]; sd_debug("try link %"PRIx64" from snapshot with epoch %d", oid, tgt_epoch); snprintf(path, PATH_MAX, "%s/%016"PRIx64, md_get_object_dir(oid), oid); get_store_stale_path(oid, tgt_epoch, 0, stale_path); if (link(stale_path, path) < 0) { /* * Recovery thread and main thread might try to recover the * same object and we might get EEXIST in such case. */ if (errno == EEXIST) goto out; sd_debug("failed to link from %s to %s, %m", stale_path, path); return err_to_sderr(path, oid, errno); } out: return SD_RES_SUCCESS; } /* * For replicated object, if any of the replica belongs to this node, we * consider it not stale. * * For erasured object, since every copy is unique and if it migrates to other * node(index gets changed even it has some other copy belongs to it) because * of hash ring changes, we consider it stale. */ static bool oid_stale(uint64_t oid, int ec_index, struct vnode_info *vinfo) { uint32_t i, nr_copies; const struct sd_vnode *v; bool ret = true; const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; nr_copies = get_obj_copy_number(oid, vinfo->nr_zones); oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[i]; if (vnode_is_local(v)) { if (ec_index < SD_MAX_COPIES) { if (i == ec_index) ret = false; } else { ret = false; } break; } } return ret; } static int move_object_to_stale_dir(uint64_t oid, const char *wd, uint32_t epoch, uint8_t ec_index, struct vnode_info *vinfo, void *arg) { char path[PATH_MAX], stale_path[PATH_MAX]; uint32_t tgt_epoch = *(uint32_t *)arg; /* ec_index from md.c is reliable so we can directly use it */ if (ec_index < SD_MAX_COPIES) { snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d", md_get_object_dir(oid), oid, ec_index); snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64"_%d.%"PRIu32, md_get_object_dir(oid), oid, ec_index, tgt_epoch); } else { snprintf(path, PATH_MAX, "%s/%016" PRIx64, md_get_object_dir(oid), oid); snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32, md_get_object_dir(oid), oid, tgt_epoch); } if (unlikely(rename(path, stale_path)) < 0) { sd_err("failed to move stale object %" PRIX64 " to %s, %m", oid, path); return SD_RES_EIO; } sd_debug("moved object %"PRIx64, oid); return SD_RES_SUCCESS; } static int check_stale_objects(uint64_t oid, const char *wd, uint32_t epoch, uint8_t ec_index, struct vnode_info *vinfo, void *arg) { if (oid_stale(oid, ec_index, vinfo)) return move_object_to_stale_dir(oid, wd, 0, ec_index, NULL, arg); return SD_RES_SUCCESS; } int default_update_epoch(uint32_t epoch) { assert(epoch); return for_each_object_in_wd(check_stale_objects, false, &epoch); } int default_format(void) { unsigned ret; sd_debug("try get a clean store"); ret = for_each_obj_path(purge_dir); if (ret != SD_RES_SUCCESS) return ret; if (sys->enable_object_cache) object_cache_format(); return SD_RES_SUCCESS; } int default_remove_object(uint64_t oid, uint8_t ec_index) { char path[PATH_MAX]; if (uatomic_is_true(&sys->use_journal)) journal_remove_object(oid); get_store_path(oid, ec_index, path); if (unlink(path) < 0) { if (errno == ENOENT) return SD_RES_NO_OBJ; sd_err("failed, %s, 
%m", path); return SD_RES_EIO; } return SD_RES_SUCCESS; } #define SHA1NAME "user.obj.sha1" static int get_object_sha1(const char *path, uint8_t *sha1) { if (getxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE) != SHA1_DIGEST_SIZE) { if (errno == ENODATA) sd_debug("sha1 is not cached yet, %s", path); else sd_err("fail to get xattr, %s", path); return -1; } return 0; } static int set_object_sha1(const char *path, const uint8_t *sha1) { int ret; ret = setxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE, 0); if (ret < 0) sd_err("fail to set sha1, %s", path); return ret; } static int get_object_path(uint64_t oid, uint32_t epoch, char *path, size_t size) { if (default_exist(oid, 0)) { snprintf(path, PATH_MAX, "%s/%016"PRIx64, md_get_object_dir(oid), oid); } else { get_store_stale_path(oid, epoch, 0, path); if (access(path, F_OK) < 0) { if (errno == ENOENT) return SD_RES_NO_OBJ; return SD_RES_EIO; } } return SD_RES_SUCCESS; } int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1) { int ret; void *buf; struct siocb iocb = {}; uint32_t length; bool is_readonly_obj = oid_is_readonly(oid); char path[PATH_MAX]; ret = get_object_path(oid, epoch, path, sizeof(path)); if (ret != SD_RES_SUCCESS) return ret; if (is_readonly_obj) { if (get_object_sha1(path, sha1) == 0) { sd_debug("use cached sha1 digest %s", sha1_to_hex(sha1)); return SD_RES_SUCCESS; } } length = get_store_objsize(oid); buf = valloc(length); if (buf == NULL) return SD_RES_NO_MEM; iocb.epoch = epoch; iocb.buf = buf; iocb.length = length; ret = default_read_from_path(oid, path, &iocb); if (ret != SD_RES_SUCCESS) { free(buf); return ret; } get_buffer_sha1(buf, length, sha1); free(buf); sd_debug("the message digest of %"PRIx64" at epoch %d is %s", oid, epoch, sha1_to_hex(sha1)); if (is_readonly_obj) set_object_sha1(path, sha1); return ret; } int default_purge_obj(void) { uint32_t tgt_epoch = get_latest_epoch(); return for_each_object_in_wd(move_object_to_stale_dir, true, &tgt_epoch); } static struct store_driver plain_store = { .name = "plain", .init = default_init, .exist = default_exist, .create_and_write = default_create_and_write, .write = default_write, .read = default_read, .link = default_link, .update_epoch = default_update_epoch, .cleanup = default_cleanup, .format = default_format, .remove_object = default_remove_object, .get_hash = default_get_hash, .purge_obj = default_purge_obj, }; add_store_driver(plain_store); sheepdog-0.8.3/sheep/recovery.c000066400000000000000000000666561237656255000164640ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * Copyright (C) 2012-2013 Taobao Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "sheep_priv.h" /* base structure for the recovery thread */ struct recovery_work { uint32_t epoch; uint32_t tgt_epoch; struct vnode_info *old_vinfo; struct vnode_info *cur_vinfo; struct work work; }; /* for preparing lists */ struct recovery_list_work { struct recovery_work base; uint64_t count; uint64_t *oids; }; /* for recovering objects */ struct recovery_obj_work { struct recovery_work base; uint64_t oid; /* the object to be recovered */ bool stop; /* local replica in the stale directory */ uint32_t local_epoch; uint8_t local_sha1[SHA1_DIGEST_SIZE]; }; /* * recovery information * * We cannot access the members of this structure outside of the main thread. */ struct recovery_info { enum rw_state state; uint32_t epoch; uint32_t tgt_epoch; uint64_t done; /* * true when automatic recovery is disabled * and no recovery work is running */ bool suspended; bool notify_complete; uint64_t count; uint64_t *oids; uint64_t *prio_oids; uint64_t nr_prio_oids; uint64_t nr_scheduled_prio_oids; struct vnode_info *old_vinfo; struct vnode_info *cur_vinfo; }; static struct recovery_info *next_rinfo; static main_thread(struct recovery_info *) current_rinfo; static void queue_recovery_work(struct recovery_info *rinfo); /* Dynamically grown list buffer; defaults to 4M (covers 2T of storage) */ #define DEFAULT_LIST_BUFFER_SIZE (UINT64_C(1) << 22) static size_t list_buffer_size = DEFAULT_LIST_BUFFER_SIZE; static int obj_cmp(const uint64_t *oid1, const uint64_t *oid2) { const uint64_t hval1 = sd_hash_oid(*oid1); const uint64_t hval2 = sd_hash_oid(*oid2); return intcmp(hval1, hval2); } static inline bool node_is_gateway_only(void) { return sys->this_node.nr_vnodes == 0; } static struct vnode_info *rollback_vnode_info(uint32_t *epoch, struct vnode_info *cur) { struct vnode_info *vinfo; rollback: *epoch -= 1; if (*epoch < last_gathered_epoch) return NULL; vinfo = get_vnode_info_epoch(*epoch, cur); if (!vinfo) { /* We roll back in case we don't get a valid epoch */ sd_alert("cannot get epoch %d", *epoch); sd_alert("clients may see old data"); goto rollback; } return vinfo; } /* * A node that does not match any node in the current node list has * left the cluster, so it is an invalid node.
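For orientation before diving into the helpers below, rinfo->state drives a three-stage pipeline; the summary below is distilled from this file (stage names are the real rw_state values, transition points are approximate):

// RW_PREPARE_LIST       collect the oids this node must hold
//        |              (object list preparation / finish_object_list)
//        v
// RW_RECOVER_OBJ        recover oids[done..count-1] one at a time,
//        |              promoting prio_oids for client-accessed objects
//        v
// RW_NOTIFY_COMPLETION  broadcast SD_OP_COMPLETE_RECOVERY so all nodes
//                       can advance last_gathered_epoch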
*/ static bool invalid_node(const struct sd_node *n, struct vnode_info *info) { if (rb_search(&info->nroot, n, rb, node_cmp)) return false; return true; } static int search_erasure_object(uint64_t oid, uint8_t idx, struct rb_root *nroot, struct recovery_work *rw, uint32_t tgt_epoch, void *buf) { struct sd_req hdr; unsigned rlen = get_store_objsize(oid); struct sd_node *n; uint32_t epoch = rw->epoch; rb_for_each_entry(n, nroot, rb) { if (invalid_node(n, rw->cur_vinfo)) continue; sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; hdr.data_length = rlen; hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; hdr.obj.ec_index = idx; sd_debug("%"PRIx64" epoch %"PRIu32" tgt %"PRIu32" idx %d, %s", oid, epoch, tgt_epoch, idx, node_to_str(n)); if (sheep_exec_req(&n->nid, &hdr, buf) == SD_RES_SUCCESS) return SD_RES_SUCCESS; } return SD_RES_NO_OBJ; } static void *read_erasure_object(uint64_t oid, uint8_t idx, struct recovery_obj_work *row) { struct sd_req hdr; unsigned rlen = get_store_objsize(oid); void *buf = xvalloc(rlen); struct recovery_work *rw = &row->base; struct vnode_info *old = grab_vnode_info(rw->old_vinfo), *new_old; uint32_t epoch = rw->epoch, tgt_epoch = rw->tgt_epoch; const struct sd_node *node; uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid)); int edp = ec_policy_to_dp(policy, NULL, NULL); int ret; again: if (unlikely(old->nr_zones < edp)) { if (search_erasure_object(oid, idx, &old->nroot, rw, tgt_epoch, buf) == SD_RES_SUCCESS) goto done; else goto rollback; } node = oid_to_node(oid, &old->vroot, idx); sd_debug("%"PRIx64" epoch %"PRIu32" tgt %"PRIu32" idx %d, %s", oid, epoch, tgt_epoch, idx, node_to_str(node)); if (invalid_node(node, rw->cur_vinfo)) goto rollback; sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; hdr.data_length = rlen; hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; hdr.obj.ec_index = idx; ret = sheep_exec_req(&node->nid, &hdr, buf); switch (ret) { case SD_RES_SUCCESS: goto done; case SD_RES_OLD_NODE_VER: free(buf); buf = NULL; row->stop = true; break; default: rollback: new_old = rollback_vnode_info(&tgt_epoch, rw->cur_vinfo); if (!new_old) { sd_err("can not read %"PRIx64" idx %d", oid, idx); free(buf); buf = NULL; goto done; } put_vnode_info(old); old = new_old; goto again; } done: put_vnode_info(old); return buf; } /* * Read object from targeted node and store it in the local node. * * tgt_epoch: the specific epoch that the object has stayed * idx: erasure index. For non-erasure object, pass 0. 
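The remote-read pattern this function and its erasure counterpart share is compact enough to distill. The helper below is a sketch, not a function in the tree; it only restates how recovery reads address a replica at a historical epoch:

// Every recovery read is a READ_PEER aimed at a historical epoch.
static int sketch_read_peer(const struct sd_node *node, uint64_t oid,
			    uint32_t epoch, uint32_t tgt_epoch,
			    void *buf, uint32_t len)
{
	struct sd_req hdr;

	sd_init_req(&hdr, SD_OP_READ_PEER);
	hdr.epoch = epoch;              // epoch the request is issued in
	hdr.obj.tgt_epoch = tgt_epoch;  // epoch the replica belonged to
	hdr.flags = SD_FLAG_CMD_RECOVERY;
	hdr.data_length = len;
	hdr.obj.oid = oid;
	return sheep_exec_req(&node->nid, &hdr, buf);
}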
*/ static int recover_object_from(struct recovery_obj_work *row, const struct sd_node *node, uint32_t tgt_epoch) { uint64_t oid = row->oid; uint32_t local_epoch = row->local_epoch; uint8_t *sha1 = row->local_sha1; uint32_t epoch = row->base.epoch; int ret; unsigned rlen; void *buf = NULL; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct siocb iocb = { 0 }; if (node_is_local(node)) { if (tgt_epoch < sys_epoch()) return sd_store->link(oid, tgt_epoch); return SD_RES_NO_OBJ; } /* compare sha1 hash value first */ if (local_epoch > 0) { sd_init_req(&hdr, SD_OP_GET_HASH); hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; ret = sheep_exec_req(&node->nid, &hdr, NULL); if (ret != SD_RES_SUCCESS) return ret; if (memcmp(rsp->hash.digest, sha1, SHA1_DIGEST_SIZE) == 0) { sd_debug("use local replica at epoch %d", local_epoch); ret = sd_store->link(oid, local_epoch); if (ret == SD_RES_SUCCESS) return ret; } } rlen = get_store_objsize(oid); buf = xvalloc(rlen); /* recover from remote replica */ sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; hdr.data_length = rlen; hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; ret = sheep_exec_req(&node->nid, &hdr, buf); if (ret == SD_RES_SUCCESS) { iocb.epoch = epoch; iocb.length = rsp->data_length; iocb.offset = rsp->obj.offset; iocb.buf = buf; ret = sd_store->create_and_write(oid, &iocb); } free(buf); return ret; } static int recover_object_from_replica(struct recovery_obj_work *row, struct vnode_info *old, uint32_t tgt_epoch) { uint64_t oid = row->oid; uint32_t epoch = row->base.epoch; int nr_copies, ret = SD_RES_SUCCESS, start = 0; bool fully_replicated = true; nr_copies = get_obj_copy_number(oid, old->nr_zones); /* find the local node first to try to recover from local */ for (int i = 0; i < nr_copies; i++) { const struct sd_vnode *vnode; vnode = oid_to_vnode(oid, &old->vroot, i); if (vnode_is_local(vnode)) { start = i; break; } } /* Let's do a breadth-first search */ for (int i = 0; i < nr_copies; i++) { const struct sd_node *node; int idx = (i + start) % nr_copies; node = oid_to_node(oid, &old->vroot, idx); if (invalid_node(node, row->base.cur_vinfo)) continue; ret = recover_object_from(row, node, tgt_epoch); switch (ret) { case SD_RES_SUCCESS: sd_debug("recovered oid %"PRIx64" from %d to epoch %d", oid, tgt_epoch, epoch); return ret; case SD_RES_OLD_NODE_VER: /* move to the next epoch recovery */ return ret; case SD_RES_NO_OBJ: fully_replicated = false; /* fall through */ default: break; } } /* * sheep would return a stale object when * - all the nodes hold the copies, and * - all the nodes are gone * at the same epoch */ if (fully_replicated && ret != SD_RES_SUCCESS) ret = SD_RES_STALE_OBJ; return ret; } /* * Recover the object from its track in epoch history. That is, * the routine will try to recover it from the nodes it has stayed on, * at least *theoretically*, on the consistent hash ring.
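One idiom in recover_object_from_replica() above deserves a call-out: the scan starts at the local replica's index and rotates through (i + start) % nr_copies, which prefers the cheap local link() while still visiting every replica exactly once. Distilled:

// Rotated replica scan, distilled from recover_object_from_replica().
static void visit_replicas(int nr_copies, int start)
{
	for (int i = 0; i < nr_copies; i++) {
		int idx = (i + start) % nr_copies;
		// try replica `idx`; stop early on SD_RES_SUCCESS or
		// SD_RES_OLD_NODE_VER, otherwise keep scanning
		sd_debug("would try replica %d", idx);
	}
}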
*/ static int recover_replication_object(struct recovery_obj_work *row) { struct recovery_work *rw = &row->base; struct vnode_info *old; uint64_t oid = row->oid; uint32_t tgt_epoch = rw->tgt_epoch; int ret; struct vnode_info *new_old; old = grab_vnode_info(rw->old_vinfo); again: sd_debug("try recover object %"PRIx64" from epoch %"PRIu32, oid, tgt_epoch); ret = recover_object_from_replica(row, old, tgt_epoch); switch (ret) { case SD_RES_SUCCESS: /* Succeed */ break; case SD_RES_OLD_NODE_VER: row->stop = true; break; case SD_RES_STALE_OBJ: sd_alert("cannot access any replicas of %"PRIx64" at epoch %d", oid, tgt_epoch); sd_alert("clients may see old data"); /* fall through */ default: /* No luck, roll back to an older configuration and try again */ new_old = rollback_vnode_info(&tgt_epoch, rw->cur_vinfo); if (!new_old) { sd_err("can not recover oid %"PRIx64, oid); ret = -1; goto out; } put_vnode_info(old); old = new_old; goto again; } out: put_vnode_info(old); return ret; } static void *rebuild_erasure_object(uint64_t oid, uint8_t idx, struct recovery_obj_work *row) { int len = get_store_objsize(oid); char *lost = xvalloc(len); int i, j; uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid)); int ed = 0, edp; edp = ec_policy_to_dp(policy, &ed, NULL); struct fec *ctx = ec_init(ed, edp); uint8_t *bufs[ed]; int idxs[ed]; for (i = 0; i < ed; i++) bufs[i] = NULL; for (i = 0; i < ed; i++) idxs[i] = 0; /* Prepare replica */ for (i = 0, j = 0; i < edp && j < ed; i++) { if (i == idx) continue; bufs[j] = read_erasure_object(oid, i, row); if (row->stop) break; if (!bufs[j]) continue; idxs[j++] = i; } if (j != ed) { free(lost); lost = NULL; goto out; } /* Rebuild the lost replica */ ec_decode_buffer(ctx, bufs, idxs, lost, idx); out: ec_destroy(ctx); for (i = 0; i < ed; i++) free(bufs[i]); return lost; } static uint8_t local_node_copy_index(struct vnode_info *vinfo, uint64_t oid) { int idx; if (!is_erasure_oid(oid)) return 0; /* no need to proceed */ for (idx = 0; idx < vinfo->nr_zones; idx++) { const struct sd_node *n = oid_to_node(oid, &vinfo->vroot, idx); if (node_is_local(n)) return idx; } panic("can't get valid index for %"PRIx64, oid); } /* * Erasure object recovery algorithm * * 1. read the lost object from its track in epoch history vertically because * every copy that holds partial data of the object is unique * 2. if not found in 1, then tries to rebuild it with RS algorithm * 2.1 read enough other copies from their tracks in epoch history * 2.2 rebuild the lost object from the content of copies read at 2.1 * * The subtle case is number for available zones is less than total copy number * or the requested index of lost object: * 1 we need to make sure nr_zones >= total_copy_nr to avoid panic of * oid_to_node(s) helpers. * 2 we have to search all the available zones when we can't get idx. 
Its * okay to do a mad search when number of available zones is small */ static int recover_erasure_object(struct recovery_obj_work *row) { struct recovery_work *rw = &row->base; struct vnode_info *cur = rw->cur_vinfo; uint64_t oid = row->oid; struct siocb iocb = { 0 }; void *buf = NULL; uint8_t idx; int ret = -1; idx = local_node_copy_index(cur, oid); buf = read_erasure_object(oid, idx, row); if (!buf && !row->stop) buf = rebuild_erasure_object(oid, idx, row); if (!buf) { if (!row->stop) sd_err("failed to recover %"PRIx64" idx %d", oid, idx); goto out; } iocb.epoch = rw->epoch; iocb.length = get_store_objsize(oid); iocb.offset = 0; iocb.buf = buf; iocb.ec_index = idx; ret = sd_store->create_and_write(oid, &iocb); free(buf); out: return ret; } static int do_recover_object(struct recovery_obj_work *row) { uint64_t oid = row->oid; if (is_erasure_oid(oid)) return recover_erasure_object(row); else return recover_replication_object(row); } static void recover_object_work(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); struct recovery_obj_work *row = container_of(rw, struct recovery_obj_work, base); uint64_t oid = row->oid; struct vnode_info *cur = rw->cur_vinfo; int ret, epoch; if (sd_store->exist(oid, local_node_copy_index(cur, oid))) { sd_debug("the object is already recovered"); return; } /* find object in the stale directory */ if (!is_erasure_oid(oid)) for (epoch = sys_epoch() - 1; epoch >= last_gathered_epoch; epoch--) { ret = sd_store->get_hash(oid, epoch, row->local_sha1); if (ret == SD_RES_SUCCESS) { sd_debug("replica found in local at epoch %d", epoch); row->local_epoch = epoch; break; } } ret = do_recover_object(row); if (ret != 0) sd_err("failed to recover object %"PRIx64, oid); } bool node_in_recovery(void) { return main_thread_get(current_rinfo) != NULL; } static inline void prepare_schedule_oid(uint64_t oid) { struct recovery_info *rinfo = main_thread_get(current_rinfo); if (xlfind(&oid, rinfo->prio_oids, rinfo->nr_prio_oids, oid_cmp)) { sd_debug("%" PRIx64 " has been already in prio_oids", oid); return; } rinfo->nr_prio_oids++; rinfo->prio_oids = xrealloc(rinfo->prio_oids, rinfo->nr_prio_oids * sizeof(uint64_t)); rinfo->prio_oids[rinfo->nr_prio_oids - 1] = oid; sd_debug("%"PRIx64" nr_prio_oids %"PRIu64, oid, rinfo->nr_prio_oids); resume_suspended_recovery(); } main_fn bool oid_in_recovery(uint64_t oid) { struct recovery_info *rinfo = main_thread_get(current_rinfo); struct vnode_info *cur; if (!node_in_recovery()) return false; cur = rinfo->cur_vinfo; if (sd_store->exist(oid, local_node_copy_index(cur, oid))) { sd_debug("the object %" PRIx64 " is already recoverd", oid); return false; } if (uatomic_read(&next_rinfo)) /* * The current recovery_info will be taken over by the next one * soon, so no need to call prepare_schedule_oid() now. */ return true; switch (rinfo->state) { case RW_PREPARE_LIST: /* oid is not recovered yet */ break; case RW_RECOVER_OBJ: if (xlfind(&oid, rinfo->oids, rinfo->done, oid_cmp)) { sd_debug("%" PRIx64 " has been already recovered", oid); return false; } if (rinfo->oids[rinfo->done] == oid) { if (rinfo->suspended) break; /* * When recovery is not suspended, * rinfo->oids[rinfo->done] is currently being recovered * and no need to call prepare_schedule_oid(). */ return true; } /* * Check if oid is in the list that to be recovered later * * FIXME: do we need more efficient yet complex data structure? 
*/ if (xlfind(&oid, rinfo->oids + rinfo->done + 1, rinfo->count - (rinfo->done + 1), oid_cmp)) break; /* * An object newly created after prepare_object_list() might not be * in the list */ sd_debug("%"PRIx64" is not in the recovery list", oid); return false; case RW_NOTIFY_COMPLETION: sd_debug("the object %" PRIx64 " is already recovered", oid); return false; } prepare_schedule_oid(oid); return true; } static void free_recovery_work(struct recovery_work *rw) { put_vnode_info(rw->cur_vinfo); put_vnode_info(rw->old_vinfo); free(rw); } static void free_recovery_list_work(struct recovery_list_work *rlw) { put_vnode_info(rlw->base.cur_vinfo); put_vnode_info(rlw->base.old_vinfo); free(rlw->oids); free(rlw); } static void free_recovery_obj_work(struct recovery_obj_work *row) { put_vnode_info(row->base.cur_vinfo); put_vnode_info(row->base.old_vinfo); free(row); } static void free_recovery_info(struct recovery_info *rinfo) { put_vnode_info(rinfo->cur_vinfo); put_vnode_info(rinfo->old_vinfo); free(rinfo->oids); free(rinfo->prio_oids); free(rinfo); } /* Return true if the next recovery work is queued. */ static inline bool run_next_rw(void) { struct recovery_info *nrinfo = uatomic_xchg_ptr(&next_rinfo, NULL); struct recovery_info *cur = main_thread_get(current_rinfo); if (nrinfo == NULL) return false; /* * When md recovery supersedes the reweight or node recovery, we need to * notify completion. */ if (!nrinfo->notify_complete && cur->notify_complete) nrinfo->notify_complete = true; free_recovery_info(cur); if (!node_is_gateway_only()) sd_store->update_epoch(nrinfo->tgt_epoch); main_thread_set(current_rinfo, nrinfo); wakeup_all_requests(); queue_recovery_work(nrinfo); sd_debug("recovery work is superseded"); return true; } static void notify_recovery_completion_work(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); struct sd_req hdr; int ret; sd_init_req(&hdr, SD_OP_COMPLETE_RECOVERY); hdr.obj.tgt_epoch = rw->epoch; hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = sizeof(sys->this_node); ret = exec_local_req(&hdr, &sys->this_node); if (ret != SD_RES_SUCCESS) sd_err("failed to notify recovery completion, %d", rw->epoch); } static void notify_recovery_completion_main(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); free_recovery_work(rw); } static inline void finish_recovery(struct recovery_info *rinfo) { uint32_t recovered_epoch = rinfo->epoch; main_thread_set(current_rinfo, NULL); wakeup_all_requests(); if (rinfo->notify_complete) { rinfo->state = RW_NOTIFY_COMPLETION; queue_recovery_work(rinfo); } free_recovery_info(rinfo); sd_debug("recovery complete: new epoch %"PRIu32, recovered_epoch); } static inline bool oid_in_prio_oids(struct recovery_info *rinfo, uint64_t oid) { for (uint64_t i = 0; i < rinfo->nr_prio_oids; i++) if (rinfo->prio_oids[i] == oid) return true; return false; } /* * Schedule prio_oids to be recovered first, in FIFO order * * rw->done is the index of the original next object to be recovered and also * the number of objects already recovered.
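* (For example - an editor's illustration of the merge below - with done = 2, oids = [a b c d] and prio_oids = [d], the merged list becomes [a b d c].)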
* we just move rw->prio_oids in between: * new_oids = [0..rw->done - 1] + [rw->prio_oids] + [rw->done] */ static inline void finish_schedule_oids(struct recovery_info *rinfo) { uint64_t i, nr_recovered = rinfo->done, new_idx; uint64_t *new_oids; /* If I am the last oid, done */ if (nr_recovered == rinfo->count - 1) goto done; new_oids = xmalloc(list_buffer_size); memcpy(new_oids, rinfo->oids, nr_recovered * sizeof(uint64_t)); memcpy(new_oids + nr_recovered, rinfo->prio_oids, rinfo->nr_prio_oids * sizeof(uint64_t)); new_idx = nr_recovered + rinfo->nr_prio_oids; for (i = rinfo->done; i < rinfo->count; i++) { if (oid_in_prio_oids(rinfo, rinfo->oids[i])) continue; new_oids[new_idx++] = rinfo->oids[i]; } /* rw->count should equal new_idx, otherwise something is wrong */ sd_debug("%snr_recovered %" PRIu64 ", nr_prio_oids %" PRIu64 ", count %" PRIu64 " = new %" PRIu64, rinfo->count == new_idx ? "" : "WARN: ", nr_recovered, rinfo->nr_prio_oids, rinfo->count, new_idx); free(rinfo->oids); rinfo->oids = new_oids; done: free(rinfo->prio_oids); rinfo->prio_oids = NULL; rinfo->nr_scheduled_prio_oids += rinfo->nr_prio_oids; rinfo->nr_prio_oids = 0; } /* * When automatic object recovery is disabled, the behavior of the * recovery process is like 'lazy recovery'. This function returns * true if the recovery queue contains objects being accessed by * clients. Sheep recovers such objects for availability even when * automatic object recovery is not enabled. */ static bool has_scheduled_objects(struct recovery_info *rinfo) { return rinfo->done < rinfo->nr_scheduled_prio_oids; } static void recover_next_object(struct recovery_info *rinfo) { if (run_next_rw()) return; if (rinfo->nr_prio_oids) finish_schedule_oids(rinfo); if (sys->cinfo.disable_recovery && !has_scheduled_objects(rinfo)) { sd_debug("suspended"); rinfo->suspended = true; /* suspend until resume_suspended_recovery() is called */ return; } /* Try to recover the next object */ queue_recovery_work(rinfo); } void resume_suspended_recovery(void) { struct recovery_info *rinfo = main_thread_get(current_rinfo); if (rinfo && rinfo->suspended) { rinfo->suspended = false; recover_next_object(rinfo); } } static void recover_object_main(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); struct recovery_obj_work *row = container_of(rw, struct recovery_obj_work, base); struct recovery_info *rinfo = main_thread_get(current_rinfo); if (run_next_rw()) goto out; if (row->stop) { /* * Stop this recovery process and wait for the epoch to be * lifted, and flush the wait queue to requeue those * requests */ rinfo->notify_complete = false; finish_recovery(rinfo); sd_debug("recovery is stopped"); goto out; } wakeup_requests_on_oid(row->oid); rinfo->done++; if (!(rinfo->done % DIV_ROUND_UP(rinfo->count, 100))) sd_info("object recovery progress %3.0lf%% ", (double)rinfo->done / rinfo->count * 100); sd_debug("object %"PRIx64" is recovered (%"PRIu64"/%"PRIu64")", row->oid, rinfo->done, rinfo->count); if (rinfo->done < rinfo->count) { recover_next_object(rinfo); goto out; } finish_recovery(rinfo); out: free_recovery_obj_work(row); } static void finish_object_list(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); struct recovery_list_work *rlw = container_of(rw, struct recovery_list_work, base); struct recovery_info *rinfo = main_thread_get(current_rinfo); rinfo->state = RW_RECOVER_OBJ; rinfo->count = rlw->count; rinfo->oids = rlw->oids; rlw->oids = NULL; free_recovery_list_work(rlw); if (run_next_rw())
return; if (!rinfo->count) { finish_recovery(rinfo); return; } recover_next_object(rinfo); return; } /* Fetch the object list from all the nodes in the cluster */ static uint64_t *fetch_object_list(struct sd_node *e, uint32_t epoch, size_t *nr_oids) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; size_t buf_size = list_buffer_size; uint64_t *buf = xmalloc(buf_size); int ret; sd_debug("%s", addr_to_str(e->nid.addr, e->nid.port)); retry: sd_init_req(&hdr, SD_OP_GET_OBJ_LIST); hdr.data_length = buf_size; hdr.epoch = epoch; ret = sheep_exec_req(&e->nid, &hdr, buf); switch (ret) { case SD_RES_SUCCESS: break; case SD_RES_BUFFER_SMALL: buf_size *= 2; buf = xrealloc(buf, buf_size); goto retry; default: sd_alert("cannot get object list from %s", addr_to_str(e->nid.addr, e->nid.port)); sd_alert("some objects may not be recovered at epoch %d", epoch); free(buf); return NULL; } *nr_oids = rsp->data_length / sizeof(uint64_t); sd_debug("%zu", *nr_oids); return buf; } /* Screen out objects that don't belong to this node */ static void screen_object_list(struct recovery_list_work *rlw, uint64_t *oids, size_t nr_oids) { struct recovery_work *rw = &rlw->base; const struct sd_vnode *vnodes[SD_MAX_COPIES]; uint64_t old_count = rlw->count; uint64_t nr_objs; uint64_t i, j; for (i = 0; i < nr_oids; i++) { if (xbsearch(&oids[i], rlw->oids, old_count, obj_cmp)) /* the object is already scheduled to be recovered */ continue; nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones); oid_to_vnodes(oids[i], &rw->cur_vinfo->vroot, nr_objs, vnodes); for (j = 0; j < nr_objs; j++) { if (!vnode_is_local(vnodes[j])) continue; rlw->oids[rlw->count++] = oids[i]; /* enlarge the list buffer if full */ if (rlw->count == list_buffer_size / sizeof(uint64_t)) { list_buffer_size *= 2; rlw->oids = xrealloc(rlw->oids, list_buffer_size); } break; } } xqsort(rlw->oids, rlw->count, obj_cmp); } /* Prepare the object list that belongs to this node */ static void prepare_object_list(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); struct recovery_list_work *rlw = container_of(rw, struct recovery_list_work, base); int nr_nodes = rw->cur_vinfo->nr_nodes; int start = random() % nr_nodes, i, end = nr_nodes; uint64_t *oids; struct sd_node *nodes; if (node_is_gateway_only()) return; sd_debug("%u", rw->epoch); wait_get_vdis_done(); nodes = xmalloc(sizeof(struct sd_node) * nr_nodes); nodes_to_buffer(&rw->cur_vinfo->nroot, nodes); again: /* We need to start at a random node for better load balancing */ for (i = start; i < end; i++) { size_t nr_oids; struct sd_node *node = nodes + i; if (uatomic_read(&next_rinfo)) { sd_debug("go to the next recovery"); goto out; } oids = fetch_object_list(node, rw->epoch, &nr_oids); if (!oids) continue; screen_object_list(rlw, oids, nr_oids); free(oids); } if (start != 0) { end = start; start = 0; goto again; } sd_debug("%"PRIu64, rlw->count); out: free(nodes); } int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo, bool epoch_lifted) { struct recovery_info *rinfo; rinfo = xzalloc(sizeof(struct recovery_info)); rinfo->state = RW_PREPARE_LIST; rinfo->epoch = sys->cinfo.epoch; rinfo->tgt_epoch = epoch_lifted ?
sys->cinfo.epoch - 1 : sys->cinfo.epoch; rinfo->count = 0; if (epoch_lifted) rinfo->notify_complete = true; /* Reweight or node recovery */ else rinfo->notify_complete = false; /* MD recovery */ rinfo->cur_vinfo = grab_vnode_info(cur_vinfo); rinfo->old_vinfo = grab_vnode_info(old_vinfo); if (!node_is_gateway_only()) sd_store->update_epoch(rinfo->tgt_epoch); if (main_thread_get(current_rinfo) != NULL) { /* skip the previous epoch recovery */ struct recovery_info *nrinfo; nrinfo = uatomic_xchg_ptr(&next_rinfo, rinfo); if (nrinfo) free_recovery_info(nrinfo); sd_debug("recovery skipped"); /* * This is necessary to invoke run_next_rw when * recovery work is suspended. */ resume_suspended_recovery(); } else { main_thread_set(current_rinfo, rinfo); queue_recovery_work(rinfo); } wakeup_requests_on_epoch(); return 0; } static void queue_recovery_work(struct recovery_info *rinfo) { struct recovery_work *rw; struct recovery_list_work *rlw; struct recovery_obj_work *row; switch (rinfo->state) { case RW_PREPARE_LIST: rlw = xzalloc(sizeof(*rlw)); rlw->oids = xmalloc(list_buffer_size); rw = &rlw->base; rw->work.fn = prepare_object_list; rw->work.done = finish_object_list; break; case RW_RECOVER_OBJ: row = xzalloc(sizeof(*row)); row->oid = rinfo->oids[rinfo->done]; rw = &row->base; rw->work.fn = recover_object_work; rw->work.done = recover_object_main; break; case RW_NOTIFY_COMPLETION: rw = xzalloc(sizeof(*rw)); rw->work.fn = notify_recovery_completion_work; rw->work.done = notify_recovery_completion_main; break; default: panic("unknown recovery state %d", rinfo->state); break; } rw->epoch = rinfo->epoch; rw->tgt_epoch = rinfo->tgt_epoch; rw->cur_vinfo = grab_vnode_info(rinfo->cur_vinfo); rw->old_vinfo = grab_vnode_info(rinfo->old_vinfo); queue_work(sys->recovery_wqueue, &rw->work); } void get_recovery_state(struct recovery_state *state) { struct recovery_info *rinfo = main_thread_get(current_rinfo); memset(state, 0, sizeof(*state)); if (!rinfo) { state->in_recovery = 0; return; } state->in_recovery = 1; state->state = rinfo->state; state->nr_finished = rinfo->done; state->nr_total = rinfo->count; } sheepdog-0.8.3/sheep/request.c000066400000000000000000000643121237656255000163010ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ #include #include "sheep_priv.h" static void requeue_request(struct request *req); static void del_requeue_request(struct request *req) { list_del(&req->request_list); requeue_request(req); } static bool is_access_local(struct request *req, uint64_t oid) { const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; int nr_copies; int i; nr_copies = get_req_copy_number(req); oid_to_vnodes(oid, &req->vinfo->vroot, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { if (vnode_is_local(obj_vnodes[i])) return true; } return false; } static void io_op_done(struct work *work) { struct request *req = container_of(work, struct request, work); switch (req->rp.result) { case SD_RES_EIO: req->rp.result = SD_RES_NETWORK_ERROR; sd_err("leaving sheepdog cluster"); leave_cluster(); break; case SD_RES_SUCCESS: case SD_RES_NETWORK_ERROR: break; default: sd_debug("unhandled error %s", sd_strerror(req->rp.result)); break; } put_request(req); return; } /* * There are 4 cases in which a request needs to sleep on a wait queue for * requeueing: * * 1. The epoch of the request sender is older than the system epoch of the * receiver. In this case, we respond to the sender with SD_RES_OLD_NODE_VER, * so the sender puts the request into its own wait queue, waits for its * system epoch to be lifted, and resends the request. * * 2. The epoch of the request sender is newer than the system epoch of the * receiver. In this case, we put the request into the wait queue of the * receiver, wait for the receiver's system epoch to be lifted, then retry * this request on its own. * * 3. The requested object doesn't exist and recovery work is at RW_INIT state. * In this case, we check whether the requested object exists; if so, we * process the request directly, if not we put the request into the wait * queue of the receiver to wait for the recovery of this oid to finish. * * 4. The requested object doesn't exist and is being recovered. In this case, * we put the request into the wait queue of the receiver, and when we * recover an object we try to wake up the request on this oid. */ static inline void sleep_on_wait_queue(struct request *req) { list_add_tail(&req->request_list, &sys->req_wait_queue); } static void gateway_op_done(struct work *work) { struct request *req = container_of(work, struct request, work); struct sd_req *hdr = &req->rq; switch (req->rp.result) { case SD_RES_OLD_NODE_VER: if (req->rp.epoch > sys->cinfo.epoch) { /* * Gateway of this node is expected to process this * request later when the epoch is lifted.
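* (rp.epoch was filled with the peer's system epoch by check_request_epoch() on the remote side, so this comparison asks whether the peer is ahead of the local node.)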
*/ sleep_on_wait_queue(req); return; } /*FALLTHRU*/ case SD_RES_NEW_NODE_VER: case SD_RES_NETWORK_ERROR: case SD_RES_WAIT_FOR_JOIN: case SD_RES_WAIT_FOR_FORMAT: case SD_RES_KILLED: sd_debug("retrying failed I/O request op %s result %x epoch %" PRIu32 ", sys epoch %" PRIu32, op_name(req->op), req->rp.result, req->rq.epoch, sys->cinfo.epoch); goto retry; case SD_RES_EIO: if (is_access_local(req, hdr->obj.oid)) { sd_err("leaving sheepdog cluster"); leave_cluster(); goto retry; } break; case SD_RES_SUCCESS: break; default: sd_debug("unhandled error %s", sd_strerror(req->rp.result)); break; } put_request(req); return; retry: requeue_request(req); } static void local_op_done(struct work *work) { struct request *req = container_of(work, struct request, work); if (has_process_main(req->op)) { req->rp.result = do_process_main(req->op, &req->rq, &req->rp, req->data); } put_request(req); } static int check_request_epoch(struct request *req) { if (before(req->rq.epoch, sys->cinfo.epoch)) { sd_err("old node version %u, %u (%s)", sys->cinfo.epoch, req->rq.epoch, op_name(req->op)); /* Ask the requester to sleep the req on its own wait queue */ req->rp.result = SD_RES_OLD_NODE_VER; req->rp.epoch = sys->cinfo.epoch; put_request(req); return -1; } else if (after(req->rq.epoch, sys->cinfo.epoch)) { sd_err("new node version %u, %u (%s)", sys->cinfo.epoch, req->rq.epoch, op_name(req->op)); /* Wait for the local epoch to be lifted */ req->rp.result = SD_RES_NEW_NODE_VER; sleep_on_wait_queue(req); return -1; } return 0; } static bool request_in_recovery(struct request *req) { /* * For CREATE requests, we simply service them. CREATE operations are * atomic, so the recovery process cannot overwrite the created * objects with older data. */ if (req->rq.opcode == SD_OP_CREATE_AND_WRITE_PEER || req->rq.opcode == SD_OP_CREATE_AND_WRITE_OBJ) return false; if (req->rq.flags & SD_FLAG_CMD_RECOVERY) /* * Recovery requests must not be linked to the wait queue, to * avoid a deadlock. Here is an example scenario. * 1. Node A sends a recovery request to node B. * 2. Node B links the request to the wait queue. * 3. Node B sends a recovery request to node A to recover the * object. * 4. Node A links the request to the wait queue, and the * object cannot be recovered on either A or B (deadlock). */ return false; if (oid_in_recovery(req->local_oid)) { sd_debug("%"PRIx64" wait on oid", req->local_oid); sleep_on_wait_queue(req); return true; } return false; } /* Wake up requests because of an epoch mismatch */ void wakeup_requests_on_epoch(void) { struct request *req; LIST_HEAD(pending_list); list_splice_init(&sys->req_wait_queue, &pending_list); list_for_each_entry(req, &pending_list, request_list) { switch (req->rp.result) { case SD_RES_OLD_NODE_VER: /* * The gateway retries sending the request when * its epoch changes. */ assert(is_gateway_op(req->op)); sd_debug("gateway %"PRIx64, req->rq.obj.oid); req->rq.epoch = sys->cinfo.epoch; del_requeue_request(req); break; case SD_RES_NEW_NODE_VER: /* * The peer retries the request locally when its epoch * changes.
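* This is the counterpart of case 2 in the big comment above sleep_on_wait_queue(): the receiver parked the request until its own epoch caught up with the sender's.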
*/ assert(!is_gateway_op(req->op)); sd_debug("peer %"PRIx64, req->rq.obj.oid); del_requeue_request(req); break; default: break; } } list_splice_init(&pending_list, &sys->req_wait_queue); } /* Wake up the requests on the oid that was previously being recovered */ void wakeup_requests_on_oid(uint64_t oid) { struct request *req; LIST_HEAD(pending_list); list_splice_init(&sys->req_wait_queue, &pending_list); list_for_each_entry(req, &pending_list, request_list) { if (req->local_oid != oid) continue; sd_debug("retry %" PRIx64, req->local_oid); del_requeue_request(req); } list_splice_init(&pending_list, &sys->req_wait_queue); } void wakeup_all_requests(void) { struct request *req; LIST_HEAD(pending_list); list_splice_init(&sys->req_wait_queue, &pending_list); list_for_each_entry(req, &pending_list, request_list) { sd_debug("%"PRIx64, req->rq.obj.oid); del_requeue_request(req); } } static void queue_peer_request(struct request *req) { req->local_oid = req->rq.obj.oid; if (req->local_oid) { if (check_request_epoch(req) < 0) return; if (request_in_recovery(req)) return; } if (req->rq.flags & SD_FLAG_CMD_RECOVERY) req->rq.epoch = req->rq.obj.tgt_epoch; req->work.fn = do_process_work; req->work.done = io_op_done; queue_work(sys->io_wqueue, &req->work); } /* * We make sure we write the exact number of copies to honor the promise of * redundancy for strict mode. This means that after the targeted data are * written, they are redundant as promised and can withstand random node * failures. * * For example, with a 4:2 policy, we need to write to at least 6 nodes with * data strips and parity strips. For non-strict mode, we allow a write to * succeed as long as the data are fully written, even with only 4 nodes * alive. */ static bool has_enough_zones(struct request *req) { uint64_t oid = req->rq.obj.oid; return req->vinfo->nr_zones >= get_vdi_copy_number(oid_to_vid(oid)); } static void queue_gateway_request(struct request *req) { struct sd_req *hdr = &req->rq; if (is_access_local(req, hdr->obj.oid)) req->local_oid = hdr->obj.oid; /* * If we go for a cache object, we don't care if it is being recovered. * Even if it doesn't exist in the cache, we'll rely on the cache layer * to pull it. * * This is not true for a local request, because it might go to the * backend store (e.g. pushing a cache object); in this case we should * check if the request is in recovery.
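* (A "local request" here is one with req->local set, i.e. one submitted through exec_local_req() instead of arriving from a client socket.)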
*/ if (sys->enable_object_cache && !req->local) goto queue_work; if (req->local_oid) if (request_in_recovery(req)) return; queue_work: if (RB_EMPTY_ROOT(&req->vinfo->vroot)) { sd_err("there are no living nodes"); goto end_request; } if (sys->cinfo.flags & SD_CLUSTER_FLAG_STRICT && (hdr->opcode == SD_OP_CREATE_AND_WRITE_OBJ || hdr->opcode == SD_OP_WRITE_OBJ) && !has_enough_zones(req)) { sd_err("not enough zones available"); goto end_request; } req->work.fn = do_process_work; req->work.done = gateway_op_done; queue_work(sys->gateway_wqueue, &req->work); return; end_request: req->rp.result = SD_RES_HALT; put_request(req); return; } static void queue_local_request(struct request *req) { req->work.fn = do_process_work; req->work.done = local_op_done; queue_work(sys->io_wqueue, &req->work); } static main_fn inline void stat_request_begin(struct request *req) { struct sd_req *hdr = &req->rq; req->stat = true; if (is_peer_op(req->op)) { sys->stat.r.peer_total_nr++; sys->stat.r.peer_active_nr++; if (hdr->flags & SD_FLAG_CMD_WRITE) sys->stat.r.peer_total_rx += hdr->data_length; else sys->stat.r.peer_total_tx += hdr->data_length; switch (hdr->opcode) { case SD_OP_READ_PEER: sys->stat.r.peer_total_read_nr++; break; case SD_OP_WRITE_PEER: case SD_OP_CREATE_AND_WRITE_PEER: sys->stat.r.peer_total_write_nr++; break; case SD_OP_REMOVE_PEER: sys->stat.r.peer_total_remove_nr++; break; } } else if (is_gateway_op(req->op)) { sys->stat.r.gway_total_nr++; sys->stat.r.gway_active_nr++; if (hdr->flags & SD_FLAG_CMD_WRITE) sys->stat.r.gway_total_rx += hdr->data_length; else sys->stat.r.gway_total_tx += hdr->data_length; switch (hdr->opcode) { case SD_OP_READ_OBJ: sys->stat.r.gway_total_read_nr++; break; case SD_OP_WRITE_OBJ: case SD_OP_CREATE_AND_WRITE_OBJ: sys->stat.r.gway_total_write_nr++; break; case SD_OP_DISCARD_OBJ: sys->stat.r.gway_total_remove_nr++; break; } } else if (hdr->opcode == SD_OP_FLUSH_VDI) { sys->stat.r.gway_total_nr++; sys->stat.r.gway_active_nr++; sys->stat.r.gway_total_flush_nr++; } } static main_fn inline void stat_request_end(struct request *req) { struct sd_req *hdr = &req->rq; if (!req->stat) return; if (is_peer_op(req->op)) sys->stat.r.peer_active_nr--; else if (is_gateway_op(req->op)) sys->stat.r.gway_active_nr--; else if (hdr->opcode == SD_OP_FLUSH_VDI) sys->stat.r.gway_active_nr--; } static void queue_request(struct request *req) { struct sd_req *hdr = &req->rq; struct sd_rsp *rsp = &req->rp; /* * Check the protocol version for all internal commands, and for public * commands that have it set. We can't enforce it on all public * ones, as it isn't a mandatory part of the public protocol.
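* Internal commands are those with opcode >= 0x80; they must match SD_SHEEP_PROTO_VER exactly, while public commands are only rejected when they set a proto_ver greater than SD_PROTO_VER.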
*/ if (hdr->opcode >= 0x80) { if (hdr->proto_ver != SD_SHEEP_PROTO_VER) { rsp->result = SD_RES_VER_MISMATCH; goto done; } } else if (hdr->proto_ver) { if (hdr->proto_ver > SD_PROTO_VER) { rsp->result = SD_RES_VER_MISMATCH; goto done; } } req->op = get_sd_op(hdr->opcode); if (!req->op) { sd_err("invalid opcode %d", hdr->opcode); rsp->result = SD_RES_INVALID_PARMS; goto done; } sd_debug("%s, %d", op_name(req->op), sys->cinfo.status); switch (sys->cinfo.status) { case SD_STATUS_KILLED: rsp->result = SD_RES_KILLED; goto done; case SD_STATUS_SHUTDOWN: rsp->result = SD_RES_SHUTDOWN; goto done; case SD_STATUS_WAIT: if (!is_force_op(req->op)) { if (sys->cinfo.ctime == 0) rsp->result = SD_RES_WAIT_FOR_FORMAT; else rsp->result = SD_RES_WAIT_FOR_JOIN; goto done; } break; default: break; } req->vinfo = get_vnode_info(); stat_request_begin(req); if (is_peer_op(req->op)) { queue_peer_request(req); } else if (is_gateway_op(req->op)) { hdr->epoch = sys->cinfo.epoch; queue_gateway_request(req); } else if (is_local_op(req->op)) { hdr->epoch = sys->cinfo.epoch; queue_local_request(req); } else if (is_cluster_op(req->op)) { hdr->epoch = sys->cinfo.epoch; queue_cluster_request(req); } else { sd_err("unknown operation %d", hdr->opcode); rsp->result = SD_RES_SYSTEM_ERROR; goto done; } return; done: put_request(req); } static void requeue_request(struct request *req) { if (req->vinfo) { put_vnode_info(req->vinfo); req->vinfo = NULL; } stat_request_end(req); queue_request(req); } static void clear_client_info(struct client_info *ci); static struct request *alloc_local_request(void *data, int data_length) { struct request *req; req = xzalloc(sizeof(struct request)); if (data_length) { req->data_length = data_length; req->data = data; } req->local = true; refcount_set(&req->refcnt, 1); return req; } static void free_local_request(struct request *req) { put_vnode_info(req->vinfo); free(req); } static void submit_local_request(struct request *req) { sd_mutex_lock(&sys->local_req_lock); list_add_tail(&req->request_list, &sys->local_req_queue); sd_mutex_unlock(&sys->local_req_lock); eventfd_xwrite(sys->local_req_efd, 1); } /* * Execute the request locally and synchronously. * * This function takes advantage of the gateway's retry mechanism and can only * be called from a worker thread.
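* A typical caller (a minimal sketch, mirroring notify_recovery_completion_work() in recovery.c) looks roughly like: * * struct sd_req hdr; * * sd_init_req(&hdr, SD_OP_COMPLETE_RECOVERY); * hdr.flags = SD_FLAG_CMD_WRITE; * hdr.data_length = sizeof(sys->this_node); * ret = exec_local_req(&hdr, &sys->this_node);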
*/ worker_fn int exec_local_req(struct sd_req *rq, void *data) { struct request *req; int ret; req = alloc_local_request(data, rq->data_length); req->rq = *rq; req->local_req_efd = eventfd(0, 0); if (req->local_req_efd < 0) { sd_err("eventfd failed, %m"); /* Fake the result to ask for retry */ req->rp.result = SD_RES_NETWORK_ERROR; goto out; } submit_local_request(req); eventfd_xread(req->local_req_efd); out: /* fill rq with response header as exec_req does */ memcpy(rq, &req->rp, sizeof(req->rp)); close(req->local_req_efd); ret = req->rp.result; free_local_request(req); return ret; } worker_fn struct request_iocb *local_req_init(void) { struct request_iocb *iocb = xzalloc(sizeof(*iocb)); iocb->efd = eventfd(0, EFD_SEMAPHORE); if (iocb->efd < 0) { sd_err("eventfd failed, %m"); free(iocb); return NULL; } iocb->result = SD_RES_SUCCESS; return iocb; } worker_fn int local_req_wait(struct request_iocb *iocb) { int ret; for (uint32_t i = 0; i < iocb->count; i++) eventfd_xread(iocb->efd); ret = iocb->result; close(iocb->efd); free(iocb); return ret; } struct areq_work { struct sd_req rq; void *data; struct request_iocb *iocb; int result; struct work work; }; static void local_req_async_work(struct work *work) { struct areq_work *areq = container_of(work, struct areq_work, work); areq->result = exec_local_req(&areq->rq, areq->data); } static void local_req_async_main(struct work *work) { struct areq_work *areq = container_of(work, struct areq_work, work); if (unlikely(areq->result != SD_RES_SUCCESS)) areq->iocb->result = areq->result; eventfd_xwrite(areq->iocb->efd, 1); free(areq); } worker_fn int exec_local_req_async(struct sd_req *rq, void *data, struct request_iocb *iocb) { struct areq_work *areq; areq = xzalloc(sizeof(*areq)); areq->rq = *rq; areq->data = data; areq->iocb = iocb; areq->work.fn = local_req_async_work; areq->work.done = local_req_async_main; queue_work(sys->areq_wqueue, &areq->work); iocb->count++; return SD_RES_SUCCESS; } static struct request *alloc_request(struct client_info *ci, int data_length) { struct request *req; req = zalloc(sizeof(struct request)); if (!req) return NULL; req->ci = ci; refcount_inc(&ci->refcnt); if (data_length) { req->data_length = data_length; req->data = valloc(data_length); if (!req->data) { free(req); return NULL; } } refcount_set(&req->refcnt, 1); uatomic_inc(&sys->nr_outstanding_reqs); return req; } static void free_request(struct request *req) { uatomic_dec(&sys->nr_outstanding_reqs); refcount_dec(&req->ci->refcnt); put_vnode_info(req->vinfo); free(req->data); free(req); } main_fn void put_request(struct request *req) { struct client_info *ci = req->ci; if (refcount_dec(&req->refcnt) > 0) return; stat_request_end(req); if (req->local) eventfd_xwrite(req->local_req_efd, 1); else { if (ci->conn.dead) { /* * free_request should be called prior to * clear_client_info because refcnt of ci will * be decreased in free_request. Otherwise, ci * cannot be freed in clear_client_info. */ free_request(req); clear_client_info(ci); } else { list_add_tail(&req->request_list, &ci->done_reqs); if (ci->tx_req == NULL) /* There is no request being sent. 
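* Turn the TX event back on so that the event loop will schedule tx_work() for this request.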
*/ if (conn_tx_on(&ci->conn)) { sd_err("switch on sending flag failure, " "connection may be closed"); /* * We should not call free_request(req) here, * because it is already in the done list; * clear_client_info will free it */ clear_client_info(ci); } } } } static void rx_work(struct work *work) { struct client_info *ci = container_of(work, struct client_info, rx_work); int ret; struct connection *conn = &ci->conn; struct sd_req hdr; struct request *req; ret = do_read(conn->fd, &hdr, sizeof(hdr), NULL, 0, UINT32_MAX); if (ret) { sd_debug("failed to read a header"); conn->dead = true; return; } req = alloc_request(ci, hdr.data_length); if (!req) { sd_err("failed to allocate request"); conn->dead = true; return; } ci->rx_req = req; /* use le_to_cpu */ memcpy(&req->rq, &hdr, sizeof(req->rq)); if (hdr.data_length && hdr.flags & SD_FLAG_CMD_WRITE) { ret = do_read(conn->fd, req->data, hdr.data_length, NULL, 0, UINT32_MAX); if (ret) { sd_err("failed to read data"); conn->dead = true; } } } static void rx_main(struct work *work) { struct client_info *ci = container_of(work, struct client_info, rx_work); struct request *req = ci->rx_req; ci->rx_req = NULL; refcount_dec(&ci->refcnt); if (ci->conn.dead) { if (req) free_request(req); clear_client_info(ci); return; } if (conn_rx_on(&ci->conn)) sd_err("switch on receiving flag failure, " "connection may be closed"); if (is_logging_op(get_sd_op(req->rq.opcode))) { sd_info("req=%p, fd=%d, client=%s:%d, op=%s, data=%s", req, ci->conn.fd, ci->conn.ipstr, ci->conn.port, op_name(get_sd_op(req->rq.opcode)), data_to_str(req->data, req->rp.data_length)); } else { sd_debug("%d, %s:%d", ci->conn.fd, ci->conn.ipstr, ci->conn.port); } queue_request(req); } static void tx_work(struct work *work) { struct client_info *ci = container_of(work, struct client_info, tx_work); int ret; struct connection *conn = &ci->conn; struct sd_rsp rsp; struct request *req = ci->tx_req; void *data = NULL; /* use cpu_to_le */ memcpy(&rsp, &req->rp, sizeof(rsp)); rsp.epoch = sys->cinfo.epoch; rsp.opcode = req->rq.opcode; rsp.id = req->rq.id; if (rsp.data_length) data = req->data; ret = send_req(conn->fd, (struct sd_req *)&rsp, data, rsp.data_length, NULL, 0, UINT32_MAX); if (ret != 0) { sd_err("failed to send a request"); conn->dead = true; } } static void tx_main(struct work *work) { struct client_info *ci = container_of(work, struct client_info, tx_work); refcount_dec(&ci->refcnt); if (is_logging_op(ci->tx_req->op)) { sd_info("req=%p, fd=%d, client=%s:%d, op=%s, result=%02X", ci->tx_req, ci->conn.fd, ci->conn.ipstr, ci->conn.port, op_name(ci->tx_req->op), ci->tx_req->rp.result); } else { sd_debug("%d, %s:%d", ci->conn.fd, ci->conn.ipstr, ci->conn.port); } free_request(ci->tx_req); ci->tx_req = NULL; if (ci->conn.dead) { clear_client_info(ci); return; } if (!list_empty(&ci->done_reqs)) if (conn_tx_on(&ci->conn)) sd_err("switch on sending flag failure, " "connection may be closed"); } static void destroy_client(struct client_info *ci) { sd_debug("connection from: %s:%d", ci->conn.ipstr, ci->conn.port); close(ci->conn.fd); free(ci); } static void clear_client_info(struct client_info *ci) { struct request *req; sd_debug("connection seems to be dead"); list_for_each_entry(req, &ci->done_reqs, request_list) { list_del(&req->request_list); free_request(req); } unregister_event(ci->conn.fd); sd_debug("refcnt:%d, fd:%d, %s:%d", refcount_read(&ci->refcnt), ci->conn.fd, ci->conn.ipstr, ci->conn.port); if (refcount_read(&ci->refcnt)) return; destroy_client(ci); } static struct client_info *create_client(int fd,
struct cluster_info *cluster) { struct client_info *ci; struct sockaddr_storage from; socklen_t namesize = sizeof(from); ci = zalloc(sizeof(*ci)); if (!ci) return NULL; if (getpeername(fd, (struct sockaddr *)&from, &namesize)) { free(ci); return NULL; } switch (from.ss_family) { case AF_INET: ci->conn.port = ntohs(((struct sockaddr_in *)&from)->sin_port); inet_ntop(AF_INET, &((struct sockaddr_in *)&from)->sin_addr, ci->conn.ipstr, sizeof(ci->conn.ipstr)); break; case AF_INET6: ci->conn.port = ntohs(((struct sockaddr_in6 *)&from)->sin6_port); inet_ntop(AF_INET6, &((struct sockaddr_in6 *)&from)->sin6_addr, ci->conn.ipstr, sizeof(ci->conn.ipstr)); break; } ci->conn.fd = fd; ci->conn.events = EPOLLIN; refcount_set(&ci->refcnt, 0); INIT_LIST_HEAD(&ci->done_reqs); return ci; } static void client_handler(int fd, int events, void *data) { struct client_info *ci = (struct client_info *)data; sd_debug("%x, %d", events, ci->conn.dead); if (events & (EPOLLERR | EPOLLHUP)) ci->conn.dead = true; /* * Although dead is true, ci might not be freed immediately * because of refcnt. Never mind, we will clean it up later * as long as dead is true. */ if (ci->conn.dead) return clear_client_info(ci); if (events & EPOLLIN) { if (conn_rx_off(&ci->conn) != 0) { sd_err("switch off receiving flag failure, " "connection may be closed"); return; } /* * Increment refcnt so that the client_info isn't freed while * rx_work uses it. */ refcount_inc(&ci->refcnt); ci->rx_work.fn = rx_work; ci->rx_work.done = rx_main; queue_work(sys->net_wqueue, &ci->rx_work); } if (events & EPOLLOUT) { if (conn_tx_off(&ci->conn) != 0) { sd_err("switch off sending flag failure, " "connection may be closed"); return; } assert(ci->tx_req == NULL); ci->tx_req = list_first_entry(&ci->done_reqs, struct request, request_list); list_del(&ci->tx_req->request_list); /* * Increment refcnt so that the client_info isn't freed while * tx_work uses it.
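* The matching refcount_dec() happens in tx_main(), mirroring the rx_work()/rx_main() pairing above.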
*/ refcount_inc(&ci->refcnt); ci->tx_work.fn = tx_work; ci->tx_work.done = tx_main; queue_work(sys->net_wqueue, &ci->tx_work); } } static void listen_handler(int listen_fd, int events, void *data) { struct sockaddr_storage from; socklen_t namesize; int fd, ret; struct client_info *ci; bool is_inet_socket = *(bool *)data; if (sys->cinfo.status == SD_STATUS_SHUTDOWN) { sd_debug("unregistering connection %d", listen_fd); unregister_event(listen_fd); return; } namesize = sizeof(from); fd = accept(listen_fd, (struct sockaddr *)&from, &namesize); if (fd < 0) { sd_err("failed to accept a new connection: %m"); return; } if (is_inet_socket) { ret = set_nodelay(fd); if (ret) { close(fd); return; } } ci = create_client(fd, data); if (!ci) { close(fd); return; } ret = register_event(fd, client_handler, ci); if (ret) { destroy_client(ci); return; } sd_debug("accepted a new connection: %d", fd); } static LIST_HEAD(listening_fd_list); struct listening_fd { int fd; struct list_node list; }; static int create_listen_port_fn(int fd, void *data) { struct listening_fd *new_fd; new_fd = xzalloc(sizeof(*new_fd)); new_fd->fd = fd; list_add_tail(&new_fd->list, &listening_fd_list); return register_event(fd, listen_handler, data); } void unregister_listening_fds(void) { struct listening_fd *fd; list_for_each_entry(fd, &listening_fd_list, list) { sd_debug("unregistering fd: %d", fd->fd); unregister_event(fd->fd); } } int create_listen_port(const char *bindaddr, int port) { static bool is_inet_socket = true; return create_listen_ports(bindaddr, port, create_listen_port_fn, &is_inet_socket); } int init_unix_domain_socket(const char *dir) { static bool is_inet_socket; char unix_path[PATH_MAX]; snprintf(unix_path, sizeof(unix_path), "%s/sock", dir); unlink(unix_path); return create_unix_domain_socket(unix_path, create_listen_port_fn, &is_inet_socket); } static void local_req_handler(int listen_fd, int events, void *data) { struct request *req; LIST_HEAD(pending_list); if (events & EPOLLERR) sd_err("request handler error"); eventfd_xread(listen_fd); sd_mutex_lock(&sys->local_req_lock); list_splice_init(&sys->local_req_queue, &pending_list); sd_mutex_unlock(&sys->local_req_lock); list_for_each_entry(req, &pending_list, request_list) { list_del(&req->request_list); queue_request(req); } } void local_request_init(void) { sd_init_mutex(&sys->local_req_lock); sys->local_req_efd = eventfd(0, EFD_NONBLOCK); if (sys->local_req_efd < 0) panic("failed to init local req efd"); register_event(sys->local_req_efd, local_req_handler, NULL); } worker_fn int sheep_exec_req(const struct node_id *nid, struct sd_req *hdr, void *buf) { struct sd_rsp *rsp = (struct sd_rsp *)hdr; struct sockfd *sfd; int ret; sfd = sockfd_cache_get(nid); if (!sfd) return SD_RES_NETWORK_ERROR; ret = exec_req(sfd->fd, hdr, buf, sheep_need_retry, hdr->epoch, MAX_RETRY_COUNT); if (ret) { sd_debug("remote node might have gone away"); sockfd_cache_del(nid, sfd); return SD_RES_NETWORK_ERROR; } ret = rsp->result; if (ret != SD_RES_SUCCESS) sd_err("failed %s, remote address: %s, op name: %s", sd_strerror(ret), addr_to_str(nid->addr, nid->port), op_name(get_sd_op(hdr->opcode))); sockfd_cache_put(nid, sfd); return ret; } bool sheep_need_retry(uint32_t epoch) { return sys_epoch() == epoch; } sheepdog-0.8.3/sheep/sheep.c000066400000000000000000000555361237656255000157250ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include #include #include "sheep_priv.h" #include "trace/trace.h" #include "option.h" #define EPOLL_SIZE 4096 #define DEFAULT_OBJECT_DIR "/tmp" #define LOG_FILE_NAME "sheep.log" LIST_HEAD(cluster_drivers); static const char program_name[] = "sheep"; static const char bind_help[] = "Example:\n\t$ sheep -b 192.168.1.1 ...\n" "This tries to make sheep listen on the NIC of 192.168.1.1.\n" "\nExample:\n\t$ sheep -b 0.0.0.0 ...\n" "This tries to make sheep listen on all the available NICs. It can be useful\n" "when you want sheep to respond to dog without a specified address and port.\n"; static const char ioaddr_help[] = "Example:\n\t$ sheep -i host=192.168.1.1,port=7002 ...\n" "This tries to add a dedicated IO NIC of 192.168.1.1:7002 to transfer data.\n" "If the IO NIC is down, sheep will fall back to the non-IO NIC to transfer data.\n"; static const char journal_help[] = "Available arguments:\n" "\tsize=: size of the journal in megabytes\n" "\tdir=: path to the location of the journal (default: $STORE)\n" "\tskip: if specified, skip the recovery at startup\n" "\nExample:\n\t$ sheep -j dir=/journal,size=1G\n" "This tries to use /journal as the journal storage of size 1G\n"; static const char http_help[] = "Available arguments:\n" "\thost=: specify a host to communicate with the http server (default: localhost)\n" "\tport=: specify a port to communicate with the http server (default: 8000)\n" "\tswift: enable the swift API\n" "Example:\n\t$ sheep -r host=localhost,port=7001,swift ...\n" "This tries to enable the Swift API and use localhost:7001 to\n" "communicate with the http server.\n"; static const char myaddr_help[] = "Example:\n\t$ sheep -y 192.168.1.1:7000 ...\n" "This tries to tell other nodes at which address they can talk to this\n" "sheep.\n"; static const char zone_help[] = "Example:\n\t$ sheep -z 1 ...\n" "This tries to set the zone ID of this sheep to 1, and sheepdog won't store\n" "more than one copy of any object into the same zone\n"; static const char cluster_help[] = "Available arguments:\n" "\tlocal: use local driver\n" "\tcorosync: use corosync driver\n" "\tzookeeper: use zookeeper driver, needs extra arguments\n" "\n\tzookeeper arguments: address-list,timeout=value (default: 3000)\n" "\nExample:\n\t" "$ sheep -c zookeeper:IP1:PORT1,IP2:PORT2,IP3:PORT3,timeout=1000 ...\n" "This tries to use a 3-node zookeeper cluster, which can be reached by\n" "IP1:PORT1, IP2:PORT2, IP3:PORT3, to manage membership and broadcast messages,\n" "and sets the timeout of the node heartbeat to 1000 milliseconds\n"; static const char cache_help[] = "Available arguments:\n" "\tsize=: size of the cache in megabytes\n" "\tdir=: path to the location of the cache (default: $STORE/cache)\n" "\tdirectio: use directio mode for cache IO, " "if not specified use buffered IO\n" "\nExample:\n\t$ sheep -w size=200G,dir=/my_ssd,directio ...\n" "This tries to use /my_ssd as the cache storage with 200G allocated to the\n" "cache in directio mode\n"; static const char log_help[] = "Example:\n\t$ sheep -l dir=/var/log/,level=debug,format=server ...\n" "Available arguments:\n" "\tdir=: path to the location of sheep.log\n" "\tlevel=: log level of sheep.log\n" "\tformat=: log format type\n" "\tdst=: log destination type\n\n" "if dir is not specified, use the
metastore directory\n\n" "Available log levels:\n" " Level Description\n" " emerg system has failed and is unusable\n" " alert action must be taken immediately\n" " crit critical conditions\n" " err error conditions\n" " warning warning conditions\n" " notice normal but significant conditions\n" " info informational notices\n" " debug debugging messages\n" "default log level is info\n\n" "Available log format:\n" " FormatType Description\n" " default raw format\n" " server raw format with timestamp\n" " json json format\n\n" "Available log destination:\n" " DestinationType Description\n" " default dedicated file in a directory used by sheep\n" " syslog syslog of the system\n" " stdout standard output\n"; static struct sd_option sheep_options[] = { {'b', "bindaddr", true, "specify IP address of interface to listen on", bind_help}, {'c', "cluster", true, "specify the cluster driver (default: "DEFAULT_CLUSTER_DRIVER")", cluster_help}, {'D', "directio", false, "use direct IO for backend store"}, {'g', "gateway", false, "make the program run in gateway mode"}, {'h', "help", false, "display this help and exit"}, {'i', "ioaddr", true, "use a separate network card to handle IO requests" " (default: disabled)", ioaddr_help}, {'j', "journal", true, "use a journal file to log all the write " "operations. (default: disabled)", journal_help}, {'l', "log", true, "specify the log level, the log directory and the log format " "(log level default: 6 [SDOG_INFO])", log_help}, {'n', "nosync", false, "drop O_SYNC for writes to the backend"}, {'p', "port", true, "specify the TCP port on which to listen " "(default: 7000)"}, {'P', "pidfile", true, "create a pid file"}, {'r', "http", true, "enable http service. (default: disabled)", http_help}, {'u', "upgrade", false, "upgrade to the latest data layout"}, {'v', "version", false, "show the version"}, {'w', "cache", true, "enable object cache", cache_help}, {'y', "myaddr", true, "specify the address advertised to other sheep", myaddr_help}, {'z', "zone", true, "specify the zone id (default: determined by listen address)", zone_help}, { 0, NULL, false, NULL }, }; static void usage(int status) { if (status) { const char *help = option_get_help(sheep_options, optopt); if (help) { printf("%s", help); goto out; } sd_err("Try '%s --help' for more information.", program_name); } else { struct sd_option *opt; printf("Sheepdog daemon (version %s)\n" "Usage: %s [OPTION]... [PATH] (default: /tmp)\n" "Options:\n", PACKAGE_VERSION, program_name); sd_for_each_option(opt, sheep_options) { printf(" -%c, --%-18s%s\n", opt->ch, opt->name, opt->desc); } printf("\nTry '%s