==== flashcache-3.1.3+git20150701/LICENSE ====

GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. 
The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. 
In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. 
You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. 
<signature of Ty Coon>, 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License.

==== flashcache-3.1.3+git20150701/Makefile ====

COMMIT_REV := $(shell git describe --always --abbrev=12)
KERNEL_TREE ?= /lib/modules/$(shell uname -r)/build
export COMMIT_REV

# Check for RHEL/CentOS
RHEL5_VER ?= $(shell if [ -e /etc/redhat-release ]; then grep 5.[0-9] /etc/redhat-release; else false; fi)
ifneq "$(RHEL5_VER)" ""
RHEL5_TREE := /usr/src/redhat/BUILD/kernel-2.6.18/linux-$(shell uname -r).$(shell uname -i)
KERNEL_TREE := $(RHEL5_TREE)
endif

# Check for OpenVZ (/proc/vz)
OPENVZ_VER ?= $(shell if [ -e /proc/vz ]; then grep 5.[0-9] /etc/redhat-release; else false; fi)
ifneq "$(OPENVZ_VER)" ""
RHEL5_TREE := /usr/src/redhat/BUILD/ovzkernel-2.6.18/linux-$(shell uname -r).$(shell uname -i)
KERNEL_TREE := $(RHEL5_TREE)
endif

all:
	$(MAKE) -C src KERNEL_TREE=$(KERNEL_TREE) PWD=$(shell pwd)/src all

install:
	$(MAKE) -C src KERNEL_TREE=$(KERNEL_TREE) PWD=$(shell pwd)/src install

clean:
	$(MAKE) -C src KERNEL_TREE=$(KERNEL_TREE) PWD=$(shell pwd)/src clean

==== flashcache-3.1.3+git20150701/Makefile.dkms ====

COMMIT_REV := $(shell git describe --always --abbrev=12)
DKMS_DEST ?= /var/lib/dkms/flashcache/$(COMMIT_REV)/source
KERNEL_VERSION ?= $(shell uname -r)

# statically link for inclusion in initramfs
CFLAGS += -static
export COMMIT_REV CFLAGS

UTILS := /sbin/flashcache_load /sbin/flashcache_create /sbin/flashcache_destroy

.PHONY: all
all: install

.PHONY: install
install: build utils
	dkms install -m flashcache -v $(COMMIT_REV) -k $(KERNEL_VERSION)

.PHONY: build
build: src_install
	dkms build -m flashcache -v $(COMMIT_REV) -k $(KERNEL_VERSION)

.PHONY: src_install
src_install:
	install -o root -g root -m 0755 -d $(DKMS_DEST)
	rsync -r src/ $(DKMS_DEST)/
	sed "s/PACKAGE_VERSION=/PACKAGE_VERSION=$(COMMIT_REV)/" src/dkms.conf > "$(DKMS_DEST)/dkms.conf"

.PHONY: boot_conf
boot_conf: utils
	# do mkinitramfs install if present
	if test -d /etc/initramfs-tools/hooks; then \
		install -o root -g root src/flashcache.hook /etc/initramfs-tools/hooks/flashcache ; \
		install -o root -g root src/utils/flashcache_scan /etc/initramfs-tools/scripts/init-premount/ ; \
		update-initramfs -k $(shell uname -r) -u ; \
	fi

.PHONY: utils
utils:
	$(MAKE) -C src/utils install

.PHONY: clean
clean:
	$(MAKE) -f Makefile clean
	dkms remove -m flashcache -v $(COMMIT_REV) --all
	rm /etc/initramfs-tools/hooks/flashcache
	rm /etc/initramfs-tools/scripts/init-premount/flashcache_scan

==== flashcache-3.1.3+git20150701/README ====

License : Everything in flashcache components released under GPL v2.

Update : Added the flashcache-wt directory, which contains a (separate) module for a write through flashcache. For more details, look at flashcache-wt/README.

Building Flashcache :
-------------------
Update : Vadim Tkachenko (with help from Mohan) ported Flashcache to later Linux releases. We now have built Flashcache on 2.6.18, 2.6.20 and 2.6.27-32. We have tested Flashcache on 2.6.18, 2.6.20, 2.6.27 and 2.6.32.

You'll need the entire kernel source tree in order to build flashcache. At the top directory, run:

	make KERNEL_TREE=<path to your kernel source tree>
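For instance, to build against the running kernel and install the module and utilities (a sketch; the Makefile's KERNEL_TREE default already points at the running kernel's build tree, so the variable is shown here only for illustration):

	make KERNEL_TREE=/lib/modules/$(uname -r)/build
	sudo make install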
This builds both the flashcache.ko and 3 utilities. flashcache-sa-guide.txt has details on how to create and load flashcache volumes.

Mohan Srinivasan
Paul Saab

==== flashcache-3.1.3+git20150701/README-CentOS5 ====

Vadim T (Percona) wrote an initial guide on getting flashcache building on CentOS, which was extremely helpful. Based on that, I've modified the Makefiles to make building on CentOS as easy as possible. Here are the steps you'll need to follow to build on CentOS:

- Make sure you have the CentOS EPEL repo. (http://fedoraproject.org/wiki/EPEL)
- Make sure you have the CentOS source repo. (http://mirror.centos.org/centos/$releasever/updates/SRPMS/)
- Install prerequisite build packages:
	yum install dkms gcc make yum-utils kernel-devel
- CentOS kernel-headers/devel packages don't include internal headers, so we've got to configure the full source:
	yumdownloader --source kernel-`uname -r`
	sudo rpm -ivh kernel-`uname -r`.src.rpm

Now the updated Makefiles should work correctly.

Graeme Humphries

==== flashcache-3.1.3+git20150701/README-CentOS6 ====

Vadim T (Percona) wrote an initial guide on getting flashcache building on CentOS, which was extremely helpful. Based on that, I've modified the Makefiles to make building on CentOS as easy as possible. Here are the steps you'll need to follow to build on CentOS:

- Make sure you have the CentOS EPEL repo. (http://fedoraproject.org/wiki/EPEL)
- Make sure you have the CentOS source repo. (http://mirror.centos.org/centos/$releasever/updates/SRPMS/)
- Install prerequisite build packages:
	yum install dkms gcc make yum-utils kernel-devel
- CentOS kernel-headers/devel packages don't include internal headers, so we've got to configure the full source:
	yumdownloader --source kernel-`uname -r`
	sudo rpm -ivh kernel-`uname -r`.src.rpm
- For CentOS6.x boot support:
	yum localinstall utils/dracut-flashcache-0.3-1.el6.noarch.rpm
  Follow the instructions in doc/dracut-flashcache.txt later on if you want flashcache to accelerate your root device or lvm volumes, and skip the "Boot from Flashcache" part in README-DKMS.

Now the updated Makefiles should work correctly.

Graeme Humphries

==== flashcache-3.1.3+git20150701/README-DKMS ====

DKMS Module Howto:
------------------

A number of modern distros (Ubuntu, Fedora and others) can use DKMS to help sensibly manage building and maintaining additional third party kernel modules. http://en.wikipedia.org/wiki/Dynamic_Kernel_Module_Support

I've added some simple DKMS configs for flashcache to make building and managing it a little less painful. This has been updated with a Makefile.dkms, which should hopefully do a sensible DKMS install without intervention. Right now it requires initramfs-tools for boot support, so it probably won't work on CentOS 5 just yet.

Installation
------------

Ubuntu (and probably Debian):
- Install prerequisite build packages:
	apt-get install dkms build-essential linux-headers-`uname -r`

CentOS/RHEL 5:
- Follow the steps in README-CentOS5 to prep for building on CentOS.

Run "make -f Makefile.dkms", and it should build and install correctly for your kernel.
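After that completes, you can sanity-check that DKMS registered and built the module (illustrative commands; the version string DKMS reports is the git revision the Makefile derives via 'git describe'):

	dkms status flashcache
	modinfo flashcache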
Boot from Flashcache
--------------------

If you have initramfs-tools, then flashcache utils and modules can be installed into the initramfs.

- Follow the Installation section above.
- Install flashcache boot support: make -f Makefile.dkms boot_conf
- Edit your /etc/fstab and /boot/grub/grub.cfg to change the root device to the flashcache device you intend to use. (ex: /dev/mapper/fc-root)
- Boot off a live CD like the Ubuntu desktop installer.
- Mount your root device (mount /dev/root-partition /mnt)
- cd to your flashcache source directory, and install flashcache into the boot environment. (make install)
- Unmount your root device (umount /mnt)
- Create the flashcache device backed by your root device. I recommend using the UUID path in case device renaming occurs:
	flashcache_create -p back fc-root /dev/ssd_device /dev/disk/by-uuid/root_device_uuid
- Reboot!

Known Issues:
-------------
- Right now grub-probe fails to detect the underlying root device when booting off flashcache, so you'll have to edit /boot/grub/grub.cfg manually.
- Because of the previous point, update-grub will likely fail.

Graeme Humphries

==== flashcache-3.1.3+git20150701/doc/dracut-flashcache.txt ====

dracut-flashcache version 0.3
=============================

This is a dracut module which will enable you to use flashcache on your root (/) filesystem or LVM PV.

Written by John Newbigin
Altered by Jeroen Beerstra

It has been written and tested on CentOS-6 and should work the same on RHEL6. It will probably work on Fedora but they might have changed things.

WARNING
=======

A mistake here could delete all your data. Be careful!

Preparation
===========

You don't actually need a Solid State Drive (SSD); any disk will work. These instructions will use the term SSD to represent your selected 'cache' disk. Start by physically installing your disk and make sure it is detected (in /proc/partitions).

Do a backup of your system. If you accidentally get the SSD and the HDD round the wrong way, all your data could go in a flash (no pun intended).

You will need the following installed
* flashcache from github: https://github.com/facebook/flashcache
* dracut-flashcache (this package) - This will use the flashcache-utils during boot
  http://www.chrysocome.net/download

Planning
========

1. Choose where to store your cache.
   I chose to partition my SSD. This is not necessary, but I want to use it to cache two separate LVM groups so I need two partitions. I use partition type '0xda Non-FS data' but there is no standard and it does not really matter.

2. Choose what to store in your cache.
   If you are not using LVM then you probably want to cache an entire partition (or disk? TODO: can a flashcache device have partitions?). Chances are for a simple setup it is /dev/sda2.
   If you are using LVM then you can choose to cache a Physical Volume or Logical Volume. If you only have one PV then that is a handy thing to cache, because your filesystem device names will be the same so you have less configuration to do. If you have multiple PVs then you can cache one LV instead. This will speed up all of your PVs at the same time. Your filesystem device will change, which might mean more work.
   If you have multiple PVs and multiple LVs, or just want to have multiple caches, not to worry: just partition your SSD and set them up one at a time, as sketched below.
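For example, splitting the SSD into two halves for two caches might look like this (a sketch only; it assumes the SSD is /dev/sdb and that it is empty -- this destroys any existing partition table):

	parted -s /dev/sdb mklabel msdos
	parted -s /dev/sdb mkpart primary 0% 50%
	parted -s /dev/sdb mkpart primary 50% 100%

(The partition type can then be changed to '0xda Non-FS data' with fdisk's 't' command if you wish, as noted above; the type is not actually checked by anything.)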
Setup
=====

1. Edit /etc/lvm/lvm.conf and blacklist your SSD with a command like this:
	filter = [ "r|/dev/sdb|" ]
   # This will prevent LVM from seeing the signature at the start of the device and thinking it should scan this device (this is a bad thing)
   # We are relying on the regex to match all partitions on the disk as well as the disk
   # (Do not include the "a/.*/" in your filter or your r will not be processed)

2. Build a new initrd which will contain your updated lvm.conf
	mkinitrd -f /boot/initramfs-$(uname -r).img $(uname -r)

3. Edit your grub.conf and add this to the kernel line:
	rd_FLASHCACHE=/dev/HDD:/dev/SSD:fc_HDD

4. In grub.conf, if you have root=/dev/HDD then change it so that root=/dev/mapper/fc_HDD

Reboot!

Once you boot up successfully, it will still take some time for your recent disk access to fill the cache. Don't expect instant results.

Advanced Setup
==============

Once you have writethru caching working, you can try writeback by editing grub.conf to have
	rd_FLASHCACHE=/dev/HDD:/dev/SSD:fc_HDD:back
This will keep your cache over a reboot. This gives faster boot times but has some risks associated with it.

Uninstalling
============

If you set up a writeback cache and you then want to remove it, you can safely do this by editing grub.conf and setting it to type none. After a reboot it will be gone, and then you can edit grub.conf and remove the rd_FLASHCACHE= option totally.

Examples
========

Using basic LVM and want to cache your PV:
	root=/dev/mapper/vg0-lv_root rd_FLASHCACHE=/dev/sda2:/dev/sdb:fc_sda2

Using basic LVM and want to cache your root LV only:
	root=/dev/mapper/fc_root rd_FLASHCACHE=/dev/mapper/vg0-lv_root:/dev/sdb:fc_root

Using software RAID and want to cache the raid dev:
	rd_FLASHCACHE=/dev/md0:/dev/sdb:fc_md0

Notes
=====

I don't think we need to lvm blacklist the real disk. Once the flashcache is loaded, lvm won't be able to use the device. This allows a fall-back in the case that the flashcache does not load. Only if you have a dirty writeback cache will this be a problem... (and it could be a BIG problem, particularly if you flush the cache in the future after you have done an fsck on the real disk device!)

My examples use fc_... as the cache name. I think this helps to remember where the data is coming from. The actual string you use is up to you.

Writeback mode does not have any on disk header/signature, so there is no safety if you make a mistake with your device names. Be careful.

What if you want flashcache for a disk which is not needed at boot time? Should we have a config file, or should there be a run time udev rule to read the command line? For now, specify them all in grub.conf and the runtime udev rule will process them once your disk subsystem is ready, so you can use software RAID etc.
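After the reboot, you can confirm the cache device actually came up before digging through logs (illustrative commands; fc_sda2 is the example name used above):

	ls -l /dev/mapper/fc_sda2
	dmsetup status fc_sda2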
dracut-flashcache Copyright (C) 2012 John Newbigin

This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

== Old Notes ==

dracut module for flashcache
Written by John Newbigin jnewbigin@chrysocome.net

This will enable you to use the elrepo packaged flashcache module with your root filesystem and/or physical volumes.

Step : Install the required software
	yum install flashcache-utils kmod-flashcache dracut-flashcache

Step : Blacklist your SSD
To prevent LVM from getting confused, you should configure it so that it never tries to find physical volumes on your ssd disk or partitions (depending on your setup). To do this, edit /etc/lvm/lvm.conf and change your 'filter =' entry. My ssd is /dev/sdd so my filter looks like this:
	filter = [ "r|/dev/sdd|" ]
I could also do this:
	filter = [ "r|^/dev/sdd1$|", "r|^/dev/sdd2$|" ]
if I wanted to be more specific. (Do not include the "a/.*/" or your r will not be processed.)

I don't think we need to blacklist the real disk. Once the flashcache is loaded, lvm won't be able to use the device. This allows a fallback in the case that the flashcache does not load. Only if you have a dirty writeback cache will this be a problem.

Step : Build a new initrd
We need to get the flashcache files and your new lvm config into the initrd
	mkinitrd -f /boot/initramfs-$(uname -r).img $(uname -r)

Step : Edit grub.conf
You can of course do this from grub, but it is much easier to do with a text editor. Add this to the end of your kernel line:
	rd_FLASHCACHE=/dev/my_real_disk:/dev/my_ssd:fc_new_name
You must substitute the correct values for my_real_disk, my_ssd and fc_new_name:

my_real_disk is where you store your data. It might be a disk or partition or a logical volume. eg: /dev/sda2 (partition)
my_ssd is your high speed disk (probably an ssd). eg: /dev/sdb1 (a partition)
fc_new_name is the name used to access the cached device. I recommend fc_ followed by the original name. eg: fc_sda2 (don't use /dev here)

Note: it is possible for disk names to change, so it might be safer to use a unique name for your devices, something from /dev/disk/by-id/. Unfortunately I use : as the separator, so you can't use /dev/disk/by-path/.

There is also an optional 4th parameter which I will cover below.

Step : Reboot

Using write back
================

The default mode is writethrough (or thru), which will ensure that your data is safely stored on your real disk. This is the safest option, because if you have a crash/powerfail, ssd fail or boot problem, your data is safe.

For better write performance, and read performance from boot, you can enable writeback mode. This is relatively safe. The problem is that if you crash or powerfail and then have an ssd fail or boot problem, you can lose data. This may just cause a loss of recent changes, but it could also cause filesystem corruption and a total loss of everything. (What would happen if: boot with writeback. Crash. Boot without any flashcache & repair filesystem (say you accidentally boot into a live CD). Then reboot and re-activate your dirty writeback. Stale data is now written onto your disk, causing fresh corruption.)

Don't enable this until you know that:
- You can boot/reboot successfully with write thru
- You have a UPS
- You have backups of your data

To enable writeback mode, add :back to the end of your rd_FLASHCACHE settings. You can change this in the future and revert to thru, but you must do a clean reboot to correctly remove the writeback cache so you don't lose data.
I.e., to remove a writeback cache:
* Boot with :back
* Do a clean shut down
* Boot with :thru
* Shut down
* Boot with no rd_FLASHCACHE

Finally, if you enable fast_remove, every reboot may (will?) leave data in the cache only, so you must reboot in order to save your valuable data. In this configuration you can't change the type to thru or you lose your data. First, disable fast_remove, then reboot, then reboot again and remove the writeback cache.

HDD   SSD   MODE   ACTION
x     x     -      thru
x     x     new    thru
x     x     -      back
x     x     new    back
x     x     -      auto
x     x     new    auto
x     x     -      none
x     x     back   back
auto  x     back   back
x     x     back   thru
x     x     back   auto
auto  x     back   auto
x     x     back   none
x     x     new    thru
x     x     new    back
x     x     new    auto
x     x     new    none

flashcache_init /dev/ssd
Write a header to the SSD to identify it as a candidate for use at create time.

flashcache_info /dev/xxx
Query the dev:
- N/A
- Clean
- Dirty
- Fastclean?
- Unstable?
- New

==== flashcache-3.1.3+git20150701/doc/flashcache-doc.txt ====

Flashcache : A Write Back Block Cache for Linux
Author: Mohan Srinivasan
-----------------------------------------------

Introduction :
============

Flashcache is a write back block cache Linux kernel module. This document describes the design, future ideas, configuration and tuning of flashcache, and concludes with a note covering the testability hooks within flashcache and the testing that we did. Flashcache was built primarily as a block cache for InnoDB, but is general purpose and can be used by other applications as well.

Design :
======

Flashcache is built using the Linux Device Mapper (DM), part of the Linux Storage Stack infrastructure that facilitates building SW-RAID and other components. LVM, for example, is built using the DM.

The cache is structured as a set associative hash, where the cache is divided up into a number of fixed size sets (buckets) with linear probing within a set to find blocks. The set associative hash has a number of advantages (called out in sections below) and works very well in practice. The block size, set size and cache size are configurable parameters, specified at cache creation. The default set size is 512 (blocks) and there is little reason to change this.

In what follows, dbn refers to "disk block number", the logical device block number in sectors. To compute the target set for a given dbn:

	target set = (dbn / block size / set size) mod (number of sets)

Once we have the target set, a linear probe within the set finds the block. Note that a sequential range of disk blocks will all map onto a given set.
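As a concrete illustration of the set computation (a sketch; the numbers correspond to a hypothetical 1GB cache with 4KB blocks and the default 512-block sets):

	# dbn is in sectors; an 8-sector block size = 4KB
	dbn=123456789
	block_size=8        # sectors per cache block
	set_size=512        # cache blocks per set
	num_sets=512        # (1GB / 4KB) blocks / 512 blocks per set
	echo $(( (dbn / block_size / set_size) % num_sets ))    # prints 444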
The DM layer breaks up all IOs into blocksize chunks before passing the IOs down to the cache layer. By default, flashcache caches all full blocksize IOs, but can be configured to only cache random IO whilst ignoring sequential IO.

Replacement policy is either FIFO or LRU within a cache set. The default is FIFO, but the policy can be switched at any point at run time via a sysctl (see the configuration and tuning section).

To handle a cache read, compute the target set (from the dbn) and linear search for the dbn in the set. In the case of a cache hit, the read is serviced from flash. For a cache miss, the data is read from disk, populated into flash and the data returned from the read.

Since the cache is writeback, a write only writes to flash, synchronously updates the cache metadata (to mark the cache block as dirty) and completes the write. On a block re-dirty, the metadata update is skipped.

It is important to note that in the first cut, cache writes are non-atomic, ie, the "Torn Page Problem" exists. In the event of a power failure or a failed write, part of the block could be written, resulting in a partial write. We have ideas on how to fix this and provide atomic cache writes (see the Futures section).

Each cache block has on-flash metadata associated with it for cache persistence. This per-block metadata consists of the dbn (disk block cached in this slot) and flags (DIRTY, VALID, INVALID). Cache metadata is only updated on a write or when a cache block is cleaned. The former results in the state being marked DIRTY and the latter results in the state being marked ~DIRTY. To minimize small flash writes, cache block metadata is not updated in the read path.

In addition, we also have an on-flash cache superblock, which contains cache parameters (read on a cache reload) and whether the cache shutdown was clean (orderly) or unclean (node crash, power failure etc). On a clean cache shutdown, metadata for all cache blocks is written out to flash. After an orderly shutdown, both VALID and DIRTY blocks will persist on a subsequent cache reload. After a node crash or a power failure, only DIRTY cache blocks will persist on a subsequent cache reload. Node crashes or power failures will not result in data loss, but they will result in the cache losing VALID and non-DIRTY cached blocks.

Cache metadata updates are "batched" when possible. So if we have pending metadata updates to multiple cache blocks which fall on the same metadata sector, we batch these updates into 1 flash metadata write. When a file is written sequentially, we will commonly be able to batch several metadata updates (resulting from sequential block writes) into 1 cache metadata update.

Dirty cache blocks are written lazily to disk in the background. Flashcache's lazy writing is controlled by a configurable dirty threshold (see the configuration and tunings section). Flashcache strives to keep the percentage of dirty blocks in each set below the dirty threshold. When the dirty blocks in a set exceed the dirty threshold, the set is eligible for cleaning.

Dirty blocks are also cleaned based on "idleness". By default, a dirty block not read or written for 15 minutes (dev.flashcache.fallow_delay) will be cleaned. To disable idle cleaning, set that value to 0. A 2-handed clock-like algorithm is used to pick off fallow dirty blocks to clean.

DIRTY blocks are selected for cleaning based on the replacement policy (FIFO vs LRU). Once we have a target set of blocks to clean, we sort these blocks, search for other contiguous dirty blocks in the set (which can be cleaned for free, since they'll be merged into a large IO) and send the writes down to the disk.
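For example, to clean more aggressively on a running cache (a sketch; the per-device sysctl names follow the pattern documented in the SA guide, shown here for a cache built from /dev/sdb and /dev/cciss/c0d2):

	sysctl -w dev.flashcache.sdb+c0d2.fallow_delay=300      # clean blocks idle for 5 minutes
	sysctl -w dev.flashcache.sdb+c0d2.dirty_thresh_pct=10   # aim for under 10% dirty per set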
As mentioned earlier, the DM will break IOs into blocksize pieces before passing them on to flashcache. For smaller (than blocksize) IOs, or IOs that straddle 2 cache blocks, we pass the IO directly to disk. But before doing so, we invalidate any cacheblocks that overlap the IO. If the overlapping cacheblocks are DIRTY, we clean those cacheblocks and pass the new overlapping IO to disk after those are successfully cleaned. Invalidating cacheblocks for IOs that overlap 2 cache blocks is easy with a set associative hash; we need to search for overlaps in precisely 2 cache sets.

Flashcache has support for block checksums, which are computed on cache population and validated on every cache read. Block checksums are a compile-time switch, turned off by default because of the "Torn Page" problem. If a cache write fails after part of the block was committed to flash, the block checksum will be wrong and any subsequent attempt to read that block will fail (because of checksum mismatches).

How much cache metadata overhead do we incur? For each cache block, we have in-memory state of 24 bytes (on 64 bit architectures) and 16 bytes of on-flash metadata state. For a 300GB cache with 16KB blocks, we have approximately 20 Million cacheblocks, resulting in an in-memory metadata footprint of 480MB. If we were to configure a 300GB cache with 4KB pages, that would quadruple to 1.8GB.

It is possible to mark IOs issued by particular pids as noncacheable via flashcache ioctls. If a process is about to scan a large table sequentially (for a backup, say), it can mark itself as non-cacheable. For a read issued by a "non cacheable" process, if the read results in a cache hit, the data is served from cache. If the read results in a cache miss, the read is served directly from disk (without a cache population). For a write issued by a non cacheable process, the write is sent directly to disk. But before that is done, we invalidate any overlapping cache blocks (cleaning them first if necessary).

A few things to note about tagging pids non-cacheable. First, this only really works reliably with Direct IO. For buffered IO, writes will almost always happen from kernel threads (eg pdflush), so writes will continue to be cached. For most filesystems, these ioctls will make buffered reads uncached - readaheads will be kicked off by the filemap code, so the readaheads will be kicked off from the same context as the reads. If a process that marked itself non-cacheable dies, flashcache has no way of cleaning up (the Linux kernel doesn't have an at_exit() hook). Applications have to work around this (see configuration below). The cleanup issue can be fixed by making the cache control aspect of flashcache a pseudo-filesystem, so that the last close of the fd on process exit cleans things up (see Futures for more details). In spite of the limitations, we think the ability to mark Direct IOs issued by a pid will be valuable to prevent backups from wiping out the cache.

Alternatively, rather than specifically marking pids as non-cacheable, users may wish to experiment with the sysctl 'skip_seq_thresh_kb', which disables caching of IO determined to be sequential, above a configurable threshold of consecutive reads or writes. The algorithm to spot sequential IO has some ability to handle multiple 'flows' of IO, so it should, for example, be able to skip caching of IOs of two flows of sequential reads or writes, but only cache IOs from a third random IO flow. Note that multiple small files may be written to consecutive blocks. If these are written out in a batch (e.g. by an untar), this may appear as a single sequential write, hence these multiple small files will not be cached. The categorization of IO as sequential or random occurs purely at the block level, not the file level. (For a more detailed discussion about caching controls, see the SA Guide.)

Futures and Features :
====================

Cache Mirroring :
---------------

Mirroring the cache across 2 physical flash devices should work without any code changes. Since the cache device is a block device, we can build a RAID-1 block device out of the 2 physical flash devices and use that as our cache device. (I have not yet tested this.)
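A sketch of what that would look like (untested, per the note above; it assumes the two ssds are /dev/sdc and /dev/sdd and the disk being cached is /dev/sdb):

	mdadm --create /dev/md0 --level=1 --raid-devices=2 /dev/sdc /dev/sdd
	flashcache_create -p back cachedev /dev/md0 /dev/sdb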
Cache Resizing :
--------------

The easiest way to resize the cache is to bring the cache offline, and then resize. Resizing the cache when active is complicated and bug prone.

Integration with ATA TRIM Command :
---------------------------------

The ATA TRIM command was introduced as a way for the filesystem to inform the ssd that certain blocks were no longer in use, to facilitate improved wear levelling algorithms in the ssd controller. Flashcache can leverage this as well. We can simply discard all blocks falling within a TRIM block range from the cache, regardless of their state, since they are no longer needed.

Deeper integration with filesystems :
-----------------------------------

Non-cacheability could be much better implemented with a deeper integration of flashcache and the filesystem. The filesystem could easily tag IOs as non-cacheable, based on user actions.

Fixing the "Torn Page Problem" (make Cache Writes atomic) :
---------------------------------------------------------

As mentioned above, cache block writes are non-atomic. If we have a power failure or if the flash write fails, part of the block (a few sectors) could be written out, corrupting the block. In this respect, flashcache behaves no differently from disk. We have ideas on how to fix this and achieve atomic cache block writes using shadow paging techniques. Mark C. says that we could avoid doublebuffer writes if we have atomic cache block writes. However, with flashcache, doublebuffer writes will all be absorbed by the flash (and we should get excellent write hits/overwrites for doublebuffer blocks, so they would never hit disk). So it is not clear how much of a win atomic cache block writes will be. It is however a handy feature to provide. If we have atomic block writes, we could also enable cache block checksums.

There are broadly 3 ways to fix this.
1) If the flash device offers configurable sector sizes, configure it to match the cache block size (FusionIO offers up to a 4KB configurable sector size).
2) If we choose a shadow page that falls in the same metadata sector as the page being overwritten, we can do the shadow page write and switch the metadata atomically.
3) If we don't want the restriction that the shadow page and the page overwritten are part of the same metadata sector, to allow us to pick a shadow page more freely across the cache set, we would need to introduce a monotonically increasing timestamp per write in the cache metadata that will allow us to disambiguate dirty blocks in the event of a crash.

Breaking up the cache spinlock :
------------------------------

All cache state is protected by a single spinlock. Currently CPU utilization in the cache routines is very low, and there is no contention on this spinlock. That may change in the future.

Make non-cacheability more robust :
---------------------------------

The non-cacheability aspect needs fixing in terms of cleanup when a process dies. Probably the best way to approach this is in a pseudo-filesystemish way. Several other implementation TODOs/Futures are documented in the code.

Testing and Testability :
=======================

Stress Tester :
-------------

I modified NetApp's open source sio load generator, adding support for data verification to it with block checksums maintained in an mmap'ed file. I've been stress testing the cache with this tool. We can vary the read/write mix, seq/rand IO mix, block size, direct IO vs buffered IO, number of IO threads etc with this tool. In addition, I've used other workloads to stress test flashcache.
Error Injection :
---------------

I've added hooks for injecting all kinds of errors into the flashcache code (flash IO errors, disk IO errors, various kernel memory allocation errors). The error injection can be controlled by a sysctl "error_inject". Writing the following flags into "error_inject" causes the next event of that type to result in an error. The flag is cleared after the error is simulated, so we'd need to set the flag for each error we'd like to simulate.

/* Error injection flags */
#define READDISK_ERROR				0x00000001
#define READCACHE_ERROR				0x00000002
#define READFILL_ERROR				0x00000004
#define WRITECACHE_ERROR			0x00000008
#define WRITECACHE_MD_ERROR			0x00000010
#define WRITEDISK_MD_ERROR			0x00000020
#define KCOPYD_CALLBACK_ERROR			0x00000040
#define DIRTY_WRITEBACK_JOB_ALLOC_FAIL		0x00000080
#define READ_MISS_JOB_ALLOC_FAIL		0x00000100
#define READ_HIT_JOB_ALLOC_FAIL			0x00000200
#define READ_HIT_PENDING_JOB_ALLOC_FAIL		0x00000400
#define INVAL_PENDING_JOB_ALLOC_FAIL		0x00000800
#define WRITE_HIT_JOB_ALLOC_FAIL		0x00001000
#define WRITE_HIT_PENDING_JOB_ALLOC_FAIL	0x00002000
#define WRITE_MISS_JOB_ALLOC_FAIL		0x00004000
#define WRITES_LIST_ALLOC_FAIL			0x00008000
#define MD_ALLOC_SECTOR_ERROR			0x00010000

I then use a script like this to simulate errors under heavy IO load.

#!/bin/bash
for ((debug = 0x00000001 ; debug <= 0x00010000 ; debug = debug * 2))
do
	echo $debug > /proc/sys/dev/flashcache/error_inject
	sleep 1
done

Acknowledgements :
================

I would like to thank Bob English for doing a critical review of the design and the code of flashcache, for discussing this in detail with me, and for providing valuable suggestions. The option to detect and skip sequential IO was added by Will Smith.

==== flashcache-3.1.3+git20150701/doc/flashcache-sa-guide.txt ====

FlashCache System Administration Guide
--------------------------------------

Introduction :
============

Flashcache is a block cache for Linux, built as a kernel module, using the Device Mapper. Flashcache supports writeback, writethrough and writearound caching modes. This document is a quick administration guide to flashcache.

Requirements :
============

Flashcache has been tested on a variety of kernels between 2.6.18 and 2.6.38. If you'd like to build and use it on a newer kernel, please send me an email and I can help. I will not support kernels older than 2.6.18.

Choice of Caching Modes :
=========================

Writethrough - safest; all writes are cached to ssd but also written to disk immediately. If your ssd has slower write performance than your disk (likely for early generation SSDs purchased in 2008-2010), this may limit your system write performance. All disk reads are cached (tunable).

Writearound - again, very safe; writes are not written to ssd but directly to disk. Disk blocks will only be cached after they are read. All disk reads are cached (tunable).

Writeback - fastest but less safe. Writes only go to the ssd initially, and based on various policies are written to disk later. All disk reads are cached (tunable).

Writeonly - variant of writeback caching. In this mode, only incoming writes are cached. No reads are ever cached.

Cache Persistence :
=================

Writethrough and Writearound caches are not persistent across a device removal or a reboot. Only Writeback caches are persistent across device removals and reboots. This reinforces 'writeback is fastest', 'writethrough is safest'.
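In practice this means that, after a reboot, a writeback cache is re-attached rather than recreated (an illustration; /dev/sdc is the ssd device used in the examples below):

	flashcache_load /dev/sdc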
Known Bugs :
============

See https://github.com/facebook/flashcache/issues and please report new issues there. Data corruption has been reported when using a loopback device for the cache device.

See also the 'Futures and Features' section of the design document, flashcache-doc.txt.

Cache creation and loading using the flashcache utilities :
=========================================================

Included are 3 utilities - flashcache_create, flashcache_load and flashcache_destroy. These utilities use dmsetup internally, presenting a simpler interface to create, load and destroy flashcache volumes. It is expected that the majority of users can use these utilities instead of using dmsetup.

flashcache_create : Create a new flashcache volume.

flashcache_create [-v] -p back|around|thru [-s cache size] [-w] [-b block size] cachedevname ssd_devname disk_devname

-v : verbose.
-p : cache mode (writeback/writethrough/writearound).
-s : cache size. Optional. If this is not specified, the entire ssd device is used as cache. The default unit is sectors, but you can specify k/m/g as units as well.
-b : block size. Optional. Defaults to 4KB. Must be a power of 2. The default unit is sectors, but you can specify k as the unit as well. (A 4KB blocksize is the correct choice for the vast majority of applications. But see the section "Cache Blocksize selection" below.)
-f : force create. Bypass checks (eg for ssd sectorsize).
-w : write cache mode. Only writes are cached, not reads.
-d : disk associativity; within each cache set, we store several contiguous disk extents. Defaults to off.

Examples :

flashcache_create -p back -s 1g -b 4k cachedev /dev/sdc /dev/sdb

Creates a 1GB writeback cache volume with a 4KB block size on ssd device /dev/sdc to cache the disk volume /dev/sdb. The name of the device created is "cachedev".

flashcache_create -p thru -s 2097152 -b 8 cachedev /dev/sdc /dev/sdb

Same as above, but creates a write through cache with units specified in sectors instead. The name of the device created is "cachedev".

flashcache_load : Load an existing writeback cache volume.

flashcache_load ssd_devname [cachedev_name]

Example :

flashcache_load /dev/sdc

Load the existing writeback cache on /dev/sdc, using the virtual cachedev_name from when the device was created. If you're upgrading from an older flashcache device format that didn't store the cachedev name internally, or you want to change the cachedev name used, you can specify it as an optional second argument to flashcache_load.

For writethrough and writearound caches, flashcache_load is not needed; flashcache_create should be used each time.

flashcache_destroy : Destroy an existing writeback flashcache. All data will be lost !!!

flashcache_destroy ssd_devname

Example :

flashcache_destroy /dev/sdc

Destroy the existing cache on /dev/sdc. All data is lost !!! For writethrough and writearound caches this is not necessary.

Removing a flashcache volume :
============================

Use dmsetup remove to remove a flashcache volume. For writeback cache mode, the default behavior on a remove is to clean all dirty cache blocks to disk. The remove will not return until all blocks are cleaned. Progress on disk cleaning is reported on the console (also see the "fast_remove" flashcache sysctl). A reboot of the node will also result in all dirty cache blocks being cleaned synchronously (again, see the note about "fast_remove" in the sysctls section).

For writethrough and writearound caches, device removal or reboot results in the cache being destroyed. However, there is no harm in doing a 'dmsetup remove' to tidy up before boot, and indeed this will be needed if you ever need to unload the flashcache kernel module (for example, to load a new version into a running system).

Example:

dmsetup remove cachedev

This removes the flashcache volume named cachedev, cleaning all blocks prior to removal.
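If you need a quick detach instead, and can tolerate dirty blocks staying uncommitted on the ssd until the next flashcache_load, something like this would do it (a sketch; sysctl naming as documented in the sysctls section below, for the sdc/sdb example devices above):

	sysctl -w dev.flashcache.sdc+sdb.fast_remove=1
	dmsetup remove cachedev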
Cache Stats :
===========

Use 'dmsetup status' for cache statistics. 'dmsetup table' also dumps a number of cache related statistics.

Examples :

dmsetup status cachedev
dmsetup table cachedev

Flashcache errors are reported in /proc/flashcache/<cachedev>/flashcache_errors. Flashcache stats are also reported in /proc/flashcache/<cachedev>/flashcache_stats for easier parseability.

Using Flashcache sysVinit script (Redhat based systems):
=======================================================

Kindly note that this section only applies to Redhat based systems. Use 'utils/flashcache' from the repository as the sysvinit script. This script is used to load, unload and get statistics of an existing flashcache writeback cache volume. It helps in loading the already created cachedev during system boot, and removes the flashcache volume before system halt happens. This script is necessary because, when a flashcache volume is not removed before the system halts, a kernel panic occurs.

Configuring the script using chkconfig:
1. Copy 'utils/flashcache' from the repo to '/etc/init.d/flashcache'.
2. Make sure this file has execute permissions: 'sudo chmod +x /etc/init.d/flashcache'.
3. Edit this file and specify the values for the following variables: SSD_DISK, BACKEND_DISK, CACHEDEV_NAME, MOUNTPOINT, FLASHCACHE_NAME.
4. Modify the headers in the file if necessary. By default, it starts in runlevel 3, with start-stop priority 90-10.
5. Register this file using chkconfig: 'chkconfig --add /etc/init.d/flashcache'.

Cache Blocksize selection :
=========================

Cache blocksize selection is critical for good cache utilization and performance. A 4KB cache blocksize is the right choice for the vast majority of workloads (and filesystems).

Cache Metadata Blocksize selection :
==================================

This section only applies to the writeback cache mode. Writethrough and writearound modes store no cache metadata at all.

In Flashcache version 1, the metadata blocksize was fixed at 1 (512b) sector. Flashcache version 2 removes this limitation. In version 2, we can configure a larger flashcache metadata blocksize. Version 2 maintains backwards compatibility for caches created with Version 1. For these cases, a metadata blocksize of 512 will continue to be used.

flashcache_create -m can be used to optionally configure the metadata blocksize. Defaults to 4KB. Ideal choices for the metadata blocksize are 4KB (default) or 8KB. There is little benefit to choosing a metadata blocksize greater than 8KB.

The choice of metadata blocksize is subject to the following rules :
1) Metadata blocksize must be a power of 2.
2) Metadata blocksize cannot be smaller than the sector size configured on the ssd device.
3) A single metadata block cannot contain metadata for 2 cache sets. In other words, with the default associativity of 512 (with each cache metadata slot sizing at 16 bytes), the entire metadata for a given set fits in 8KB (512*16b). For an associativity of 512, we cannot configure a metadata blocksize greater than 8KB.
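For example, a writeback cache created with an 8KB metadata blocksize might look like this (illustrative; same devices as the earlier create example, and assuming -m accepts the same k suffix as -b):

	flashcache_create -p back -s 1g -b 4k -m 8k cachedev /dev/sdc /dev/sdb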
FlashCache Sysctls :
==================
Flashcache sysctls operate on a per-cache device basis. A couple of examples first.

Sysctls for a writearound or writethrough mode cache :
cache device /dev/ram3, disk device /dev/ram4

dev.flashcache.ram3+ram4.cache_all = 1
dev.flashcache.ram3+ram4.zero_stats = 0
dev.flashcache.ram3+ram4.reclaim_policy = 0
dev.flashcache.ram3+ram4.pid_expiry_secs = 60
dev.flashcache.ram3+ram4.max_pids = 100
dev.flashcache.ram3+ram4.do_pid_expiry = 0
dev.flashcache.ram3+ram4.io_latency_hist = 0
dev.flashcache.ram3+ram4.skip_seq_thresh_kb = 0

Sysctls for a writeback mode cache :
cache device /dev/sdb, disk device /dev/cciss/c0d2

dev.flashcache.sdb+c0d2.fallow_delay = 900
dev.flashcache.sdb+c0d2.fallow_clean_speed = 2
dev.flashcache.sdb+c0d2.cache_all = 1
dev.flashcache.sdb+c0d2.fast_remove = 0
dev.flashcache.sdb+c0d2.zero_stats = 0
dev.flashcache.sdb+c0d2.reclaim_policy = 0
dev.flashcache.sdb+c0d2.pid_expiry_secs = 60
dev.flashcache.sdb+c0d2.max_pids = 100
dev.flashcache.sdb+c0d2.do_pid_expiry = 0
dev.flashcache.sdb+c0d2.max_clean_ios_set = 2
dev.flashcache.sdb+c0d2.max_clean_ios_total = 4
dev.flashcache.sdb+c0d2.dirty_thresh_pct = 20
dev.flashcache.sdb+c0d2.stop_sync = 0
dev.flashcache.sdb+c0d2.do_sync = 0
dev.flashcache.sdb+c0d2.io_latency_hist = 0
dev.flashcache.sdb+c0d2.skip_seq_thresh_kb = 0

Sysctls common to all cache modes :

dev.flashcache.<cachedev>.cache_all :
	Global caching mode to cache everything or cache nothing. See the section on Caching Controls. Defaults to "cache everything".
dev.flashcache.<cachedev>.zero_stats :
	Zero stats (once).
dev.flashcache.<cachedev>.reclaim_policy :
	FIFO (0) vs LRU (1). Defaults to FIFO. Can be switched at runtime.
dev.flashcache.<cachedev>.io_latency_hist :
	Compute IO latencies and plot these out on a histogram. The scale is 250 usecs. This is disabled by default, since internally flashcache uses gettimeofday() to compute latency, and this can get expensive depending on the clocksource used. Setting this to 1 enables computation of IO latencies. The IO latency histogram is appended to 'dmsetup status'.

(There is little reason to tune these.)
dev.flashcache.<cachedev>.max_pids :
	Maximum number of pids in the white/black lists.
dev.flashcache.<cachedev>.do_pid_expiry :
	Enable expiry on the list of pids in the white/black lists.
dev.flashcache.<cachedev>.pid_expiry_secs :
	Set the expiry on the pid white/black lists.

dev.flashcache.<cachedev>.skip_seq_thresh_kb :
	Skip (don't cache) sequential IO larger than this number (in kb). 0 (the default) means cache all IO, both sequential and random. Sequential IO can only be determined 'after the fact', so this much of each sequential I/O will be cached before we skip the rest. Does not affect searching for IO in an existing cache.

Sysctls for writeback mode only :

dev.flashcache.<cachedev>.fallow_delay = 900
	In seconds. Clean dirty blocks that have been "idle" (not read or written) for fallow_delay seconds. The default is 15 minutes. Setting this to 0 disables idle cleaning completely.
dev.flashcache.<cachedev>.fallow_clean_speed = 2
	The maximum number of "fallow clean" disk writes per set per second. Defaults to 2.
dev.flashcache.<cachedev>.fast_remove = 0
	Don't sync dirty blocks when removing the cache. On a reload, both DIRTY and CLEAN blocks persist in the cache. This option can be used to do a quick cache remove. CAUTION: The cache still has uncommitted (to disk) dirty blocks after a fast_remove.
dev.flashcache.<cachedev>.dirty_thresh_pct = 20
	Flashcache will attempt to keep the dirty blocks in each set under this percentage. A lower dirty threshold increases disk writes and reduces block overwrites, but increases the blocks available for read caching.
dev.flashcache.<cachedev>.stop_sync = 0
	Stop a sync in progress.
dev.flashcache.<cachedev>.do_sync = 0
	Schedule cleaning of all dirty blocks in the cache.

(There is little reason to tune these.)
dev.flashcache.<cachedev>.max_clean_ios_set = 2
	Maximum writes that can be issued per set when cleaning blocks.
dev.flashcache.<cachedev>.max_clean_ios_total = 4
	Maximum writes that can be issued when syncing all blocks.
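All of these can be inspected and changed with the standard sysctl tool. For example, for the writeback cache above (a sketch; substitute your own cache and disk device names) :

sysctl dev.flashcache.sdb+c0d2.dirty_thresh_pct
sysctl -w dev.flashcache.sdb+c0d2.reclaim_policy=1
sysctl -w dev.flashcache.sdb+c0d2.do_sync=1

The first reads the current dirty threshold, the second switches reclaim to LRU at runtime, and the third schedules cleaning of all dirty blocks.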
Using dmsetup to create and load flashcache volumes :
===================================================
Few users will need to use dmsetup natively to create and load flashcache volumes. This section covers that.

dmsetup create device_name table_file

where

device_name : name of the flashcache device being created or loaded.
table_file  : other cache args (format below). If this is omitted, dmsetup attempts to read this from stdin.

table_file format :
0 <disk dev size in sectors> flashcache <disk dev> <ssd dev> <cachedev name> <cache mode> <flashcache cmd> <blksize in sectors> [size of cache in sectors] [cache set size]

cache mode :
	1: Write Back
	2: Write Through
	3: Write Around

flashcache cmd :
	1: load existing cache
	2: create cache
	3: force create cache (overwriting an existing cache). USE WITH CAUTION.

blksize in sectors :
	4KB (8 sectors, PAGE_SIZE) is the right choice for most applications. See the note on block size selection below. Unused (can be omitted) for cache loads.

size of cache in sectors :
	Optional. If the size is not specified, the entire ssd device is used as cache. Needs to be a power of 2. Unused (can be omitted) for cache loads.

cache set size :
	Optional. The default set size is 512, which works well for most applications. There is little reason to change this. Needs to be a power of 2. Unused (can be omitted) for cache loads.

Example :
echo 0 `blockdev --getsize /dev/cciss/c0d1p2` flashcache /dev/cciss/c0d1p2 /dev/fioa2 cachedev 1 2 8 522000000 | dmsetup create cachedev

This creates a writeback cache device called "cachedev" (/dev/mapper/cachedev) with a 4KB blocksize to cache /dev/cciss/c0d1p2 on /dev/fioa2. The size of the cache is 522000000 sectors.

(TODO : Change loading of the cache to happen via "dmsetup load" instead of "dmsetup create".)
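Based on the same table format, a writethrough cache over the same devices would use cache mode 2. Since writethrough caches store no metadata, the create command (2) is used every time; here the cache size is omitted so the entire ssd is used (a sketch; device names are examples) :

echo 0 `blockdev --getsize /dev/cciss/c0d1p2` flashcache /dev/cciss/c0d1p2 /dev/fioa2 cachedev 2 2 8 | dmsetup create cachedev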
Caching Controls
================
Flashcache can be put in one of 2 modes - Cache Everything or Cache Nothing (dev.flashcache.<cachedev>.cache_all). The default is to "cache everything".

These 2 modes have a blacklist and a whitelist.

The tgid (thread group id) for a group of pthreads can be used as a shorthand to tag all threads in an application. The tgid for a pthread is returned by getpid(), and the pid of the individual thread is returned by gettid().

The algorithm works as follows :

In "cache everything" mode,
1) If the pid of the process issuing the IO is in the blacklist, do not cache the IO. ELSE,
2) If the tgid is in the blacklist, don't cache this IO. UNLESS
3) The particular pid is marked as an exception (and entered in the whitelist, which makes the IO cacheable).
4) Finally, even if the IO is cacheable up to this point, skip sequential IO if configured by the sysctl.

Conversely, in "cache nothing" mode,
1) If the pid of the process issuing the IO is in the whitelist, cache the IO. ELSE,
2) If the tgid is in the whitelist, cache this IO. UNLESS
3) The particular pid is marked as an exception (and entered in the blacklist, which makes the IO non-cacheable).
4) Anything whitelisted is cached, regardless of sequential or random IO.

Examples :
--------
1) You can make the global cache setting "cache nothing" and add the tgid of your pthreaded application to the whitelist. This makes only the IOs issued by your application cacheable by Flashcache.
2) You can make the global cache setting "cache everything" and add the tgids (or pids) of other applications that may issue IOs on this volume to the blacklist, which makes those un-interesting IOs non-cacheable.

Note that this only works for O_DIRECT IOs. For buffered IOs, the writes would also be issued by pdflush and kswapd, and flashcache would cache those.

The following cacheability ioctls are supported on /dev/mapper/<cachedev> (a usage sketch follows below) :

FLASHCACHEADDBLACKLIST : Add the pid (or tgid) to the blacklist.
FLASHCACHEDELBLACKLIST : Remove the pid (or tgid) from the blacklist.
FLASHCACHEDELALLBLACKLIST : Clear the blacklist. This can be used to clean up if a process dies.
FLASHCACHEADDWHITELIST : Add the pid (or tgid) to the whitelist.
FLASHCACHEDELWHITELIST : Remove the pid (or tgid) from the whitelist.
FLASHCACHEDELALLWHITELIST : Clear the whitelist. This can be used to clean up if a process dies.

/proc/flashcache_pidlists shows the list of pids on the whitelist and the blacklist.
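A minimal sketch of driving these ioctls from C. The header name and the passing of the pid by pointer follow the flashcache source tree, but treat both as assumptions to verify against your version (flashcache_ioctl.h in the source tree defines the ioctl numbers) :

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include "flashcache_ioctl.h"	/* assumed: defines FLASHCACHEADDWHITELIST etc. */

int main(void)
{
	pid_t pid = getpid();	/* tgid of this (possibly pthreaded) process */
	int fd = open("/dev/mapper/cachedev", O_RDONLY);	/* example cachedev name */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* In "cache nothing" mode, make IOs issued by this tgid cacheable */
	if (ioctl(fd, FLASHCACHEADDWHITELIST, &pid) < 0)
		perror("FLASHCACHEADDWHITELIST");
	close(fd);
	return 0;
}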
Security Note :
=============
With Flashcache, it is possible for a malicious user process to corrupt data in files to which it has only read access. In a future revision of flashcache, this will be addressed (with an extra data copy). The mechanics of how a malicious process could corrupt data are deliberately not documented here.

You can work around this by setting file permissions on the files in the flashcache volume appropriately.

Why is my cache only (<< 100%) utilized ?
=======================================
(Answer contributed by Will Smith)

- There is essentially a 1:many mapping between SSD blocks and HDD blocks.
- In more detail, a HDD block gets hashed to a set on the SSD, which by default contains 512 blocks. It can only be stored in that set on the SSD, nowhere else.

So with a simplified SSD containing only 3 sets and a HDD with 9 sets worth of data, the HDD sets would map to the SSD sets like this :

HDD : 1 2 3 4 5 6 7 8 9
SSD : 1 2 3 1 2 3 1 2 3

So if your data only happens to live in HDD sets 1 and 4, they will compete for SSD set 1 and your SSD will become at most 33% utilized.

If you use XFS, you can tune the XFS agsize/agcount to try and mitigate this (described in the next section).

Tuning XFS for better flashcache performance :
============================================
If you run XFS/Flashcache, it is worth tuning XFS' allocation group parameters (agsize/agcount) to achieve better flashcache performance. XFS allocates blocks for files in a given directory in a new allocation group. By tuning agsize and agcount (mkfs.xfs parameters), we can achieve a much better distribution of blocks across flashcache. A better distribution of blocks across flashcache will decrease collisions on flashcache sets considerably, increase cache hit rates significantly and result in lower IO latencies.

We can achieve this by computing agsize (and implicitly agcount) using these equations, where C = cache size and V = size of the filesystem volume :

	agsize % C = (1/agcount)*C
	agsize * agcount ~= V

where agsize <= 1000g (an XFS limit on agsize).

A couple of examples that illustrate the formula :

For agcount = 4, let's divide up the cache into 4 equal parts (each part is of size C/agcount). Let's call the parts C1, C2, C3, C4. One ideal way to map the allocation groups onto the cache is as follows :

Ag1  Ag2  Ag3  Ag4
--   --   --   --
C1   C2   C3   C4   (stripe 1)
C2   C3   C4   C1   (stripe 2)
C3   C4   C1   C2   (stripe 3)
C4   C1   C2   C3   (stripe 4)
C1   C2   C3   C4   (stripe 5)

In this simple example, note that each "stripe" has 2 properties :
1) Each element of the stripe is a unique part of the cache.
2) The union of all the parts for a stripe gives us the entire cache.

Clearly, this is an ideal mapping from the point of view of distribution across the cache.

Another example, this time with agcount = 5; the cache is divided into 5 equal parts C1, .., C5.

Ag1  Ag2  Ag3  Ag4  Ag5
--   --   --   --   --
C1   C2   C3   C4   C5   (stripe 1)
C2   C3   C4   C5   C1   (stripe 2)
C3   C4   C5   C1   C2   (stripe 3)
C4   C5   C1   C2   C3   (stripe 4)
C5   C1   C2   C3   C4   (stripe 5)
C1   C2   C3   C4   C5   (stripe 6)

A couple of examples that compute the optimal agsize for a given cache size and filesystem volume size :

a) C = 600g, V = 3.5TB
   Consider agcount = 5
   agsize % 600 = (1/5)*600
   agsize % 600 = 120
   So an agsize of 720g would work well, and 720*5 = 3.6TB (~ 3.5TB).

b) C = 150g, V = 3.5TB
   Consider agcount = 4
   agsize % 150 = (1/4)*150
   agsize % 150 = 37.5
   So an agsize of 937.5g would work well, and 937.5*4 = 3.75TB (~ 3.5TB).

As an alternative,

	agsize % C = (1 - (1/agcount))*C
	agsize * agcount ~= V

works just as well as the formula above.

This computation has been implemented in the utils/get_agsize utility.

Tuning Sequential IO Skipping for better flashcache performance :
===============================================================
Skipping sequential IO makes sense in two cases :

1) The sequential write speed of your SSD is slower than the sequential write or read speed of your disk. In particular, for setups with RAID disks (especially modes 0, 10 or 5), sequential reads may be very fast. If 'cache_all' mode is used, every disk read miss must also be written to the SSD. If you notice slower sequential reads and writes after enabling flashcache, this is likely your problem.

2) Your 'resident set' of disk blocks that you want cached, i.e. those that you would hope to keep in cache, is smaller than the size of your SSD. You can check this by monitoring how quickly your cache fills up ('dmsetup table'). If this is the case, it makes sense to prioritize the caching of random IO, since SSD performance vastly exceeds disk performance for random IO, but is typically not much better for sequential IO.

In the above cases, start with a high value (say 1024k) for the sysctl dev.flashcache.<cachedev>.skip_seq_thresh_kb, so only the largest sequential IOs are skipped, and gradually reduce it if benchmarks show it's helping. Don't leave it set to a very high value if it is not helping; return it to 0 (the default), since there is some overhead in categorizing IO as random or sequential.

If neither of the above holds, continue to cache all IO (the default); you will likely benefit from it.
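For example, on the writethrough cache named earlier (a sketch; use your own cache and disk device names) :

sysctl -w dev.flashcache.ram3+ram4.skip_seq_thresh_kb=1024
(benchmark, gradually reduce the value, and if it is not helping :)
sysctl -w dev.flashcache.ram3+ram4.skip_seq_thresh_kb=0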
Further Information =================== Git repository : https://github.com/facebook/flashcache/ Developer mailing list : http://groups.google.com/group/flashcache-dev/ flashcache-3.1.3+git20150701/flashcache-wt/000077500000000000000000000000001254507146700177225ustar00rootroot00000000000000flashcache-3.1.3+git20150701/flashcache-wt/Makefile000066400000000000000000000003231254507146700213600ustar00rootroot00000000000000all: $(MAKE) -C src KERNEL_TREE=$(KERNEL_TREE) PWD=$(PWD)/src install: $(MAKE) -C src KERNEL_TREE=$(KERNEL_TREE) PWD=$(PWD)/src install clean: $(MAKE) -C src KERNEL_TREE=$(KERNEL_TREE) PWD=$(PWD)/src clean flashcache-3.1.3+git20150701/flashcache-wt/README000066400000000000000000000030671254507146700206100ustar00rootroot00000000000000flashcache-wt is a simple, non-persistent write-through and write-around flashcache. It is a separate code base from flashcache. Note that flashcache itself, which is more configurable, now has options for writeback, writethrough and writearound caching. Notes : ----- 1) flashcache-wt is non persistent, which means that on a cache remove (or a reboot), you will lose the cache entirely. Since the cache is write-through/write-around, this will not result in any data loss. 2) Built on 2.6.18, .20, .27-.32 successfully. Tested on .18, .20, .27 and .32 successfully. Building flashcache-wt : ---------------------- opsdev209.snc1.facebook.com> make KERNEL_TREE= Creating a flashcache-wt volume : ------------------------------- flashcache_wt_create : Create a new flashcache-wt volume. flashcache_wt_create [-r] [-s cache size] [-b block size] cachedevname ssd_devname disk_devname Very similar to flashcache_create. Note : The default is to create the cache write-through. Use the -r option to create the cache write-around. Removing a flashcache-wt volume : ---------------------------- Use dmsetup remove to remove a flashcache-wt volume. Example: dmsetup remove cachedev Cache Stats : =========== Use 'dmsetup status' for cache statistics. 'dmsetup table' also dumps a number of cache related statistics. Examples : dmsetup status cachedev dmsetup table cachedev Cache Blocksize Selection ------------------------- 4KB cache blocks are suitable for the vast majority of the cases. Also see the flashcache-sa-guide for more discussion on this. 
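For completeness, a concrete creation example in the style of the sa-guide (a sketch; device names are examples only) :

flashcache_wt_create -s 1g -b 4k cachedev /dev/sdc /dev/sdb

This creates a 1GB write-through cache named cachedev on ssd device /dev/sdc, caching disk /dev/sdb; add -r to make it write-around instead.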
flashcache-3.1.3+git20150701/flashcache-wt/src/000077500000000000000000000000001254507146700205115ustar00rootroot00000000000000flashcache-3.1.3+git20150701/flashcache-wt/src/Makefile000066400000000000000000000013511254507146700221510ustar00rootroot00000000000000EXTRA_CFLAGS=-I$(KERNEL_TREE)/drivers/md -I./ EXTRA_CFLAGS += -I$(KERNEL_TREE)/include/ -I$(KERNEL_TREE)/include/linux UTILS_CFLAGS=-I./ obj-m += flashcache-wt.o flashcache-wt-objs := flashcache_wt.o KERNEL_SOURCE_VERSION ?= $(shell uname -r) all: make -C $(KERNEL_TREE) M=$(PWD) modules $(CC) $(UTILS_CFLAGS) -o utils/flashcache_wt_create utils/flashcache_wt_create.c install: all install -o root -g root -m 0755 -d /lib/modules/$(KERNEL_SOURCE_VERSION)/extra/flashcache/ install -o root -g root -m 0755 flashcache-wt.ko /lib/modules/$(KERNEL_SOURCE_VERSION)/extra/flashcache/ depmod -a install -o root -g root -m 0755 utils/flashcache_wt_create /sbin/ clean: make -C $(KERNEL_TREE) M=$(PWD) clean rm -f utils/flashcache_wt_create flashcache-3.1.3+git20150701/flashcache-wt/src/dkms.conf000066400000000000000000000003121254507146700223120ustar00rootroot00000000000000BUILT_MODULE_NAME=flashcache-wt DEST_MODULE_LOCATION=/kernel/drivers/block PACKAGE_NAME=flashcache-wt PACKAGE_VERSION=0.1 AUTOINSTALL=yes REMAKE_INITRD=yes MAKE[0]="KERNEL_TREE=$kernel_source_dir make" flashcache-3.1.3+git20150701/flashcache-wt/src/flashcache_wt.c000066400000000000000000001213451254507146700234560ustar00rootroot00000000000000/**************************************************************************** * flashcache_wt.c * FlashCache_wt: Device mapper target for block-level disk caching * * Copyright 2010 Facebook, Inc. * Author: Mohan Srinivasan (mohan@facebook.com) * * Based on DM-Cache: * Copyright (C) International Business Machines Corp., 2006 * Author: Ming Zhao (mingzhao@ufl.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
****************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,21) #include #include #endif #include "dm.h" #include "dm-io.h" #include "dm-bio-list.h" #else #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) #include "dm.h" #endif #include #include #include #endif #include "flashcache_wt.h" #define FLASHCACHE_WT_SW_VERSION "flashcache_wt-1.0" char *flashcache_wt_sw_version = FLASHCACHE_WT_SW_VERSION; static struct workqueue_struct *_kcached_wq; static struct work_struct _kcached_work; static void cache_read_miss(struct cache_c *dmc, struct bio* bio, int index); static void cache_write(struct cache_c *dmc, struct bio* bio); static int cache_invalidate_blocks(struct cache_c *dmc, struct bio *bio); static void flashcache_wt_uncached_io_callback(unsigned long error, void *context); static void flashcache_wt_start_uncached_io(struct cache_c *dmc, struct bio *bio); u_int64_t size_hist[33]; static struct kmem_cache *_job_cache; static mempool_t *_job_pool; static DEFINE_SPINLOCK(_job_lock); static LIST_HEAD(_complete_jobs); static LIST_HEAD(_io_jobs); #ifndef DM_MAPIO_SUBMITTED #define DM_MAPIO_SUBMITTED 0 #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) int dm_io_async_bvec(unsigned int num_regions, #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) struct dm_io_region *where, #else struct io_region *where, #endif int rw, struct bio_vec *bvec, io_notify_fn fn, void *context) { struct kcached_job *job = (struct kcached_job *)context; struct cache_c *dmc = job->dmc; struct dm_io_request iorq; iorq.bi_rw = rw; iorq.mem.type = DM_IO_BVEC; iorq.mem.ptr.bvec = bvec; iorq.notify.fn = fn; iorq.notify.context = context; iorq.client = dmc->io_client; return dm_io(&iorq, num_regions, where, NULL); } #endif #ifdef FLASHCACHE_WT_CHECKSUMS static u_int64_t flashcache_wt_compute_checksum(struct bio *bio) { int i; u_int64_t sum = 0, *idx; int cnt; int kmap_type; void *kvaddr; if (in_interrupt()) kmap_type = KM_SOFTIRQ0; else kmap_type = KM_USER0; for (i = bio->bi_idx ; i < bio->bi_vcnt ; i++) { kvaddr = kmap_atomic(bio->bi_io_vec[i].bv_page, kmap_type); idx = (u_int64_t *) ((char *)kvaddr + bio->bi_io_vec[i].bv_offset); cnt = bio->bi_io_vec[i].bv_len; while (cnt > 0) { sum += *idx++; cnt -= sizeof(u_int64_t); } kunmap_atomic(kvaddr, kmap_type); } return sum; } static void flashcache_wt_store_checksum(struct kcached_job *job) { u_int64_t sum; unsigned long flags; sum = flashcache_wt_compute_checksum(job->bio); spin_lock_irqsave(&job->dmc->cache_spin_lock, flags); job->dmc->cache[job->index].checksum = sum; spin_unlock_irqrestore(&job->dmc->cache_spin_lock, flags); } static int flashcache_wt_validate_checksum(struct kcached_job *job) { u_int64_t sum; int retval; unsigned long flags; sum = flashcache_wt_compute_checksum(job->bio); spin_lock_irqsave(&job->dmc->cache_spin_lock, flags); if (job->dmc->cache[job->index].checksum == sum) { job->dmc->checksum_valid++; retval = 0; } else { job->dmc->checksum_invalid++; retval = 1; } spin_unlock_irqrestore(&job->dmc->cache_spin_lock, flags); return retval; } #else /* FLASHCACHE_WT_CHECKSUMS */ static void flashcache_wt_store_checksum(struct kcached_job *job) { } static int flashcache_wt_validate_checksum(struct kcached_job *job) { return 0; } #endif /* FLASHCACHE_WT_CHECKSUMS */ static int jobs_init(void) 
{ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) _job_cache = kmem_cache_create("kcached-jobs-wt", sizeof(struct kcached_job), __alignof__(struct kcached_job), 0, NULL, NULL); #else _job_cache = kmem_cache_create("kcached-jobs-wt", sizeof(struct kcached_job), __alignof__(struct kcached_job), 0, NULL); #endif if (!_job_cache) return -ENOMEM; _job_pool = mempool_create(FLASHCACHE_WT_MIN_JOBS, mempool_alloc_slab, mempool_free_slab, _job_cache); if (!_job_pool) { kmem_cache_destroy(_job_cache); return -ENOMEM; } return 0; } static void jobs_exit(void) { BUG_ON(!list_empty(&_complete_jobs)); BUG_ON(!list_empty(&_io_jobs)); mempool_destroy(_job_pool); kmem_cache_destroy(_job_cache); _job_pool = NULL; _job_cache = NULL; } /* * Functions to push and pop a job onto the head of a given job list. */ static inline struct kcached_job * pop(struct list_head *jobs) { struct kcached_job *job = NULL; unsigned long flags; spin_lock_irqsave(&_job_lock, flags); if (!list_empty(jobs)) { job = list_entry(jobs->next, struct kcached_job, list); list_del(&job->list); } spin_unlock_irqrestore(&_job_lock, flags); return job; } static inline void push(struct list_head *jobs, struct kcached_job *job) { unsigned long flags; spin_lock_irqsave(&_job_lock, flags); list_add_tail(&job->list, jobs); spin_unlock_irqrestore(&_job_lock, flags); } /* * Note : io_callback happens from softirq() and you cannot kick off * new IOs from here. Unfortunately, we have to loop back the calls * to kick off new IOs to the workqueue. */ void flashcache_wt_io_callback(unsigned long error, void *context) { struct kcached_job *job = (struct kcached_job *) context; struct cache_c *dmc = job->dmc; struct bio *bio; unsigned long flags; int invalid = 0; VERIFY(job != NULL); bio = job->bio; VERIFY(bio != NULL); DPRINTK("flashcache_wt_io_callback: %s %llu(%llu->%llu,%llu)", (job->rw == READ ? 
"READ" : "WRITE"), bio->bi_sector, job->disk.sector, job->cache.sector, job->disk.count); if (error) DMERR("flashcache_wt_io_callback: io error %ld", error); if (job->rw == READSOURCE || job->rw == WRITESOURCE) { spin_lock_irqsave(&dmc->cache_spin_lock, flags); if (dmc->cache_state[job->index] != INPROG) { VERIFY(dmc->cache_state[job->index] == INPROG_INVALID); invalid++; } spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); if (error || invalid) { if (invalid) DMERR("flashcache_wt_io_callback: cache fill invalidation, sector %lu, size %u", bio->bi_sector, bio->bi_size); flashcache_bio_endio(bio, error); spin_lock_irqsave(&dmc->cache_spin_lock, flags); dmc->cache_state[job->index] = INVALID; spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); goto out; } else { /* Kick off the write to the cache */ job->rw = WRITECACHE; push(&_io_jobs, job); queue_work(_kcached_wq, &_kcached_work); return; } } else if (job->rw == READCACHE) { spin_lock_irqsave(&dmc->cache_spin_lock, flags); VERIFY(dmc->cache_state[job->index] == INPROG_INVALID || dmc->cache_state[job->index] == CACHEREADINPROG); if (dmc->cache_state[job->index] == INPROG_INVALID) invalid++; spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); if (!invalid && !error && (flashcache_wt_validate_checksum(job) == 0)) { /* Complete the current IO successfully */ flashcache_bio_endio(bio, 0); spin_lock_irqsave(&dmc->cache_spin_lock, flags); dmc->cache_state[job->index] = VALID; spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); goto out; } /* error || invalid || bad checksum, bounce back to source device */ job->rw = READCACHE_DONE; push(&_complete_jobs, job); queue_work(_kcached_wq, &_kcached_work); return; } else { VERIFY(job->rw == WRITECACHE); flashcache_bio_endio(bio, 0); spin_lock_irqsave(&dmc->cache_spin_lock, flags); VERIFY((dmc->cache_state[job->index] == INPROG) || (dmc->cache_state[job->index] == INPROG_INVALID)); if (error || dmc->cache_state[job->index] == INPROG_INVALID) { dmc->cache_state[job->index] = INVALID; } else { dmc->cache_state[job->index] = VALID; dmc->cached_blocks++; } spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); DPRINTK_LITE("Cache Fill: Block %llu, index = %d: Cache state = %d", dmc->cache[job->index].dbn, job->index, dmc->cache_state[job->index]); } out: mempool_free(job, _job_pool); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); } static int do_io(struct kcached_job *job) { int r = 0; struct cache_c *dmc = job->dmc; struct bio *bio = job->bio; VERIFY(job->rw == WRITECACHE); /* Write to cache device */ flashcache_wt_store_checksum(job); #ifdef FLASHCACHE_WT_CHECKSUMS dmc->checksum_store++; #endif /* FLASHCACHE_WT_CHECKSUMS */ dmc->cache_writes++; r = dm_io_async_bvec(1, &job->cache, WRITE, bio->bi_io_vec + bio->bi_idx, flashcache_wt_io_callback, job); VERIFY(r == 0); /* In our case, dm_io_async_bvec() must always return 0 */ return r; } int flashcache_wt_do_complete(struct kcached_job *job) { struct bio *bio = job->bio; struct cache_c *dmc = job->dmc; unsigned long flags; VERIFY(job->rw == READCACHE_DONE); DPRINTK("flashcache_wt_do_complete: %llu", bio->bi_sector); /* error || block invalidated while reading from cache || bad checksum */ spin_lock_irqsave(&dmc->cache_spin_lock, flags); dmc->cache_state[job->index] = INVALID; spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); mempool_free(job, _job_pool); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); /* Kick this IO back to the source bdev */ flashcache_wt_start_uncached_io(dmc, bio); return 0; } static void 
process_jobs(struct list_head *jobs, int (*fn) (struct kcached_job *)) { struct kcached_job *job; while ((job = pop(jobs))) (void)fn(job); } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) static void do_work(void *unused) #else static void do_work(struct work_struct *work) #endif { process_jobs(&_complete_jobs, flashcache_wt_do_complete); process_jobs(&_io_jobs, do_io); } /* DM async IO mempool sizing */ #define FLASHCACHE_WT_ASYNC_SIZE 1024 static int kcached_init(struct cache_c *dmc) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) int r; r = dm_io_get(FLASHCACHE_ASYNC_SIZE); if (r) { DMERR("flashcache_kcached_init: Could not resize dm io pool"); return r; } #endif init_waitqueue_head(&dmc->destroyq); atomic_set(&dmc->nr_jobs, 0); return 0; } void kcached_client_destroy(struct cache_c *dmc) { /* Wait for completion of all jobs submitted by this client. */ wait_event(dmc->destroyq, !atomic_read(&dmc->nr_jobs)); #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) dm_io_put(FLASHCACHE_ASYNC_SIZE); #endif } /* * Map a block from the source device to a block in the cache device. */ static unsigned long hash_block(struct cache_c *dmc, sector_t dbn) { unsigned long set_number, value; value = (unsigned long) (dbn >> (dmc->block_shift + dmc->consecutive_shift)); set_number = value % (dmc->size >> dmc->consecutive_shift); DPRINTK("Hash: %llu(%lu)->%lu", dbn, value, set_number); return set_number; } static int find_valid_dbn(struct cache_c *dmc, sector_t dbn, int start_index, int *index) { int i; int end_index = start_index + dmc->assoc; for (i = start_index ; i < end_index ; i++) { if (dbn == dmc->cache[i].dbn && (dmc->cache_state[i] == VALID || dmc->cache_state[i] == CACHEREADINPROG || dmc->cache_state[i] == INPROG)) { *index = i; return dmc->cache_state[i]; } } return -1; } static void find_invalid_dbn(struct cache_c *dmc, int start_index, int *index) { int i; int end_index = start_index + dmc->assoc; /* Find INVALID slot that we can reuse */ for (i = start_index ; i < end_index ; i++) { if (dmc->cache_state[i] == INVALID) { *index = i; return; } } } static void find_reclaim_dbn(struct cache_c *dmc, int start_index, int *index) { int i; int end_index = start_index + dmc->assoc; int set = start_index / dmc->assoc; int slots_searched = 0; /* * Find the "oldest" VALID slot to recycle. * For each set, we keep track of the next "lru" * slot to pick off. Each time we pick off a VALID * entry to recycle we advance this pointer. So * we sweep through the set looking for next blocks * to recycle. This approximates to FIFO (modulo * for blocks written through). * XXX - Add LRU ala (wb) flashcache. */ i = dmc->set_lru_next[set]; while (slots_searched < dmc->assoc) { VERIFY(i >= start_index); VERIFY(i < end_index); if (dmc->cache_state[i] == VALID) { *index = i; break; } slots_searched++; i++; if (i == end_index) i = start_index; } i++; if (i == end_index) i = start_index; dmc->set_lru_next[set] = i; } /* * dbn is the starting sector, io_size is the number of sectors. 
*/ static int cache_lookup(struct cache_c *dmc, struct bio *bio, int *index) { sector_t dbn = bio->bi_sector; #if DMC_DEBUG int io_size = to_sector(bio->bi_size); #endif unsigned long set_number = hash_block(dmc, dbn); int invalid = -1, oldest_clean = -1; int start_index; int ret; start_index = dmc->assoc * set_number; DPRINTK("Cache read lookup : dbn %llu(%lu), set = %d", dbn, io_size, set_number); ret = find_valid_dbn(dmc, dbn, start_index, index); if (ret == VALID || ret == INPROG || ret == CACHEREADINPROG) { DPRINTK_LITE("Cache read lookup: Block %llu(%lu): ret %d VALID/INPROG index %d", dbn, io_size, ret, *index); /* We found the exact range of blocks we are looking for */ return ret; } DPRINTK_LITE("Cache read lookup: Block %llu(%lu):%d INVALID", dbn, io_size, ret); VERIFY(ret == -1); find_invalid_dbn(dmc, start_index, &invalid); if (invalid == -1) { /* We didn't find an invalid entry, search for oldest valid entry */ find_reclaim_dbn(dmc, start_index, &oldest_clean); } /* * Cache miss : * We can't choose an entry marked INPROG, but choose the oldest * INVALID or the oldest VALID entry. */ *index = start_index + dmc->assoc; if (invalid != -1) { DPRINTK_LITE("Cache read lookup MISS (INVALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d", dbn, io_size, set_number, invalid, start_index); *index = invalid; } else if (oldest_clean != -1) { DPRINTK_LITE("Cache read lookup MISS (VALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d", dbn, io_size, set_number, oldest_clean, start_index); *index = oldest_clean; } else { DPRINTK_LITE("Cache read lookup MISS (NOROOM): dbn %llu(%lu), set = %d", dbn, io_size, set_number); } if (*index < (start_index + dmc->assoc)) return INVALID; else return -1; } static struct kcached_job * new_kcached_job(struct cache_c *dmc, struct bio* bio, int index) { struct kcached_job *job; job = mempool_alloc(_job_pool, GFP_NOIO); if (job == NULL) return NULL; job->disk.bdev = dmc->disk_dev->bdev; job->disk.sector = bio->bi_sector; if (index != -1) job->disk.count = dmc->block_size; else job->disk.count = to_sector(bio->bi_size); job->cache.bdev = dmc->cache_dev->bdev; if (index != -1) { job->cache.sector = index << dmc->block_shift; job->cache.count = dmc->block_size; } job->dmc = dmc; job->bio = bio; job->index = index; job->error = 0; return job; } static void cache_read_miss(struct cache_c *dmc, struct bio* bio, int index) { struct kcached_job *job; unsigned long flags; DPRINTK("Cache Read Miss sector %llu %u bytes, index %d)", bio->bi_sector, bio->bi_size, index); job = new_kcached_job(dmc, bio, index); if (unlikely(job == NULL)) { /* XXX - need to bump up a stat here */ DMERR("cache_read_miss: Cannot allocate job\n"); spin_lock_irqsave(&dmc->cache_spin_lock, flags); dmc->cache_state[index] = INVALID; spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); flashcache_bio_endio(bio, -EIO); } else { job->rw = READSOURCE; /* Fetch data from the source device */ DPRINTK("Queue job for %llu", bio->bi_sector); atomic_inc(&dmc->nr_jobs); dmc->disk_reads++; dm_io_async_bvec(1, &job->disk, READ, bio->bi_io_vec + bio->bi_idx, flashcache_wt_io_callback, job); } } static void cache_read(struct cache_c *dmc, struct bio *bio) { int index; int res; unsigned long flags; DPRINTK_LITE("Got a %s for %llu %u bytes)", (bio_rw(bio) == READ ? 
"READ":"READA"), bio->bi_sector, bio->bi_size); spin_lock_irqsave(&dmc->cache_spin_lock, flags); res = cache_lookup(dmc, bio, &index); /* Cache Hit */ if ((res == VALID) && (dmc->cache[index].dbn == bio->bi_sector)) { struct kcached_job *job; dmc->cache_state[index] = CACHEREADINPROG; dmc->cache_hits++; spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); DPRINTK_LITE("Cache read: Block %llu(%lu), index = %d:%s", bio->bi_sector, bio->bi_size, index, "CACHE HIT"); job = new_kcached_job(dmc, bio, index); if (unlikely(job == NULL)) { /* * Can't allocate job, bounce back error * XXX - need up bump a stat here */ DMERR("cache_read(_hit): Cannot allocate job\n"); spin_lock_irqsave(&dmc->cache_spin_lock, flags); dmc->cache_state[index] = VALID; spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); flashcache_bio_endio(bio, -EIO); } else { job->rw = READCACHE; /* Fetch data from the source device */ DPRINTK("Queue job for %llu", bio->bi_sector); atomic_inc(&dmc->nr_jobs); dmc->cache_reads++; dm_io_async_bvec(1, &job->cache, READ, bio->bi_io_vec + bio->bi_idx, flashcache_wt_io_callback, job); } return; } /* * In all cases except for a cache hit (and VALID), test for potential * invalidations that we need to do. */ if (cache_invalidate_blocks(dmc, bio) > 0) { /* A non zero return indicates an inprog invalidation */ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); /* Start uncached IO */ flashcache_wt_start_uncached_io(dmc, bio); return; } if (res == -1 || res >= INPROG) { /* * We either didn't find a cache slot in the set we were looking * at or the block we are trying to read is being refilled into * cache. */ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); DPRINTK_LITE("Cache read: Block %llu(%lu):%s", bio->bi_sector, bio->bi_size, "CACHE MISS & NO ROOM"); /* Start uncached IO */ flashcache_wt_start_uncached_io(dmc, bio); return; } /* * (res == INVALID) Cache Miss * And we found cache blocks to replace * Claim the cache blocks before giving up the spinlock */ if (dmc->cache_state[index] == VALID) { dmc->cached_blocks--; dmc->replace++; } dmc->cache_state[index] = INPROG; dmc->cache[index].dbn = bio->bi_sector; spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); DPRINTK_LITE("Cache read: Block %llu(%lu), index = %d:%s", bio->bi_sector, bio->bi_size, index, "CACHE MISS & REPLACE"); cache_read_miss(dmc, bio, index); } static int cache_invalidate_block_set(struct cache_c *dmc, int set, sector_t io_start, sector_t io_end, int rw, int *inprog_inval) { int start_index, end_index, i; int invalidations = 0; start_index = dmc->assoc * set; end_index = start_index + dmc->assoc; for (i = start_index ; i < end_index ; i++) { sector_t start_dbn = dmc->cache[i].dbn; sector_t end_dbn = start_dbn + dmc->block_size; if (dmc->cache_state[i] == INVALID || dmc->cache_state[i] == INPROG_INVALID) continue; if ((io_start >= start_dbn && io_start < end_dbn) || (io_end >= start_dbn && io_end < end_dbn)) { /* We have a match */ if (rw == WRITE) dmc->wr_invalidates++; else dmc->rd_invalidates++; invalidations++; if (dmc->cache_state[i] == VALID) { dmc->cached_blocks--; dmc->cache_state[i] = INVALID; DPRINTK_LITE("Cache invalidate: Block %llu VALID", start_dbn); } else if (dmc->cache_state[i] >= INPROG) { (*inprog_inval)++; dmc->cache_state[i] = INPROG_INVALID; DMERR("cache_invalidate_block_set: sector %lu, size %lu, rw %d", io_start, io_end - io_start, rw); DPRINTK_LITE("Cache invalidate: Block %llu INPROG", start_dbn); } } } return invalidations; } /* * Since md will break up IO into blocksize pieces, we only 
really need to check * the start set and the end set for overlaps. */ static int cache_invalidate_blocks(struct cache_c *dmc, struct bio *bio) { sector_t io_start = bio->bi_sector; sector_t io_end = bio->bi_sector + (to_sector(bio->bi_size) - 1); int start_set, end_set; int inprog_inval_start = 0, inprog_inval_end = 0; start_set = hash_block(dmc, io_start); end_set = hash_block(dmc, io_end); (void)cache_invalidate_block_set(dmc, start_set, io_start, io_end, bio_data_dir(bio), &inprog_inval_start); if (start_set != end_set) cache_invalidate_block_set(dmc, end_set, io_start, io_end, bio_data_dir(bio), &inprog_inval_end); return (inprog_inval_start + inprog_inval_end); } static void cache_write(struct cache_c *dmc, struct bio* bio) { int index; int res; unsigned long flags; struct kcached_job *job; spin_lock_irqsave(&dmc->cache_spin_lock, flags); if (cache_invalidate_blocks(dmc, bio) > 0) { /* A non zero return indicates an inprog invalidation */ spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); /* Start uncached IO */ flashcache_wt_start_uncached_io(dmc, bio); return; } res = cache_lookup(dmc, bio, &index); VERIFY(res == -1 || res == INVALID); if (res == -1) { spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); /* Start uncached IO */ flashcache_wt_start_uncached_io(dmc, bio); return; } if (dmc->cache_state[index] == VALID) { dmc->cached_blocks--; dmc->cache_wr_replace++; } dmc->cache_state[index] = INPROG; dmc->cache[index].dbn = bio->bi_sector; spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); job = new_kcached_job(dmc, bio, index); if (unlikely(job == NULL)) { /* XXX - need to bump up a stat here */ DMERR("cache_write: Cannot allocate job\n"); spin_lock_irqsave(&dmc->cache_spin_lock, flags); dmc->cache_state[index] = INVALID; spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); flashcache_bio_endio(bio, -EIO); return; } job->rw = WRITESOURCE; /* Write data to the source device */ DPRINTK("Queue job for %llu", bio->bi_sector); atomic_inc(&job->dmc->nr_jobs); dmc->disk_writes++; dm_io_async_bvec(1, &job->disk, WRITE, bio->bi_io_vec + bio->bi_idx, flashcache_wt_io_callback, job); return; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) #else #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) #define bio_barrier(bio) ((bio)->bi_rw & REQ_HARDBARRIER) #else #define bio_barrier(bio) ((bio)->bi_rw & REQ_FLUSH) #endif #endif #endif /* * Decide the mapping and perform necessary cache operations for a bio request. */ int flashcache_wt_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { struct cache_c *dmc = (struct cache_c *) ti->private; unsigned long flags; int sectors = to_sector(bio->bi_size); if (sectors <= 32) size_hist[sectors]++; DPRINTK("Got a %s for %llu %u bytes)", bio_rw(bio) == WRITE ? "WRITE" : (bio_rw(bio) == READ ? 
"READ":"READA"), bio->bi_sector, bio->bi_size); if (bio_barrier(bio)) return -EOPNOTSUPP; VERIFY(to_sector(bio->bi_size) <= dmc->block_size); if (bio_data_dir(bio) == READ) dmc->reads++; else dmc->writes++; if (to_sector(bio->bi_size) != dmc->block_size || (dmc->write_around_mode && (bio_data_dir(bio) == WRITE))) { spin_lock_irqsave(&dmc->cache_spin_lock, flags); (void)cache_invalidate_blocks(dmc, bio); spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); /* Start uncached IO */ flashcache_wt_start_uncached_io(dmc, bio); } else { if (bio_data_dir(bio) == READ) cache_read(dmc, bio); else cache_write(dmc, bio); } return DM_MAPIO_SUBMITTED; } static void flashcache_wt_uncached_io_callback(unsigned long error, void *context) { struct kcached_job *job = (struct kcached_job *) context; struct cache_c *dmc = job->dmc; unsigned long flags; spin_lock_irqsave(&dmc->cache_spin_lock, flags); if (bio_data_dir(job->bio) == READ) dmc->uncached_reads++; else dmc->uncached_writes++; (void)cache_invalidate_blocks(dmc, job->bio); spin_unlock_irqrestore(&dmc->cache_spin_lock, flags); flashcache_bio_endio(job->bio, error); mempool_free(job, _job_pool); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); } static void flashcache_wt_start_uncached_io(struct cache_c *dmc, struct bio *bio) { int is_write = (bio_data_dir(bio) == WRITE); struct kcached_job *job; job = new_kcached_job(dmc, bio, -1); if (unlikely(job == NULL)) { flashcache_bio_endio(bio, -EIO); return; } atomic_inc(&dmc->nr_jobs); if (bio_data_dir(job->bio) == READ) dmc->disk_reads++; else dmc->disk_writes++; dm_io_async_bvec(1, &job->disk, ((is_write) ? WRITE : READ), bio->bi_io_vec + bio->bi_idx, flashcache_wt_uncached_io_callback, job); } static int inline flashcache_wt_get_dev(struct dm_target *ti, char *pth, struct dm_dev **dmd, char *dmc_dname, sector_t tilen) { int rc; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34) rc = dm_get_device(ti, pth, dm_table_get_mode(ti->table), dmd); #else #if defined(RHEL_MAJOR) && RHEL_MAJOR == 6 rc = dm_get_device(ti, pth, dm_table_get_mode(ti->table), dmd); #else rc = dm_get_device(ti, pth, 0, tilen, dm_table_get_mode(ti->table), dmd); #endif #endif if (!rc) strncpy(dmc_dname, pth, DEV_PATHLEN); return rc; } /* * Construct a cache mapping. * arg[0]: path to source device * arg[1]: path to cache device * arg[2]: cache persistence (if set, cache conf is loaded from disk) * Cache configuration parameters (if not set, default values are used. 
* arg[3]: cache block size (in sectors) * arg[4]: cache size (in blocks) * arg[5]: cache associativity */ static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct cache_c *dmc; unsigned int consecutive_blocks; sector_t i, order; sector_t data_size, dev_size; int r = -EINVAL; if (argc < 2) { ti->error = "flashcache-wt: Need at least 2 arguments"; goto bad; } dmc = kzalloc(sizeof(*dmc), GFP_KERNEL); if (dmc == NULL) { ti->error = "flashcache-wt: Failed to allocate cache context"; r = ENOMEM; goto bad; } dmc->tgt = ti; if (flashcache_wt_get_dev(ti, argv[0], &dmc->disk_dev, dmc->disk_devname, ti->len)) { ti->error = "flashcache-wt: Disk device lookup failed"; goto bad1; } if (flashcache_wt_get_dev(ti, argv[1], &dmc->cache_dev, dmc->cache_devname, 0)) { ti->error = "flashcache-wt: Cache device lookup failed"; goto bad2; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27) dmc->io_client = dm_io_client_create(FLASHCACHE_COPY_PAGES); if (IS_ERR(dmc->io_client)) { r = PTR_ERR(dmc->io_client); ti->error = "Failed to create io client\n"; goto bad2; } #endif r = kcached_init(dmc); if (r) { ti->error = "Failed to initialize kcached"; goto bad3; } if (argc >= 3) { if (sscanf(argv[2], "%d", &dmc->write_around_mode) != 1) { ti->error = "flashcache-wt: Invalid mode"; r = -EINVAL; goto bad4; } } else dmc->assoc = DEFAULT_CACHE_ASSOC; if (argc >= 4) { if (sscanf(argv[3], "%u", &dmc->block_size) != 1) { ti->error = "flashcache-wt: Invalid block size"; r = -EINVAL; goto bad4; } if (!dmc->block_size || (dmc->block_size & (dmc->block_size - 1))) { ti->error = "flashcache-wt: Invalid block size"; r = -EINVAL; goto bad4; } } else dmc->block_size = DEFAULT_BLOCK_SIZE; dmc->block_shift = ffs(dmc->block_size) - 1; dmc->block_mask = dmc->block_size - 1; /* dmc->size is specified in sectors here, and converted to blocks below */ if (argc >= 5) { if (sscanf(argv[4], "%lu", &dmc->size) != 1) { ti->error = "flashcache-wt: Invalid cache size"; r = -EINVAL; goto bad4; } } else { dmc->size = to_sector(dmc->cache_dev->bdev->bd_inode->i_size); } if (argc >= 6) { if (sscanf(argv[5], "%u", &dmc->assoc) != 1) { ti->error = "flashcache-wt: Invalid cache associativity"; r = -EINVAL; goto bad4; } if (!dmc->assoc || (dmc->assoc & (dmc->assoc - 1)) || dmc->size < dmc->assoc) { ti->error = "flashcache-wt: Invalid cache associativity"; r = -EINVAL; goto bad4; } } else dmc->assoc = DEFAULT_CACHE_ASSOC; /* * Convert size (in sectors) to blocks. 
* Then round size (in blocks now) down to a multiple of associativity */ dmc->size /= dmc->block_size; dmc->size = (dmc->size / dmc->assoc) * dmc->assoc; dev_size = to_sector(dmc->cache_dev->bdev->bd_inode->i_size); data_size = dmc->size * dmc->block_size; if (data_size > dev_size) { DMERR("Requested cache size exeeds the cache device's capacity" \ "(%lu>%lu)", data_size, dev_size); ti->error = "flashcache-wt: Invalid cache size"; r = -EINVAL; goto bad4; } consecutive_blocks = dmc->assoc; dmc->consecutive_shift = ffs(consecutive_blocks) - 1; order = dmc->size * sizeof(struct cacheblock); DMINFO("Allocate %luKB (%ldB per) mem for %lu-entry cache" \ "(capacity:%luMB, associativity:%u, block size:%u " \ "sectors(%uKB))", order >> 10, sizeof(struct cacheblock), dmc->size, data_size >> (20-SECTOR_SHIFT), dmc->assoc, dmc->block_size, dmc->block_size >> (10-SECTOR_SHIFT)); dmc->cache = (struct cacheblock *)vmalloc(order); if (!dmc->cache) { ti->error = "Unable to allocate memory"; r = -ENOMEM; goto bad4; } dmc->cache_state = (u_int8_t *)vmalloc(dmc->size); if (!dmc->cache_state) { ti->error = "Unable to allocate memory"; r = -ENOMEM; vfree((void *)dmc->cache); goto bad4; } order = (dmc->size >> dmc->consecutive_shift) * sizeof(u_int32_t); dmc->set_lru_next = (u_int32_t *)vmalloc(order); if (!dmc->set_lru_next) { ti->error = "Unable to allocate memory"; r = -ENOMEM; vfree((void *)dmc->cache); vfree((void *)dmc->cache_state); goto bad4; } /* Initialize the cache structs */ for (i = 0; i < dmc->size ; i++) { dmc->cache[i].dbn = 0; #ifdef FLASHCACHE_WT_CHECKSUMS dmc->cache[i].checksum = 0; #endif /* FLASHCACHE_WT_CHECKSUMS */ dmc->cache_state[i] = INVALID; } /* Initialize the point where LRU sweeps begin for each set */ for (i = 0 ; i < (dmc->size >> dmc->consecutive_shift) ; i++) dmc->set_lru_next[i] = i * dmc->assoc; spin_lock_init(&dmc->cache_spin_lock); dmc->reads = 0; dmc->writes = 0; dmc->cache_hits = 0; dmc->replace = 0; dmc->wr_invalidates = 0; dmc->rd_invalidates = 0; dmc->cached_blocks = 0; dmc->cache_wr_replace = 0; #ifdef FLASHCACHE_WT_CHECKSUMS dmc->checksum_store = 0; dmc->checksum_valid = 0; dmc->checksum_invalid = 0; #endif /* FLASHCACHE_WT_CHECKSUMS */ ti->split_io = dmc->block_size; ti->private = dmc; return 0; bad4: kcached_client_destroy(dmc); bad3: #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27) dm_io_client_destroy(dmc->io_client); #endif dm_put_device(ti, dmc->cache_dev); bad2: dm_put_device(ti, dmc->disk_dev); bad1: kfree(dmc); bad: return r; } /* * Destroy the cache mapping. 
*/ static void cache_dtr(struct dm_target *ti) { struct cache_c *dmc = (struct cache_c *) ti->private; kcached_client_destroy(dmc); if (dmc->reads + dmc->writes > 0) { int read_hit_pct; int cache_pct; if (dmc->reads > 0) read_hit_pct = dmc->cache_hits * 100 / dmc->reads; else read_hit_pct = 0; DMINFO("stats: \n\treads(%lu), writes(%lu)\n", dmc->reads, dmc->writes); #ifdef FLASHCACHE_WT_CHECKSUMS DMINFO("\tcache hits(%lu), cache hit percent (%d)\n" \ "\treplacement(%lu), write replacement(%lu)\n" \ "\tread invalidates(%lu), write invalidates(%lu)\n" \ "\tchecksum store (%lu), checksum valid (%lu), checksum invalid(%lu)\n", dmc->cache_hits, read_hit_pct, dmc->replace, dmc->cache_wr_replace, dmc->rd_invalidates, dmc->wr_invalidates, dmc->checksum_store, dmc->checksum_valid, dmc->checksum_invalid); #else DMINFO("\tcache hits(%lu), cache hit percent (%d)\n" \ "\treplacement(%lu), write replacement(%lu)\n" \ "\tread invalidates(%lu), write invalidates(%lu)\n", dmc->cache_hits, read_hit_pct, dmc->replace, dmc->cache_wr_replace, dmc->rd_invalidates, dmc->wr_invalidates); #endif if (dmc->size > 0) cache_pct = (dmc->cached_blocks * 100) / dmc->size; else cache_pct = 0; DMINFO("conf:\n"\ "\tcapacity(%luM), associativity(%u), block size(%uK)\n" \ "\ttotal blocks(%lu), cached blocks(%lu), cache percent(%d)\n", dmc->size*dmc->block_size>>11, dmc->assoc, dmc->block_size>>(10-SECTOR_SHIFT), dmc->size, dmc->cached_blocks, cache_pct); } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27) dm_io_client_destroy(dmc->io_client); #endif vfree((void *)dmc->cache); vfree((void *)dmc->cache_state); vfree((void *)dmc->set_lru_next); dm_put_device(ti, dmc->disk_dev); dm_put_device(ti, dmc->cache_dev); kfree(dmc); } static void flashcache_wt_status_info(struct cache_c *dmc, status_type_t type, char *result, unsigned int maxlen) { int read_hit_pct; int sz = 0; /* DMEMIT */ if (dmc->reads > 0) read_hit_pct = dmc->cache_hits * 100 / dmc->reads; else read_hit_pct = 0; DMEMIT("stats: \n\treads(%lu), writes(%lu)\n", dmc->reads, dmc->writes); #ifdef FLASHCACHE_WT_CHECKSUMS if (dmc->write_around_mode == 0) { DMEMIT("\tcache hits(%lu), cache hit percent (%d)\n" \ "\treplacement(%lu), write replacement(%lu)\n" \ "\tread invalidates(%lu), write invalidates(%lu)\n" \ "\tuncached reads(%lu), uncached writes(%lu)\n" \ "\tdisk reads(%lu), disk writes(%lu)\n" \ "\tcache reads(%lu), cache writes(%lu)\n" \ "\tchecksum store (%lu), checksum valid (%lu), checksum invalid(%lu)\n", dmc->cache_hits, read_hit_pct, dmc->replace, dmc->cache_wr_replace, dmc->rd_invalidates, dmc->wr_invalidates, dmc->uncached_reads, dmc->uncached_writes, dmc->disk_reads, dmc->disk_writes, dmc->cache_reads, dmc->cache_writes, dmc->checksum_store, dmc->checksum_valid, dmc->checksum_invalid); } else { DMEMIT("\tcache hits(%lu), cache hit percent (%d)\n" \ "\treplacement(%lu), read invalidates(%lu) write invalidates(%lu)\n" \ "\tuncached reads(%lu), uncached writes(%lu)\n" \ "\tdisk reads(%lu), disk writes(%lu)\n" \ "\tcache reads(%lu), cache writes(%lu)\n" \ "\tchecksum store (%lu), checksum valid (%lu), checksum invalid(%lu)\n", dmc->cache_hits, read_hit_pct, dmc->replace, dmc->rd_invalidates, dmc->wr_invalidates, dmc->uncached_reads, dmc->uncached_writes, dmc->disk_reads, dmc->disk_writes, dmc->cache_reads, dmc->cache_writes, dmc->checksum_store, dmc->checksum_valid, dmc->checksum_invalid); } #else /* FLASHCACHE_WT_CHECKSUMS */ if (dmc->write_around_mode == 0) { DMEMIT("\tcache hits(%lu), cache hit percent (%d)\n" \ "\treplacement(%lu), write replacement(%lu)\n" \ 
"\tread invalidates(%lu), write invalidates(%lu)\n" \ "\tuncached reads(%lu), uncached writes(%lu)\n" \ "\tdisk reads(%lu), disk writes(%lu)\n" \ "\tcache reads(%lu), cache writes(%lu)\n", dmc->cache_hits, read_hit_pct, dmc->replace, dmc->cache_wr_replace, dmc->rd_invalidates, dmc->wr_invalidates, dmc->uncached_reads, dmc->uncached_writes, dmc->disk_reads, dmc->disk_writes, dmc->cache_reads, dmc->cache_writes); } else { DMEMIT("\tcache hits(%lu), cache hit percent (%d)\n" \ "\treplacement(%lu), read invalidates(%lu) write invalidates(%lu)\n" \ "\tuncached reads(%lu), uncached writes(%lu)\n" \ "\tdisk reads(%lu), disk writes(%lu)\n" \ "\tcache reads(%lu), cache writes(%lu)\n", dmc->cache_hits, read_hit_pct, dmc->replace, dmc->rd_invalidates, dmc->wr_invalidates, dmc->uncached_reads, dmc->uncached_writes, dmc->disk_reads, dmc->disk_writes, dmc->cache_reads, dmc->cache_writes); } #endif /* FLASHCACHE_WT_CHECKSUMS */ } static void flashcache_wt_status_table(struct cache_c *dmc, status_type_t type, char *result, unsigned int maxlen) { int cache_pct; int i; int sz = 0; /* DMEMIT */ if (dmc->size > 0) cache_pct = (dmc->cached_blocks * 100) / dmc->size; else cache_pct = 0; DMEMIT("conf:\n"\ "\tssd dev (%s), disk dev (%s) mode (%s)\n" \ "\tcapacity(%luM), associativity(%u), block size(%uK)\n" \ "\ttotal blocks(%lu), cached blocks(%lu), cache percent(%d)\n", dmc->cache_devname, dmc->disk_devname, ((dmc->write_around_mode) ? "WRITE_AROUND" : "WRITETHROUGH"), dmc->size*dmc->block_size>>11, dmc->assoc, dmc->block_size>>(10-SECTOR_SHIFT), dmc->size, dmc->cached_blocks, cache_pct); DMEMIT(" Size Hist: "); for (i = 1 ; i <= 32 ; i++) { if (size_hist[i] > 0) DMEMIT("%d:%llu ", i*512, size_hist[i]); } } /* * Report cache status: * Output cache stats upon request of device status; * Output cache configuration upon request of table status. */ static int cache_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { struct cache_c *dmc = (struct cache_c *) ti->private; switch (type) { case STATUSTYPE_INFO: flashcache_wt_status_info(dmc, type, result, maxlen); break; case STATUSTYPE_TABLE: flashcache_wt_status_table(dmc, type, result, maxlen); break; } return 0; } /**************************************************************************** * Functions for manipulating a cache target. ****************************************************************************/ static struct target_type cache_target = { .name = "flashcache-wt", .version= {1, 0, 1}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, .map = flashcache_wt_map, .status = cache_status, }; static int flashcache_wt_version_show(struct seq_file *seq, void *v) { seq_printf(seq, "Flashcache_wt Version : %s\n", flashcache_wt_sw_version); return 0; } static int flashcache_wt_version_open(struct inode *inode, struct file *file) { return single_open(file, &flashcache_wt_version_show, NULL); } static struct file_operations flashcache_wt_version_operations = { .open = flashcache_wt_version_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /* * Initiate a cache target. 
*/ int __init flashcache_wt_init(void) { int r; r = jobs_init(); if (r) return r; _kcached_wq = create_singlethread_workqueue("kcached"); if (!_kcached_wq) { DMERR("failed to start kcached"); return -ENOMEM; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) INIT_WORK(&_kcached_work, do_work, NULL); #else INIT_WORK(&_kcached_work, do_work); #endif for (r = 0 ; r < 33 ; r++) size_hist[r] = 0; r = dm_register_target(&cache_target); if (r < 0) { DMERR("cache: register failed %d", r); } printk("flashcache-wt: %s initialized\n", flashcache_wt_sw_version); #ifdef CONFIG_PROC_FS { struct proc_dir_entry *entry; entry = create_proc_entry("flashcache_wt_version", 0, NULL); if (entry) entry->proc_fops = &flashcache_wt_version_operations; } #endif return r; } /* * Destroy a cache target. */ void flashcache_wt_exit(void) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) int r = dm_unregister_target(&cache_target); if (r < 0) DMERR("cache: unregister failed %d", r); #else dm_unregister_target(&cache_target); #endif jobs_exit(); destroy_workqueue(_kcached_wq); #ifdef CONFIG_PROC_FS remove_proc_entry("flashcache_wt_version", NULL); #endif } module_init(flashcache_wt_init); module_exit(flashcache_wt_exit); EXPORT_SYMBOL(flashcache_wt_io_callback); EXPORT_SYMBOL(flashcache_wt_do_complete); EXPORT_SYMBOL(flashcache_wt_map); MODULE_DESCRIPTION(DM_NAME " Facebook Flashcache DM target"); MODULE_AUTHOR("Mohan - based on code by Ming"); MODULE_LICENSE("GPL"); flashcache-3.1.3+git20150701/flashcache-wt/src/flashcache_wt.h000066400000000000000000000114611254507146700234600ustar00rootroot00000000000000/**************************************************************************** * flashcache_wt.h * FlashCache_wt: Device mapper target for block-level disk caching * * Copyright 2010 Facebook, Inc. * Author: Mohan Srinivasan (mohan@facebook.com) * * Based on DM-Cache: * Copyright (C) International Business Machines Corp., 2006 * Author: Ming Zhao (mingzhao@ufl.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . ****************************************************************************/ /* Like ASSERT() but always compiled in */ #define VERIFY(x) do { \ if (unlikely(!(x))) { \ dump_stack(); \ panic("VERIFY: assertion (%s) failed at %s (%d)\n", \ #x, __FILE__ , __LINE__); \ } \ } while(0) #define DMC_DEBUG 0 #define DMC_DEBUG_LITE 0 #define DM_MSG_PREFIX "flashcache-wt" #define DMC_PREFIX "flashcache-wt: " #if DMC_DEBUG #define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg) #else #define DPRINTK( s, arg... ) #endif #if DMC_DEBUG_LITE #define DPRINTK_LITE( s, arg... ) printk(DMC_PREFIX s "\n", ##arg) #else #define DPRINTK_LITE( s, arg... 
) #endif #define READCACHE 1 #define WRITECACHE 2 #define READSOURCE 3 #define WRITESOURCE 4 #define SOURCEIO_DONE 5 #define READCACHE_DONE 6 /* Default cache parameters */ #define DEFAULT_CACHE_SIZE 65536 #define DEFAULT_CACHE_ASSOC 512 #define DEFAULT_BLOCK_SIZE 8 /* 4 KB */ #define CONSECUTIVE_BLOCKS 512 /* States of a cache block */ #define INVALID 0 #define VALID 1 /* Valid */ #define INPROG 2 /* IO (cache fill) is in progress */ #define CACHEREADINPROG 3 /* cache read in progress, don't recycle */ #define INPROG_INVALID 4 /* Write invalidated during a refill */ #define DEV_PATHLEN 128 /* * Cache context */ struct cache_c { struct dm_target *tgt; struct dm_dev *disk_dev; /* Source device */ struct dm_dev *cache_dev; /* Cache device */ spinlock_t cache_spin_lock; struct cacheblock *cache; /* Hash table for cache blocks */ u_int8_t *cache_state; u_int32_t *set_lru_next; int write_around_mode; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) struct dm_io_client *io_client; /* Client memory pool*/ #endif sector_t size; /* Cache size */ unsigned int assoc; /* Cache associativity */ unsigned int block_size; /* Cache block size */ unsigned int block_shift; /* Cache block size in bits */ unsigned int block_mask; /* Cache block mask */ unsigned int consecutive_shift; /* Consecutive blocks size in bits */ wait_queue_head_t destroyq; /* Wait queue for I/O completion */ atomic_t nr_jobs; /* Number of I/O jobs */ /* Stats */ unsigned long reads; /* Number of reads */ unsigned long writes; /* Number of writes */ unsigned long cache_hits; /* Number of cache hits */ unsigned long replace; /* Number of cache replacements */ unsigned long wr_invalidates; /* Number of write invalidations */ unsigned long rd_invalidates; /* Number of read invalidations */ unsigned long cached_blocks; /* Number of cached blocks */ #ifdef FLASHCACHE_WT_CHECKSUMS unsigned long checksum_store; unsigned long checksum_valid; unsigned long checksum_invalid; #endif /* FLASHCACHE_WT_CHECKSUMS */ unsigned long cache_wr_replace; unsigned long uncached_reads; unsigned long uncached_writes; unsigned long cache_reads, cache_writes; unsigned long disk_reads, disk_writes; char cache_devname[DEV_PATHLEN]; char disk_devname[DEV_PATHLEN]; }; /* Cache block metadata structure */ struct cacheblock { sector_t dbn; /* Sector number of the cached block */ #ifdef FLASHCACHE_WT_CHECKSUMS u_int64_t checksum; #endif /* FLASHCACHE_WT_CHECKSUMS */ }; /* Structure for a kcached job */ struct kcached_job { struct list_head list; struct cache_c *dmc; struct bio *bio; /* Original bio */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) struct io_region disk; struct io_region cache; #else struct dm_io_region disk; struct dm_io_region cache; #endif int index; int rw; int error; }; #define FLASHCACHE_WT_MIN_JOBS 1024 /* DM async IO mempool sizing */ #define FLASHCACHE_ASYNC_SIZE 1024 /* Number of pages for I/O */ #define FLASHCACHE_COPY_PAGES (1024) #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) #define flashcache_bio_endio(BIO, ERROR) bio_endio((BIO), (BIO)->bi_size, (ERROR)) #else #define flashcache_bio_endio(BIO, ERROR) bio_endio((BIO), (ERROR)) #endif flashcache-3.1.3+git20150701/flashcache-wt/src/utils/000077500000000000000000000000001254507146700216515ustar00rootroot00000000000000flashcache-3.1.3+git20150701/flashcache-wt/src/utils/flashcache_wt_create.c000066400000000000000000000132451254507146700261400ustar00rootroot00000000000000/* * Copyright (c) 2010, Facebook, Inc. * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * Neither the name Facebook nor the names of its contributors may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <ctype.h> #include <errno.h> #include <fcntl.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <linux/fs.h> typedef u_int64_t sector_t; void usage(char *pname) { fprintf(stderr, "Usage: %s [-b block size] [ -s cache size] cachedev ssd_devname disk_devname\n", pname); fprintf(stderr, "Usage : %s Default units for -b, -s are sectors, use k/m/g allowed\n", pname); exit(1); } char *pname; char buf[512]; char dmsetup_cmd[8192]; int verbose = 0; int force = 0; static sector_t get_block_size(char *s) { sector_t size; char *c; size = strtoll(s, NULL, 0); for (c = s; isdigit(*c); c++) ; switch (*c) { case '\0': break; case 'k': size = (size * 1024) / 512; break; default: fprintf (stderr, "%s: Unknown block size type %c\n", pname, *c); exit (1); } if (size & (size - 1)) { fprintf(stderr, "%s: Block size must be a power of 2\n", pname); exit(1); } return size; } static sector_t get_cache_size(char *s) { sector_t size; char *c; size = strtoll(s, NULL, 0); for (c = s; isdigit (*c); c++) ; switch (*c) { case '\0': break; case 'k': size = (size * 1024) / 512; break; case 'm': size = (size * 1024 * 1024) / 512; break; case 'g': size = (size * 1024 * 1024 * 1024) / 512; break; default: fprintf (stderr, "%s: Unknown cache size type %c\n", pname, *c); exit (1); } return size; } static int module_loaded(void) { FILE *fp; char line[8192]; int found = 0; fp = fopen("/proc/modules", "r"); if (fp == NULL) return 0; while (fgets(line, 8192, fp)) { char *s; s = strtok(line, " "); if (!strcmp(s, "flashcache_wt")) { found = 1; break; } } fclose(fp); return found; } static void load_module(void) { if (module_loaded()) { if (verbose) fprintf(stderr, "Flashcache Module already loaded\n"); return; } if (verbose) fprintf(stderr, "Loading Flashcache Module\n"); system("modprobe flashcache_wt"); if (!module_loaded()) { fprintf(stderr, "Could not load Flashcache Module\n"); exit(1); } } int main(int argc, char **argv) { int disk_fd, c; char *disk_devname, *ssd_devname, *cachedev; sector_t block_size = 0, cache_size = 0; sector_t disk_devsize; int write_around = 0; pname = argv[0]; while ((c = getopt(argc, argv,
"fs:b:vr")) != -1) { switch (c) { case 's': cache_size = get_cache_size(optarg); break; case 'b': block_size = get_block_size(optarg); /* Block size should be a power of 2 */ break; case 'v': verbose = 1; break; case 'f': force = 1; break; case 'r': write_around = 1; break; case '?': usage(pname); } } if (optind == argc) usage(pname); if (block_size == 0) block_size = 8; /* 4KB default blocksize */ cachedev = argv[optind++]; if (optind == argc) usage(pname); ssd_devname = argv[optind++]; if (optind == argc) usage(pname); disk_devname = argv[optind]; disk_fd = open(disk_devname, O_RDONLY); if (disk_fd < 0) { fprintf(stderr, "%s: Failed to open %s\n", pname, disk_devname); exit(1); } if (ioctl(disk_fd, BLKGETSIZE, &disk_devsize) < 0) { fprintf(stderr, "%s: Cannot get disk size %s\n", pname, disk_devname); exit(1); } printf("cachedev %s, ssd_devname %s, disk_devname %s\n", cachedev, ssd_devname, disk_devname); printf("cache mode %s, block_size %lu, cache_size %lu\n", ((write_around) ? "WRITE_AROUND" : "WRITE_THRU"), block_size, cache_size); sprintf(dmsetup_cmd, "echo 0 %lu flashcache-wt %s %s %d %lu ", disk_devsize, disk_devname, ssd_devname, write_around, block_size); if (cache_size > 0) { char cache_size_str[4096]; sprintf(cache_size_str, "%lu ", cache_size); strcat(dmsetup_cmd, cache_size_str); } /* Go ahead and create the cache. * XXX - Should use the device mapper library for this. */ strcat(dmsetup_cmd, "| dmsetup create "); strcat(dmsetup_cmd, cachedev); strcat(dmsetup_cmd, "\n"); load_module(); if (verbose) fprintf(stderr, "Creating FlashCache_wt Volume : %s", dmsetup_cmd); system(dmsetup_cmd); } flashcache-3.1.3+git20150701/man/000077500000000000000000000000001254507146700157645ustar00rootroot00000000000000flashcache-3.1.3+git20150701/man/Makefile000066400000000000000000000004671254507146700174330ustar00rootroot00000000000000#!/usr/bin/make -f # -*- makefile -*- PREFIX ?= /usr MANPAGES := $(patsubst %.mdwn, %.8, $(wildcard *.mdwn)) all: $(MANPAGES) clean: $(RM) $(MANPAGES) $(MANPAGES): pandoc --standalone --to=man --output=$@ $(@:.8=.mdwn) install: $(MANPAGES) install -m0644 $(MANPAGES) $(DESTDIR)${PREFIX}/share/man/man8 flashcache-3.1.3+git20150701/man/flashcache.mdwn000066400000000000000000000040541254507146700207370ustar00rootroot00000000000000% FLASHCACHE(7) flashcache % % September 2011 # NAME flashcache - overview of operation # SYNOPSIS modprobe flashcache Flashcache consists of a kernel module and userspace utilities. The module is named "flashcache". The kernel module can be controlled with *flashcache_create*(8), *flashcache_load*(8) and *flashcache_destroy*(8) utilities. # DESCRIPTION Flashcache is a block cache for Linux, built as a kernel module, using the device mapper. Flashcache supports **writeback**, **writethrough** and **writearound** caching modes. See *flashcache_create*(8) for a description of caching modes. Flascache allows one to use a fast block device such as Solid State Drives (SSD) as cache to accelerate a slower drive used as backstore. Originally, Flashcache was built as a block device cache for I/O intensive database applications, for example to run an accelerated InnoDB store for *mysqld*(8), but it can be used as general purpose backend for filesystems as well. # USAGE Before using Flashcache, it might be a good idea to check if device mapper works properly. 
Assuming the partition /dev/md0p1 shall be used as flash cache, one may try to create a linear device with the following command: **echo 0 1024 linear /dev/md0p1 0 | dmsetup create tmpdisk** If this works, *flashcache_create* should be able to create its device. Remove tmpdisk with the command: **dmsetup remove tmpdisk** Before creating a flashcache device using the *flashcache_create* command, the device must not be mounted. # SEE ALSO `flashcache_create`(8), `flashcache_load`(8), `flashcache_destroy`(8) *README* and other documents in **/usr/share/doc/flashcache-utils**. In particular, see **flashcache-sa-guide.txt** for configuring the flashcache kernel module through its */proc* interface. The flashcache source code and all documentation may be downloaded from <https://github.com/facebook/flashcache>. # AUTHORS Flashcache is developed by Mohan Srinivasan. This man page was written by Dmitry Smirnov for Debian GNU/Linux (but may be used by others). flashcache-3.1.3+git20150701/man/flashcache_create.mdwn000066400000000000000000000072251254507146700222650ustar00rootroot00000000000000% FLASHCACHE_CREATE(8) flashcache_create % % September 2011 # NAME flashcache_create - create a new flashcache volume # DESCRIPTION **flashcache_create** initializes a new flashcache volume from unformatted block devices. It creates the flashcache metadata and provides new volumes through their volume mappings. # SYNOPSIS flashcache_create -p *back*|*around*|*thru* [-s *cache size*] [-b *block size*] [-v] *cachedevname* *cache_devname* *disk_devname* # OPTIONS -v : verbose -p : *cache mode*. Required argument. Specify any supported option: (write-)**back**, (write-)**thru** or (write-)**around**. Their respective implications are denoted below. -s : *cache size*. Optional argument. By default, flashcache will use the auto-detected full cache device size. When present, the given cache size is used instead. The expected unit is sectors, however any value can be suffixed by "k", "m" or "g" to interpret the argument in kilo-, mega- or gigabytes respectively. -b : *block size*. Optional argument. Defaults to 4KB. Must be a power of 2. The default unit is sectors. However, *k* can be specified as unit type as well. (A 4KB blocksize is the correct choice for the vast majority of applications.) -f : force create. Bypass all sanity checks (for example for the sector size). Use with care. # CACHING MODES The following caching modes are supported: *Writethrough* (**thru**) - safest mode, all writes are cached to the cache device but are also written to disk immediately. If the used cache device has a lower write performance than the backend disk (many early generation SSD drives manufactured between 2008-2010 are known for such poor write performance) enabling the writethrough mode may decrease the system write performance. All disk reads are cached (tunable through flashcache's */proc* interface). *Writearound* (**around**) - again, very safe, writes are not written to the cache device, but directly to the backend disk instead. Disk blocks will only be cached after they are read. All disk reads are cached (tunable through flashcache's */proc* interface). *Writeback* (**back**) - fastest mode but less safe. Writes only go to the cache device initially, and are written to the backend disk later, depending on the configured system policies. All disk reads are cached (tunable through flashcache's */proc* interface). # CACHE PERSISTENCE Writethru and Writearound caches are not persistent across a device removal or a reboot.
Only Writeback caches are persistent across device removals and reboots. This reinforces 'writeback is fastest', 'writethrough is safest'. # EXAMPLES **flashcache_create** -p back -s 1g -b 4k cachedev /dev/sdc /dev/sdb : Creates a 1GB writeback cache volume with a 4KB block size on the cache device /dev/sdc to cache the disk volume /dev/sdb. The name of the device created is "cachedev". **flashcache_create** -p thru -s 2097152 -b 8 cachedev /dev/sdc /dev/sdb : Same as above but creates a writethrough cache with units specified in sectors instead. The name of the device created is "cachedev". # SEE ALSO `flashcache_load`(8), `flashcache_destroy`(8) *README* and other documents in **/usr/share/doc/flashcache-utils**. In particular, see **flashcache-sa-guide.txt** for configuring the flashcache kernel module through its */proc* interface. The flashcache source code and all documentation may be downloaded from <https://github.com/facebook/flashcache>. # AUTHORS Flashcache is developed by Mohan Srinivasan. This man page was written by Dmitry Smirnov for Debian GNU/Linux (but may be used by others). flashcache-3.1.3+git20150701/man/flashcache_destroy.mdwn000066400000000000000000000015751254507146700225140ustar00rootroot00000000000000% FLASHCACHE_DESTROY(8) flashcache_destroy % % September 2011 # NAME flashcache_destroy - destroy an existing flashcache volume # DESCRIPTION The purpose of the **flashcache_destroy** command is to clear all metadata from the cache device. Beware, this will cause data loss on the affected devices. # SYNOPSIS flashcache_destroy *cache_devname* # EXAMPLE flashcache_destroy */dev/sdc* : Destroy the existing cache on /dev/sdc. All data is lost! # SEE ALSO `flashcache_create`(8), `flashcache_load`(8) *README* and other documents in **/usr/share/doc/flashcache-utils** The flashcache source code and all documentation may be downloaded from <https://github.com/facebook/flashcache>. # AUTHORS Flashcache is developed by Mohan Srinivasan. This man page was written by Dmitry Smirnov for Debian GNU/Linux (but may be used by others).
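To make the man pages above more concrete: the userspace create tools ultimately assemble a device mapper table line and hand it to dmsetup. The following is a minimal sketch of that step in plain C, following the flashcache-wt table layout shown in flashcache_wt_create.c earlier; build_dmsetup_cmd() is a hypothetical helper and not part of the flashcache sources (the writeback-capable flashcache_create additionally writes an on-SSD superblock before loading the table).

#include <stdio.h>

/* Build "echo 0 <disk sectors> flashcache-wt <disk> <ssd> <write_around>
 * <block size> | dmsetup create <name>", the same command string that
 * flashcache_wt_create.c builds with sprintf/strcat.  Sizes are in
 * 512-byte sectors.  Returns -1 if the command would not fit. */
static int
build_dmsetup_cmd(char *buf, size_t len, const char *name,
                  const char *disk, const char *ssd,
                  unsigned long long disk_sectors,
                  int write_around, unsigned long long block_size)
{
        int n = snprintf(buf, len,
                         "echo 0 %llu flashcache-wt %s %s %d %llu | dmsetup create %s",
                         disk_sectors, disk, ssd, write_around,
                         block_size, name);
        return (n < 0 || (size_t)n >= len) ? -1 : 0;
}

int main(void)
{
        char cmd[8192];

        /* illustrative device names and a 4KB (8-sector) block size */
        if (build_dmsetup_cmd(cmd, sizeof(cmd), "cachedev",
                              "/dev/sdb", "/dev/sdc",
                              976773168ULL, 0, 8) == 0)
                printf("%s\n", cmd);    /* the real tool passes this to system() */
        return 0;
}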
flashcache-3.1.3+git20150701/src/000077500000000000000000000000001254507146700160005ustar00rootroot00000000000000flashcache-3.1.3+git20150701/src/Makefile000066400000000000000000000044451254507146700174470ustar00rootroot00000000000000COMMIT_REV ?= $(shell git describe --always --abbrev=12) KERNEL_SOURCE_VERSION ?= $(shell uname -r) KERNEL_TREE ?= /lib/modules/$(KERNEL_SOURCE_VERSION)/build EXTRA_CFLAGS += -I$(KERNEL_TREE)/drivers/md -I./ -DCOMMIT_REV="\"$(COMMIT_REV)\"" EXTRA_CFLAGS += -I$(KERNEL_TREE)/include/ -I$(KERNEL_TREE)/include/linux # Check for RHEL/CentOS RHEL5_VER ?= $(shell if [ -e /etc/redhat-release ]; then grep 5.[0-9] /etc/redhat-release; else false; fi) RHEL5_SETUP := ifneq "$(RHEL5_VER)" "" RHEL5_SETUP := rhel5-setup RHEL5_SPEC := /usr/src/redhat/SPECS/kernel.spec RHEL5_TREE := /usr/src/redhat/BUILD/kernel-2.6.18/linux-$(shell uname -r).$(shell uname -i) RHEL5_SRC := /usr/src/kernels/$(shell uname -r)-$(shell uname -i) KERNEL_TREE := $(RHEL5_TREE) endif # Check for OpenVZ (/proc/vz) OPENVZ_VER ?= $(shell if [ -e /proc/vz ]; then grep 5.[0-9] /etc/redhat-release; else false; fi) ifneq "$(OPENVZ_VER)" "" RHEL5_SPEC := /usr/src/redhat/SPECS/kernel-ovz.spec RHEL5_TREE := /usr/src/redhat/BUILD/ovzkernel-2.6.18/linux-$(shell uname -r).$(shell uname -i) KERNEL_TREE := $(RHEL5_TREE) endif obj-m += flashcache.o flashcache-objs := flashcache_conf.o flashcache_main.o flashcache_subr.o flashcache_ioctl.o flashcache_procfs.o flashcache_reclaim.o flashcache_kcopy.o .PHONY: all all: modules utils .PHONY: modules modules: $(RHEL5_SETUP) make -C $(KERNEL_TREE) M=$(PWD) modules V=0 .PHONY: utils utils: make -C utils all .PHONY: modules_install modules_install: modules install -o root -g root -m 0755 -d $(DESTDIR)/lib/modules/$(KERNEL_SOURCE_VERSION)/extra/flashcache/ install -o root -g root -m 0755 flashcache.ko $(DESTDIR)/lib/modules/$(KERNEL_SOURCE_VERSION)/extra/flashcache/ depmod -a $(KERNEL_SOURCE_VERSION) .PHONY: utils_install utils_install: utils make -C utils install .PHONY: ocf_install ocf_install: make -C ocf install .PHONY: install install: modules_install utils_install ocf_install .PHONY: clean clean: make -C $(KERNEL_TREE) M=$(PWD) clean make -C utils clean .PHONY: rhel5-setup rhel5-setup: $(RHEL5_TREE) make -C $(RHEL5_TREE) oldconfig ; \ make -C $(RHEL5_TREE) prepare modules_prepare ln -s -f $(RHEL5_SRC)/Module.symvers $(RHEL5_TREE)/Module.symvers $(RHEL5_TREE): rpmbuild -bp --target=`uname -m` $(RHEL5_SPEC) 2>&1 | tee `dirname $(RHEL5_SPEC)`/prep.log flashcache-3.1.3+git20150701/src/dkms.conf000066400000000000000000000003061254507146700176040ustar00rootroot00000000000000BUILT_MODULE_NAME=flashcache DEST_MODULE_LOCATION=/kernel/drivers/block PACKAGE_NAME=flashcache PACKAGE_VERSION= AUTOINSTALL=yes REMAKE_INITRD=yes MAKE="KERNEL_TREE=$kernel_source_dir make modules" flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/000077500000000000000000000000001254507146700216375ustar00rootroot00000000000000flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/10-flashcache.rules000066400000000000000000000004301254507146700252070ustar00rootroot00000000000000# Written by John Newbigin # We want these rules to run before LVM so we can # Intercerpt pv if necessary (at least I think so) SUBSYSTEM!="block", GOTO="fc_end" ACTION!="add|change", GOTO="fc_end" RUN+="/sbin/fc_scan $env{DEVNAME}" LABEL="fc_end" 
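The udev rule above runs /sbin/fc_scan for every added or changed block device; fc_scan and parse-flashcache.sh (below) then split colon-separated real_device:ssd_device:name[:mode] entries, defaulting the mode to thru. Here is a minimal C sketch of the same split, assuming well-formed entries whose fields contain no colons; fc_parse() is a hypothetical helper shown only for illustration, while the shipped scripts do this with POSIX ${var%%:*}/${var#*:} expansions.

#include <stdio.h>
#include <string.h>

struct fc_conf {
        char dev[128], ssd[128], name[128], mode[16];
};

/* Split "real_device:ssd_device:name[:mode]"; the mode defaults to
 * "thru", matching parse-flashcache.sh.  Returns 0 on success and
 * -1 if a mandatory field is missing. */
static int fc_parse(const char *arg, struct fc_conf *c)
{
        char tmp[512], *save = NULL, *tok;

        snprintf(tmp, sizeof(tmp), "%s", arg);
        if ((tok = strtok_r(tmp, ":", &save)) == NULL) return -1;
        snprintf(c->dev, sizeof(c->dev), "%s", tok);
        if ((tok = strtok_r(NULL, ":", &save)) == NULL) return -1;
        snprintf(c->ssd, sizeof(c->ssd), "%s", tok);
        if ((tok = strtok_r(NULL, ":", &save)) == NULL) return -1;
        snprintf(c->name, sizeof(c->name), "%s", tok);
        tok = strtok_r(NULL, ":", &save);
        snprintf(c->mode, sizeof(c->mode), "%s", tok ? tok : "thru");
        return 0;
}

int main(void)
{
        struct fc_conf c;

        if (fc_parse("/dev/sda2:/dev/sdb:fc_sda2:back", &c) == 0)
                printf("dev=%s ssd=%s name=%s mode=%s\n",
                       c.dev, c.ssd, c.name, c.mode);
        return 0;
}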
flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/90flashcache/000077500000000000000000000000001254507146700240715ustar00rootroot00000000000000flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/90flashcache/63-flashcache.rules000066400000000000000000000005171254507146700274570ustar00rootroot00000000000000# Written by John Newbigin # Based on 64-lvm.rules # We want these rules to run before LVM so we can # Intercept pv if necessary (at least I think so) SUBSYSTEM!="block", GOTO="fc_end" ACTION!="add|change", GOTO="fc_end" RUN+="/sbin/initqueue --settled --onetime --unique /sbin/fc_scan" LABEL="fc_end" flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/90flashcache/fc_scan000077500000000000000000000032761254507146700254170ustar00rootroot00000000000000#!/bin/sh . /lib/dracut-lib.sh # we will read a config file and set up the flashcache # the config file is generated by the parse module info "Running fc_scan" if [ -f /etc/flashcache.conf ] ; then for fc_conf in $(cat /etc/flashcache.conf) ; do info "Starting flashcache for $fc_conf" fc_dev="${fc_conf%%:*}" fc_conf="${fc_conf#*:}" fc_ssd="${fc_conf%%:*}" fc_conf="${fc_conf#*:}" fc_name="${fc_conf%%:*}" fc_conf="${fc_conf#*:}" if [ -e "$fc_dev" -a -e "$fc_ssd" ] ; then if [ ! -e "/dev/mapper/$fc_name" ] ; then if [ "$fc_conf" = "back" ] ; then # how do we know if the load worked? # If the load fails, assume we need to create a new one (first use) flashcache_load "$fc_ssd" "$fc_name" || \ flashcache_create -v -p "$fc_conf" "$fc_name" "$fc_ssd" "$fc_dev" elif [ "$fc_conf" = "none" ] ; then # We just want to remove any existing writeback cache # do we need some safety to not remove a dirty cache? flashcache_destroy "$fc_ssd" else # if the create fails it might be because there is an old writeback header # what happens if it is dirty?
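# i.e. destroy whatever old header is on the SSD and try the create again; # if that old cache was a dirty writeback cache, its unwritten blocks are lost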
flashcache_create -v -p "$fc_conf" "$fc_name" "$fc_ssd" "$fc_dev" || \ ( flashcache_destroy "$fc_ssd" && \ flashcache_create -v -p "$fc_conf" "$fc_name" "$fc_ssd" "$fc_dev" ) fi else info "Already active" fi else info "Devices not ready" fi done fi info "fc_scan done" unset fc_conf unset fc_name unset fc_ssd unset fc_dev flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/90flashcache/install000077500000000000000000000004361254507146700254700ustar00rootroot00000000000000inst flashcache dracut_install /sbin/flashcache_create dracut_install /sbin/flashcache_load dracut_install /sbin/flashcache_destroy inst "$moddir/fc_scan" "/sbin/fc_scan" inst_hook cmdline 29 "$moddir/parse-flashcache.sh" inst_rules "$moddir/63-flashcache.rules" 63-flashcache.rules flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/90flashcache/installkernel000077500000000000000000000000251254507146700266630ustar00rootroot00000000000000instmods =flashcache flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/90flashcache/parse-flashcache.sh000077500000000000000000000017521254507146700276260ustar00rootroot00000000000000# We need to make a (temporary) config file so the # udev half can do something when it finds the right # devices # real_device:ssd_device:name[:mode] # /dev/vda2:/dev/vdb:fc_vda2:thru # if mode is not specified, 'thru' is used # although back knows which real_device and name to use # we still want to know so we can make sure the device is availale # but has not already been started for fc_conf in $(getargs rd_FLASHCACHE=); do #echo "FLASHCACHE for $conf" fc_dev="${fc_conf%%:*}" fc_conf="${fc_conf#*:}" fc_ssd="${fc_conf%%:*}" fc_conf="${fc_conf#*:}" fc_name="${fc_conf%%:*}" fc_conf="${fc_conf#*:}" if [ "$fc_conf" = "back" ] ; then fc_mode=back elif [ "$fc_conf" = "around" ] ; then fc_mode=around elif [ "$fc_conf" = "none" ] ; then fc_mode=none else fc_mode=thru fi echo "$fc_dev:$fc_ssd:$fc_name:$fc_mode" >> /etc/flashcache.conf done unset fc_dev unset fc_ssd unset fc_name unset fc_mode unset fc_conf flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/COPYING000066400000000000000000000431031254507146700226730ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. 
To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. 
You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. 
For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/Makefile000066400000000000000000000014571254507146700233060ustar00rootroot00000000000000VERSION=`cat version` VERSION_CVS=`cat version | sed 's/\./_/'` APP=dracut-flashcache RELEASE=1 # $@ is the target: # $? out of date dependencies # $< is the source file # $(OBJS:.o=.c) converts data.o io.o main.o into data.c io.c main.c $(APP).spec: $(APP).spec.in version echo "# Do not edit this file. 
 $@">
Edit $(APP).spec.in" > $@ echo "%define ver $(VERSION)" >> $@ echo "%define rel $(RELEASE)" >> $@ cat $(APP).spec.in >> $@ rtag: cvs rtag v$(VERSION_CVS) $(APP) tarball: $(APP).spec if [ -L ../$(APP)-$(VERSION) ] ; then rm ../$(APP)-$(VERSION) ; fi ln -s $(APP) ../$(APP)-$(VERSION) tar -chzf ../$(APP)-$(VERSION).tgz -C ../ --exclude CVS --exclude \*.swp $(APP)-$(VERSION) if [ -L ../$(APP)-$(VERSION) ] ; then rm ../$(APP)-$(VERSION) ; fi rpm: tarball rpmbuild -ta ../$(APP)-$(VERSION).tgz flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/README000066400000000000000000000237641254507146700225300ustar00rootroot00000000000000dracut-flashcache version 0.3 ============================= This is a dracut module which will enable you to use flashcache on your root (/) filesystem or LVM PV. Written by John Newbigin It has been written and tested on CentOS-6 and should work the same on RHEL6. It will probably work on Fedora but they might have changed things. WARNING ======= A mistake here could delete all your data. Be careful! Preparation =========== You don't actually need a Solid State Drive (SSD), any disk will work. These instructions will use the term SSD to represent your selected 'cache' disk. Start by physically installing your disk and make sure it is detected (in /proc/partitions). Do a backup of your system. If you accidentally get the SSD and the HDD round the wrong way all your data could go in a flash (no pun intended). You will need the following RPM packages installed * kmod-flashcache from ELRepo - This is the actual 'driver' http://elrepo.org/tiki/kmod-flashcache * flashcache-utils from ELRepo - These are tools required to 'start the driver' http://elrepo.org/tiki/flashcache-utils * dracut-flashcache (this package) - This will use the flashcache-utils during boot http://www.chrysocome.net/download Planning ======== 1. Choose where to store your cache. I chose to partition my SSD. This is not necessary but I want to use it to cache two separate LVM groups so I need two partitions. I use partition type '0xda Non-FS data' but there is no standard and it does not really matter. 2. Choose what to store in your cache. If you are not using LVM then you probably want to cache an entire partition (or disk? TODO: can a flashcache device have partitions?). Chances are for a simple setup it is /dev/sda2 If you are using LVM then you can choose to cache a Physical Volume or Logical Volume. If you only have one PV then that is a handy thing to cache because your filesystem device names will be the same so you have less configuration to do. If you have multiple PV then you can cache one LV instead. This will speed up all of your PV at the same time. Your filesystem device will change which might mean more work. If you have multiple PV and multiple LV or just want to have multiple caches, not to worry, just partition your SSD and set them up one at a time. Setup ===== 1. Edit /etc/lvm/lvm.conf and blacklist your SSD with a command like this: filter = [ "r|/dev/sdb|" ] # This will prevent LVM from seeing the signature at the start of the device and thinking it should scan this device (this is a bad thing) # We are relying on the regex to match all partitions on the disk as well as the disk # (Do not include the "a/.*/" in your filter or your r will not be processed) 2. Build a new initrd which will contain your updated lvm.conf mkinitrd -f /boot/initramfs-$(uname -r).img $(uname -r) 3. Edit your grub.conf and add this to the kernel line: rd_FLASHCACHE=/dev/HDD:/dev/SSD:fc_HDD 4.
In grub.conf, if you have root=/dev/HDD then change it so that root=/dev/mapper/fc_HDD Reboot! Once you boot up successfully, it will still take some time for your recent disk access to fill the cache. Don't expect instant results. Advanced Setup ============== Once you have writethru caching working you can try writeback by editing grub.conf to have rd_FLASHCACHE=/dev/HDD:/dev/SSD:fc_HDD:back This will keep your cache over a reboot. This gives faster boot times but has some risks associated with it. Uninstalling ============ If you set up a writeback cache and you then want to remove it, you can safely do this by editing grub.conf and setting it to type none. After a reboot it will be gone and then you can edit grub.conf and remove the rd_FLASHCACHE= option totally. Examples ======== Using basic LVM and want to cache your PV root=/dev/mapper/vg0-lv_root rd_FLASHCACHE=/dev/sda2:/dev/sdb:fc_sda2 Using basic LVM and want to cache your root LV only: root=/dev/mapper/fc_root rd_FLASHCACHE=/dev/mapper/vg0-lv_root:/dev/sdb:fc_root Using software RAID and want to cache the raid dev: rd_FLASHCACHE=/dev/md0:/dev/sdb:fc_md0 Notes ===== I don't think we need to lvm blacklist the real disk. Once the flashcache is loaded, lvm won't be able to use the device. This allows a fall-back in the case that the flashcache does not load. Only if you have a dirty writeback cache will this be a problem... (and it could be a BIG problem, particularly if you flush the cache in the future after you have done an fsck on the real disk device!) My examples use fc_... as the cache name. I think this helps to remember where the data is coming from. The actual string you use is up to you. Writeback mode does not have any on disk header/signature so there is no safety if you make a mistake with your device names. Be careful. What if you want flashcache for a disk which is not needed at boot time? Should we have a config file or should there be a run time udev rule to read the command line? For now, specify them all in grub.conf and the runtime udev rule will process them once your disk subsystem is ready so you can use software RAID etc. dracut-flashcache Copyright (C) 2012 John Newbigin This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. == Old Notes == dracut module for flashcache Written by John Newbigin jnewbigin@chrysocome.net This will enable you to use the elrepo packaged flashcache module with your root filesystem and/or physical volumes. Step : Install the required software yum install flashcache-utils kmod-flashcache dracut-flashcache Step : Blacklist your SSD To prevent LVM from getting confused you should configure it so that it never tries to find physical volumes on your ssd disk or partitions (depending on your setup). To do this, edit /etc/lvm/lvm.conf and change your 'filter =' entry.
My ssd is /dev/sdd so my filter looks like this: filter = [ "r|/dev/sdd|" ] I could also do this: filter = [ "r|$/dev/sdd1^|", "r|$/dev/sdd2^|" ] If I wanted to be more specific. (Do not include the "a/.*/" or your r will not be processed) I don't think we need to blacklist the real disk. Once the flashcache is loaded, lvm won't be able to use the device. This allows a fallback in the case that the flashcache does not load. Only if you have a dirty writeback cache will this be a problem..... Step : Build a new initrd We need to get the flashcache files and your new lvm config into the initrd mkinitrd -f /boot/initramfs-$(uname -r).img $(uname -r) Step : Edit grub.conf You can of course do this from grub but it is much easier to do with a text editor. Add this to the end of your kernel line: rd_FLASHCACHE=/dev/my_real_disk:/dev/my_ssd:fc_new_name You must substitute the correct values for my_real_disk, my_ssd and fc_new_name my_real_disk is where you store your data. It might be a disk or partition or a logical volume. eg: /dev/sda2 (partition) my_ssd is your high speed disk (probably an ssd). eg: /dev/sdb1 (a partition) fc_new_name is the name used to access the cached device. I recommend fc_ followed by the original name. eg: fc_sda2 (don't use /dev here) Note: it is possible for disk names to change so it might be safer to use a unique name for your devices, something from /dev/disk/by-id/ Unfortunately I use : as the separator so you can't use /dev/disk/by-path/ There is also an optional 4th parameter which I will cover below. Step : Reboot Using write back ================ The default mode is writethrough (or thru) which will ensure that your data is safely stored on your real disk. This is the safest option because if you have a crash/powerfail, ssd fail or boot problem your data is safe. For better write performance and read performance from boot, you can enable writeback mode. This is relatively safe. The problem is if you crash or powerfail and then have an ssd fail or boot problem then you can lose data. This may just cause a loss of recent changes but it could also cause filesystem corruption and a total loss of everything. (What would happen if: boot with writeback. crash. Boot without any flashcache & repair filesystem (say you accidentally boot into a live CD). Then reboot and re-activate your dirty writeback. Stale data is now written onto your disk causing fresh corruption). Don't enable this until you know that: - You can boot/reboot successfully with write thru - You have a UPS - You have backups of your data To enable writeback mode, add :back to the end of your rd_FLASHCACHE settings. You can change this in the future and revert to thru but you must do a clean reboot to correctly remove the writeback cache so you don't lose data. i.e. To remove a writeback cache: * Boot with :back * Do a clean shut down * Boot with :thru * shut down * Boot with no rd_FLASHCACHE Finally, if you enable fast_remove, every reboot may (will?) leave data in the cache only so you must reboot in order to save your valuable data. In this configuration you can't change the type to thru or you lose your data. First, disable fast_remove, then reboot, then reboot again and remove the writeback cache.
HDD   SSD   MODE   ACTION
x     x     -      thru
x     x     new    thru
x     x     -      back
x     x     new    back
x     x     -      auto
x     x     new    auto
x     x     -      none
x     x     back   back
auto  x     back   back
x     x     back   thru
x     x     back   auto
auto  x     back   auto
x     x     back   none
x     x     new    thru
x     x     new    back
x     x     new    auto
x     x     new    none
flashcache_init /dev/ssd
    Write a header to the SSD to identify it as a candidate for use at create time.
flashcache_info /dev/xxx
    Query the dev: - N/A - Clean - Dirty - Fastclean? - Unstable? - New
flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/build.sh000066400000000000000000000001501254507146700232660ustar00rootroot00000000000000mkinitrd -f /boot/initramfs-$(uname -r).img $(uname -r) zcat /boot/initramfs-$(uname -r).img | cpio -it flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/dracut-flashcache.spec000066400000000000000000000031021254507146700260500ustar00rootroot00000000000000# Do not edit this file. Edit dracut-flashcache.spec.in %define ver 0.3 %define rel 1 Summary: Dracut modules to build a dracut initramfs with flashcache support Name: dracut-flashcache Version: %{ver} %if %{rel} Release: %{rel}%{?dist:%{dist}} %else Release: 1%{?dist:%{dist}} %endif Group: System Environment/Base Source0: %{name}-%{ver}.tgz License: GPL BuildRoot: %{_tmppath}/%{name}-root Packager: John Newbigin (jnewbigin@chrysocome.net) Vendor: Chrysocome Requires: dracut #Requires: flashcache-utils #Requires: kmod-flashcache BuildArch: noarch %description This package enables creating and starting flashcache in initrd so you can use flashcache on your root filesystem or physical volumes. It also enables using flashcache on other disks which are set up during rc.sysinit (via udev). %prep %setup %build %install [ "$RPM_BUILD_ROOT" != "/" ] && rm -rf $RPM_BUILD_ROOT mkdir -p $RPM_BUILD_ROOT/sbin/ mkdir -p $RPM_BUILD_ROOT/lib/udev/rules.d/ mkdir -p $RPM_BUILD_ROOT/usr/share/dracut/modules.d/ cp fc_scan $RPM_BUILD_ROOT/sbin/ cp 10-flashcache.rules $RPM_BUILD_ROOT/lib/udev/rules.d/ cp -r 90flashcache $RPM_BUILD_ROOT/usr/share/dracut/modules.d/ %files %defattr(-,root,root,0755) /sbin/fc_scan /lib/udev/rules.d/10-flashcache.rules /usr/share/dracut/modules.d/ %doc README %doc COPYING %changelog * Mon Dec 3 2012 John Newbigin - 0.3 - Fix up a few fc_scan bugs - Add mode none * Sat Jul 14 2012 John Newbigin - 0.2 - Add scripts for the real udev * Sun Jun 03 2012 John Newbigin - 0.1 - First cut flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/dracut-flashcache.spec.in000066400000000000000000000027521254507146700264670ustar00rootroot00000000000000Summary: Dracut modules to build a dracut initramfs with flashcache support Name: dracut-flashcache Version: %{ver} %if %{rel} Release: %{rel}%{?dist:%{dist}} %else Release: 1%{?dist:%{dist}} %endif Group: System Environment/Base Source0: %{name}-%{ver}.tgz License: GPL BuildRoot: %{_tmppath}/%{name}-root Packager: John Newbigin (jnewbigin@chrysocome.net) Vendor: Chrysocome Requires: dracut Requires: flashcache-utils Requires: kmod-flashcache BuildArch: noarch %description This package enables creating and starting flashcache in initrd so you can use flashcache on your root filesystem or physical volumes. It also enables using flashcache on other disks which are set up during rc.sysinit (via udev).
%prep %setup %build %install [ "$RPM_BUILD_ROOT" != "/" ] && rm -rf $RPM_BUILD_ROOT mkdir -p $RPM_BUILD_ROOT/sbin/ mkdir -p $RPM_BUILD_ROOT/lib/udev/rules.d/ mkdir -p $RPM_BUILD_ROOT/usr/share/dracut/modules.d/ cp fc_scan $RPM_BUILD_ROOT/sbin/ cp 10-flashcache.rules $RPM_BUILD_ROOT/lib/udev/rules.d/ cp -r 90flashcache $RPM_BUILD_ROOT/usr/share/dracut/modules.d/ %files %defattr(-,root,root,0755) /sbin/fc_scan /lib/udev/rules.d/10-flashcache.rules /usr/share/dracut/modules.d/ %doc README %doc COPYING %changelog * Mon Dec 3 2012 John Newbign - 0.3 - Fix up a few fc_scan bugs - Add mode none * Sat Jul 14 2012 John Newbigin - 0.2 - Add scripts for the real udev * Sun Jun 03 2012 John Newbigin - 0.1 - First cut flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/fc_scan000077500000000000000000000047641254507146700231740ustar00rootroot00000000000000#!/bin/sh # This bit of voodoo will get out output into /var/log/messages #npipe=/tmp/$$.tmp #trap "rm -f $npipe" EXIT #mknod $npipe p #logger < $npipe & #exec 1>&- #exec 1>$npipe 2>&1 exec > /dev/console 2>&1 # flashcache_create calls dmsetup which must be in the path export PATH=/sbin:/bin:/usr/sbin:/usr/bin # we will read a config file and set up the flashcache # the config file is generated by the parse module for i in $(cat /proc/cmdline) ; do pram=${i%%=*} if [ "$pram" = "rd_FLASHCACHE" ] ; then conf="$conf ${i#*=}" fi done echo "Running fc_scan $*" udev_dev=$1 #set #if [ -f /etc/flashcache.conf ] ; then # for fc_conf in $(cat /etc/flashcache.conf) ; do for fc_conf in $conf ; do fc_description=$fc_conf fc_dev="${fc_conf%%:*}" fc_conf="${fc_conf#*:}" fc_ssd="${fc_conf%%:*}" fc_conf="${fc_conf#*:}" fc_name="${fc_conf%%:*}" fc_conf="${fc_conf#*:}" if [ "$fc_dev" != "$udev_dev" ] ; then continue fi echo "Starting flashcache for $fc_description" if [ -e "$fc_dev" -a -e "$fc_ssd" ] ; then if [ ! -e "/dev/mapper/$fc_name" ] ; then if [ "$fc_conf" = "back" ] ; then # how do we know if the load worked? # If the load fails, assume we need to create a new one (first use) flashcache_load "$fc_ssd" "$fc_name" || ( echo "Load failed... creating new writeback cache" && \ flashcache_create -v -p "$fc_conf" "$fc_name" "$fc_ssd" "$fc_dev" ) elif [ "$fc_conf" = "none" ] ; then # We just want to remove any existing writeback cache # do we need some safety to not remove a dirty cache? flashcache_destroy "$fc_ssd" else # if the create failes is might be because there is an old writeback header # what happens if it is dirty? echo Calling flashcache_create -v -p "$fc_conf" "$fc_name" "$fc_ssd" "$fc_dev" flashcache_create -v -p "$fc_conf" "$fc_name" "$fc_ssd" "$fc_dev" || ( echo "Create failed... removing old writeback cache" && \ flashcache_destroy "$fc_ssd" && sleep 2 && \ flashcache_create -v -p "$fc_conf" "$fc_name" "$fc_ssd" "$fc_dev" ) fi sleep 1 else echo "Already active" fi else echo "Devices not ready" fi done #fi echo "fc_scan done" unset fc_conf unset fc_name unset fc_ssd unset fc_dev flashcache-3.1.3+git20150701/src/dracut-flashcache-0.3/version000066400000000000000000000000041254507146700232410ustar00rootroot000000000000000.3 flashcache-3.1.3+git20150701/src/flashcache.h000066400000000000000000000660101254507146700202350ustar00rootroot00000000000000/**************************************************************************** * flashcache.h * FlashCache: Device mapper target for block-level disk caching * * Copyright 2010 Facebook, Inc. 
* Author: Mohan Srinivasan (mohan@fb.com) * * Based on DM-Cache: * Copyright (C) International Business Machines Corp., 2006 * Author: Ming Zhao (mingzhao@ufl.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . ****************************************************************************/ #ifndef FLASHCACHE_H #define FLASHCACHE_H #define FLASHCACHE_VERSION 4 #define DEV_PATHLEN 128 #ifdef __KERNEL__ /* Like ASSERT() but always compiled in */ #define VERIFY(x) do { \ if (unlikely(!(x))) { \ dump_stack(); \ panic("VERIFY: assertion (%s) failed at %s (%d)\n", \ #x, __FILE__ , __LINE__); \ } \ } while(0) #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) #define bi_sector bi_iter.bi_sector #define bi_size bi_iter.bi_size #define bi_idx bi_iter.bi_idx #endif #define DMC_DEBUG 0 #define DMC_DEBUG_LITE 0 #define DM_MSG_PREFIX "flashcache" #define DMC_PREFIX "flashcache: " #if DMC_DEBUG #define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg) #else #define DPRINTK( s, arg... ) #endif /* * Finegrained locking note : * All of flashcache used to be protected by a single cache_spin_lock. * That has been removed, and per-set spinlocks have been introduced. struct cache_set : set_spin_lock Protects cache set and every cacheblock in the cache set. Can be acquired from softirq paths ! struct cache_md_block_head : md_block_lock Protects state for the metadata block head. Can be acquired from softirq paths ! The following locks protect various state within the dmc. All of these are held for short sections. struct cache_c : ioctl_lock struct cache_c : cache_pending_q_spinlock Lock Ordering. set_spin_lock must be acquired before any of the other locks. set_spin_lock (Acquired in increasing order of sets ! If you must acquire 2 set_spin_locks, acquire the lock on set i before set i+1. Acquiring locks on multiple sets should be done using flashcache_setlocks_multiget/drop). md_block_lock ioctl_lock cache_pending_q_spinlock Important Locking Note : ---------------------- softirq into flashcache (IO completion path) acquires the cache set lock. Therefore *any* * (process context) codepath that acquires any other spinlock after acquiring the cache set spinlock *must* disable irq's. Else, we get an irq holding the cache set lock -> other spinlock and we deadlock on the cache set lock. These locks are all acquired *after* acquiring the cache set spinlocks, which means *EVERY* acquisition of these locks must disable irq's to address the above race ! Every acquisition of md_block_lock ioctl_lock cache_pending_q_spinlock MUST DISABLE IRQs. */ /* * Block checksums : * Block checksums seem a good idea (especially for debugging, I found a couple * of bugs with this), but in practice there are a number of issues with this * in production. * 1) If a flash write fails, there is no guarantee that the failure was atomic. * Some sectors may have been written to flash. If so, the checksum we have * is wrong. We could re-read the flash block and recompute the checksum, but * the read could fail too. 
 * 2) On a node crash, we could have crashed between the flash data write and the
 * flash metadata update (which updates the new checksum to flash metadata). When
 * we reboot, the checksum we read from metadata is wrong. This is worked around
 * by having the cache load recompute checksums after an unclean shutdown.
 * 3) Checksums require 4 or 8 more bytes per block in terms of metadata overhead.
 * Especially because the metadata is wired into memory.
 * 4) Checksums force us to do a flash metadata IO on a block re-dirty. If we
 * didn't maintain checksums, we could avoid the metadata IO on a re-dirty.
 * Therefore in production we disable block checksums.
 */
#if 0
#define FLASHCACHE_DO_CHECKSUMS
#endif

#if DMC_DEBUG_LITE
#define DPRINTK_LITE( s, arg... ) printk(DMC_PREFIX s "\n", ##arg)
#else
#define DPRINTK_LITE( s, arg... )
#endif

/* Number of pages for I/O */
#define FLASHCACHE_COPY_PAGES (1024)

/* Default cache parameters */
#define DEFAULT_CACHE_SIZE 65536
#define DEFAULT_CACHE_ASSOC 512
#define DEFAULT_DISK_ASSOC 512 /* 256 KB in 512b sectors */
#define DEFAULT_BLOCK_SIZE 8 /* 4 KB */
#define DEFAULT_MD_BLOCK_SIZE 8 /* 4 KB */
#define DEFAULT_MD_BLOCK_SIZE_BYTES (DEFAULT_MD_BLOCK_SIZE * 512) /* 4 KB */
#define FLASHCACHE_MAX_MD_BLOCK_SIZE 128 /* 64 KB */
#define FLASHCACHE_FIFO 0
#define FLASHCACHE_LRU 1

/*
 * The LRU pointers are maintained as set-relative offsets, instead of
 * pointers. This enables us to store the LRU pointers per cacheblock
 * using 4 bytes instead of 16 bytes. The upshot of this is that we
 * are required to clamp the associativity at an 8K max.
 */
#define FLASHCACHE_MIN_ASSOC 256
#define FLASHCACHE_MAX_ASSOC 8192
#define FLASHCACHE_MIN_DISK_ASSOC 256 /* Min Disk Assoc of 128KB in sectors */
#define FLASHCACHE_MAX_DISK_ASSOC 2048 /* Max Disk Assoc of 1MB in sectors */
#define FLASHCACHE_NULL 0xFFFF

struct cacheblock;

struct cache_set {
	spinlock_t set_spin_lock;
	u_int32_t set_fifo_next;
	u_int32_t set_clean_next;
	u_int16_t clean_inprog;
	u_int16_t nr_dirty;
	u_int16_t dirty_fallow;
	unsigned long fallow_tstamp;
	unsigned long fallow_next_cleaning;
	/*
	 * 2 LRU queues/cache set.
	 * 1) A block is faulted into the MRU end of the warm list from disk.
	 * 2) When the # of accesses hits a threshold, it is promoted to the
	 * (MRU) end of the hot list. To keep the lists in equilibrium, the
	 * LRU block from the hot list moves to the MRU end of the warm list.
	 * 3) Within each list, an access will move the block to the MRU end.
	 * 4) Reclaims happen from the LRU end of the warm list. After reclaim
	 * we move a block from the LRU end of the hot list to the MRU end of
	 * the warm list.
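	 *
	 * For scale, with the defaults set in flashcache_ctr() (assoc = 512,
	 * sysctl_lru_hot_pct = 75, sysctl_lru_promote_thresh = 2), each set
	 * keeps roughly 384 blocks on the hot list and 128 on the warm list,
	 * and the "threshold" in step 2 is the block's use_cnt reaching 2.
	 * (Illustrative arithmetic from those defaults, not a separate tunable.)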
*/ u_int16_t hotlist_lru_head, hotlist_lru_tail; u_int16_t warmlist_lru_head, warmlist_lru_tail; u_int16_t lru_hot_blocks, lru_warm_blocks; #define NUM_BLOCK_HASH_BUCKETS 512 u_int16_t hash_buckets[NUM_BLOCK_HASH_BUCKETS]; u_int16_t invalid_head; }; struct flashcache_errors { int disk_read_errors; int disk_write_errors; int ssd_read_errors; int ssd_write_errors; int memory_alloc_errors; }; struct flashcache_stats { unsigned long reads; /* Number of reads */ unsigned long writes; /* Number of writes */ unsigned long read_hits; /* Number of cache hits */ unsigned long write_hits; /* Number of write hits (includes dirty write hits) */ unsigned long dirty_write_hits; /* Number of "dirty" write hits */ unsigned long replace; /* Number of cache replacements */ unsigned long wr_replace; unsigned long wr_invalidates; /* Number of write invalidations */ unsigned long rd_invalidates; /* Number of read invalidations */ unsigned long pending_inval; /* Invalidations due to concurrent ios on same block */ #ifdef FLASHCACHE_DO_CHECKSUMS unsigned long checksum_store; unsigned long checksum_valid; unsigned long checksum_invalid; #endif unsigned long enqueues; /* enqueues on pending queue */ unsigned long cleanings; unsigned long fallow_cleanings; unsigned long noroom; /* No room in set */ unsigned long md_write_dirty; /* Metadata sector writes dirtying block */ unsigned long md_write_clean; /* Metadata sector writes cleaning block */ unsigned long md_write_batch; /* How many md updates did we batch ? */ unsigned long md_ssd_writes; /* How many md ssd writes did we do ? */ unsigned long pid_drops; unsigned long pid_adds; unsigned long pid_dels; unsigned long expiry; unsigned long front_merge, back_merge; /* Write Merging */ unsigned long uncached_reads, uncached_writes; unsigned long uncached_sequential_reads, uncached_sequential_writes; unsigned long disk_reads, disk_writes; unsigned long ssd_reads, ssd_writes; unsigned long uncached_io_requeue; unsigned long skipclean; unsigned long trim_blocks; unsigned long clean_set_ios; unsigned long force_clean_block; unsigned long lru_promotions; unsigned long lru_demotions; }; struct diskclean_buf_ { struct diskclean_buf_ *next; }; /* * Sequential block history structure - each one * records a 'flow' of i/o. */ struct sequential_io { sector_t most_recent_sector; unsigned long sequential_count; /* We use LRU replacement when we need to record a new i/o 'flow' */ struct sequential_io *prev, *next; }; #define SKIP_SEQUENTIAL_THRESHOLD 0 /* 0 = cache all, >0 = dont cache sequential i/o more than this (kb) */ #define SEQUENTIAL_TRACKER_QUEUE_DEPTH 32 /* How many io 'flows' to track (random i/o will hog many). 
* This should be large enough so that we don't quickly * evict sequential i/o when we see some random, * but small enough that searching through it isn't slow * (currently we do linear search, we could consider hashed */ /* * Cache context */ struct cache_c { struct dm_target *tgt; struct dm_dev *disk_dev; /* Source device */ struct dm_dev *cache_dev; /* Cache device */ int on_ssd_version; struct cacheblock *cache; /* Hash table for cache blocks */ struct cache_set *cache_sets; struct cache_md_block_head *md_blocks_buf; /* None of these change once cache is created */ unsigned int md_block_size; /* Metadata block size in sectors */ sector_t size; /* Cache size */ unsigned int assoc; /* Cache associativity */ unsigned int block_size; /* Cache block size */ unsigned int block_shift; /* Cache block size in bits */ unsigned int block_mask; /* Cache block mask */ int md_blocks; /* Numbers of metadata blocks, including header */ unsigned int disk_assoc; /* Disk associativity */ unsigned int disk_assoc_shift; /* Disk associativity in bits */ unsigned int assoc_shift; /* Consecutive blocks size in bits */ unsigned int num_sets; /* Number of cache sets */ int cache_mode; int write_only_cache; wait_queue_head_t destroyq; /* Wait queue for I/O completion */ /* XXX - Updates of nr_jobs should happen inside the lock. But doing it outside is OK since the filesystem is unmounted at this point */ atomic_t nr_jobs; /* Number of I/O jobs */ #define SLOW_REMOVE 1 #define FAST_REMOVE 2 atomic_t remove_in_prog; int dirty_thresh_set; /* Per set dirty threshold to start cleaning */ int max_clean_ios_set; /* Max cleaning IOs per set */ int max_clean_ios_total; /* Total max cleaning IOs */ atomic_t clean_inprog; atomic_t sync_index; atomic_t nr_dirty; atomic_t cached_blocks; /* Number of cached blocks */ atomic_t pending_jobs_count; int num_block_hash_buckets; /* Stats */ struct flashcache_stats flashcache_stats; /* Errors */ struct flashcache_errors flashcache_errors; #define IO_LATENCY_GRAN_USECS 250 #define IO_LATENCY_MAX_US_TRACK 10000 /* 10 ms */ #define IO_LATENCY_BUCKETS (IO_LATENCY_MAX_US_TRACK / IO_LATENCY_GRAN_USECS) unsigned long latency_hist[IO_LATENCY_BUCKETS]; unsigned long latency_hist_10ms; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) struct work_struct delayed_clean; #else struct delayed_work delayed_clean; #endif spinlock_t ioctl_lock; /* XXX- RCU! */ unsigned long pid_expire_check; struct flashcache_cachectl_pid *blacklist_head, *blacklist_tail; struct flashcache_cachectl_pid *whitelist_head, *whitelist_tail; int num_blacklist_pids, num_whitelist_pids; unsigned long blacklist_expire_check, whitelist_expire_check; atomic_t hot_list_pct; int lru_hot_blocks; int lru_warm_blocks; spinlock_t cache_pending_q_spinlock; #define PENDING_JOB_HASH_SIZE 32 struct pending_job *pending_job_hashbuckets[PENDING_JOB_HASH_SIZE]; spinlock_t diskclean_list_lock; struct diskclean_buf_ *diskclean_buf_head; spinlock_t kcopy_job_alloc_lock; struct flashcache_copy_job *kcopy_jobs_head; struct cache_c *next_cache; void *sysctl_handle; // DM virtual device name, stored in superblock and restored on load char dm_vdevname[DEV_PATHLEN]; // real device names are now stored as UUIDs char cache_devname[DEV_PATHLEN]; char disk_devname[DEV_PATHLEN]; /* * If the SSD returns errors, in WRITETHRU and WRITEAROUND modes, * bypass the cache completely. If the SSD dies or is removed, * we want to continue sending requests to the device. 
*/ int bypass_cache; /* Per device sysctls */ int sysctl_io_latency_hist; int sysctl_do_sync; int sysctl_stop_sync; int sysctl_dirty_thresh; int sysctl_pid_do_expiry; int sysctl_max_pids; int sysctl_pid_expiry_secs; int sysctl_reclaim_policy; int sysctl_zerostats; int sysctl_error_inject; int sysctl_fast_remove; int sysctl_cache_all; int sysctl_fallow_clean_speed; int sysctl_fallow_delay; int sysctl_skip_seq_thresh_kb; int sysctl_clean_on_read_miss; int sysctl_clean_on_write_miss; int sysctl_lru_hot_pct; int sysctl_lru_promote_thresh; int sysctl_new_style_write_merge; /* Sequential I/O spotter */ struct sequential_io seq_recent_ios[SEQUENTIAL_TRACKER_QUEUE_DEPTH]; struct sequential_io *seq_io_head; struct sequential_io *seq_io_tail; #define FLASHCACHE_WRITE_CLUST_HIST_SIZE 128 unsigned long write_clust_hist[FLASHCACHE_WRITE_CLUST_HIST_SIZE]; unsigned long write_clust_hist_ovf; }; /* kcached/pending job states */ #define READCACHE 1 #define WRITECACHE 2 #define READDISK 3 #define WRITEDISK 4 #define READFILL 5 /* Read Cache Miss Fill */ #define INVALIDATE 6 #define WRITEDISK_SYNC 7 struct kcached_job { struct list_head list; struct cache_c *dmc; struct bio *bio; /* Original bio */ struct job_io_regions { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) struct io_region disk; struct io_region cache; #else struct dm_io_region disk; struct dm_io_region cache; #endif } job_io_regions; int index; int action; int error; struct flash_cacheblock *md_block; struct page_list pl_base[1]; struct timeval io_start_time; struct kcached_job *next; }; struct pending_job { struct bio *bio; int action; int index; struct pending_job *prev, *next; }; struct flashcache_copy_job { struct list_head list; struct cache_c *dmc; int nr_writes; int reads_completed; int write_kickoff; struct page_list *pl_base; struct page_list *pl_list_head; struct page **page_base; struct kcached_job **job_base; struct job_io_regions_ { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) struct io_region disk; struct io_region *cache; #else struct dm_io_region disk; struct dm_io_region *cache; #endif } job_io_regions; int error; spinlock_t copy_job_spinlock; struct flashcache_copy_job *next; }; #endif /* __KERNEL__ */ /* Cache Modes */ enum { FLASHCACHE_WRITE_BACK=1, FLASHCACHE_WRITE_THROUGH=2, FLASHCACHE_WRITE_AROUND=3, }; /* States of a cache block */ #define INVALID 0x0001 #define VALID 0x0002 /* Valid */ #define DISKREADINPROG 0x0004 /* Read from disk in progress */ #define DISKWRITEINPROG 0x0008 /* Write to disk in progress */ #define CACHEREADINPROG 0x0010 /* Read from cache in progress */ #define CACHEWRITEINPROG 0x0020 /* Write to cache in progress */ #define DIRTY 0x0040 /* Dirty, needs writeback to disk */ /* * Old and Dirty blocks are cleaned with a Clock like algorithm. The leading hand * marks DIRTY_FALLOW_1. 900 seconds (default) later, the trailing hand comes along and * marks DIRTY_FALLOW_2 if DIRTY_FALLOW_1 is already set. If the block was used in the * interim, (DIRTY_FALLOW_1|DIRTY_FALLOW_2) is cleared. Any block that has both * DIRTY_FALLOW_1 and DIRTY_FALLOW_2 marked is considered old and is eligible * for cleaning. 
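 *
 * Illustrative timeline with the defaults (sysctl_fallow_delay = FALLOW_DELAY,
 * i.e. 900s): a block dirtied at t=0 and never touched again picks up
 * DIRTY_FALLOW_1 on the first sweep after t=900s and DIRTY_FALLOW_2 (making
 * it FALLOW_DOCLEAN) on the next sweep roughly 900s later; only then does the
 * fallow cleaner, paced by sysctl_fallow_clean_speed (FALLOW_CLEAN_SPEED,
 * default 2), write it back.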
*/ #define DIRTY_FALLOW_1 0x0080 #define DIRTY_FALLOW_2 0x0100 #define FALLOW_DOCLEAN (DIRTY_FALLOW_1 | DIRTY_FALLOW_2) #define BLOCK_IO_INPROG (DISKREADINPROG | DISKWRITEINPROG | CACHEREADINPROG | CACHEWRITEINPROG) /* lru_state in cache block */ #define LRU_HOT 0x0001 /* On Hot LRU List */ #define LRU_WARM 0x0002 /* On Warm LRU List */ /* Cache metadata is read by Flashcache utilities */ #ifndef __KERNEL__ typedef u_int64_t sector_t; #endif /* On Flash (cache metadata) Structures */ #define CACHE_MD_STATE_DIRTY 0xdeadbeef #define CACHE_MD_STATE_CLEAN 0xfacecafe #define CACHE_MD_STATE_FASTCLEAN 0xcafefeed #define CACHE_MD_STATE_UNSTABLE 0xc8249756 /* Cache block metadata structure */ struct cacheblock { u_int16_t cache_state; int16_t nr_queued; /* jobs in pending queue */ u_int16_t lru_prev, lru_next; u_int8_t use_cnt; u_int8_t lru_state; sector_t dbn; /* Sector number of the cached block */ u_int16_t hash_prev, hash_next; #ifdef FLASHCACHE_DO_CHECKSUMS u_int64_t checksum; #endif } __attribute__((packed)); struct flash_superblock { sector_t size; /* Cache size */ u_int32_t block_size; /* Cache block size */ u_int32_t assoc; /* Cache associativity */ u_int32_t cache_sb_state; /* Clean shutdown ? */ char cache_devname[DEV_PATHLEN]; /* Contains dm_vdev name as of v2 modifications */ sector_t cache_devsize; char disk_devname[DEV_PATHLEN]; /* underlying block device name (use UUID paths!) */ sector_t disk_devsize; u_int32_t cache_version; u_int32_t md_block_size; u_int32_t disk_assoc; u_int32_t write_only_cache; }; /* * We do metadata updates only when a block trasitions from DIRTY -> CLEAN * or from CLEAN -> DIRTY. Consequently, on an unclean shutdown, we only * pick up blocks that are marked (DIRTY | CLEAN), we clean these and stick * them in the cache. * On a clean shutdown, we will sync the state for every block, and we will * load every block back into cache on a restart. * * Note: When using larger flashcache metadata blocks, it is important to make * sure that a flash_cacheblock does not straddle 2 sectors. This avoids * partial writes of a metadata slot on a powerfail/node crash. Aligning this * a 16b or 32b struct avoids that issue. * * Note: If a on-ssd flash_cacheblock does not fit exactly within a 512b sector, * (ie. if there are any remainder runt bytes), logic in flashcache_conf.c which * reads and writes flashcache metadata on create/load/remove will break. * * If changing these, make sure they remain a ^2 size ! 
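 *
 * The size arithmetic, assuming an 8-byte sector_t: without checksums a
 * flash_cacheblock is 8 (dbn) + 4 (cache_state) = 12 bytes, padded to 16 by
 * aligned(16); with checksums it is 8 + 8 + 4 = 20 bytes, padded to 32 by
 * aligned(32). Either way a 512b sector holds a whole number of slots
 * (32 or 16), so no slot can straddle a sector boundary.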
*/ #ifdef FLASHCACHE_DO_CHECKSUMS struct flash_cacheblock { sector_t dbn; /* Sector number of the cached block */ u_int64_t checksum; u_int32_t cache_state; /* INVALID | VALID | DIRTY */ } __attribute__ ((aligned(32))); #else struct flash_cacheblock { sector_t dbn; /* Sector number of the cached block */ u_int32_t cache_state; /* INVALID | VALID | DIRTY */ } __attribute__ ((aligned(16))); #endif #define MD_BLOCK_BYTES(DMC) ((DMC)->md_block_size * 512) #define MD_SECTORS_PER_BLOCK(DMC) ((DMC)->md_block_size) #define MD_SLOTS_PER_BLOCK(DMC) (MD_BLOCK_BYTES(DMC) / (sizeof(struct flash_cacheblock))) #define INDEX_TO_MD_BLOCK(DMC, INDEX) ((INDEX) / MD_SLOTS_PER_BLOCK(DMC)) #define INDEX_TO_MD_BLOCK_OFFSET(DMC, INDEX) ((INDEX) % MD_SLOTS_PER_BLOCK(DMC)) #define METADATA_IO_BLOCKSIZE (256*1024) #define METADATA_IO_NUM_BLOCKS(dmc) (METADATA_IO_BLOCKSIZE / MD_BLOCK_BYTES(dmc)) #define INDEX_TO_CACHE_ADDR(DMC, INDEX) \ (((sector_t)(INDEX) << (DMC)->block_shift) + (DMC)->md_blocks * MD_SECTORS_PER_BLOCK((DMC))) #define CACHE_ADDR_TO_INDEX(DMC, CACHE_ADDR) \ ((int)(((CACHE_ADDR) - ((DMC)->md_blocks * MD_SECTORS_PER_BLOCK((DMC)))) >> (DMC)->block_shift)) #ifdef __KERNEL__ /* Cache persistence */ #define CACHE_RELOAD 1 #define CACHE_CREATE 2 #define CACHE_FORCECREATE 3 /* * We have one of these for *every* cache metadata sector, to keep track * of metadata ios in progress for blocks covered in this sector. Only * one metadata IO per sector can be in progress at any given point in * time */ struct cache_md_block_head { u_int32_t nr_in_prog; struct kcached_job *queued_updates, *md_io_inprog; spinlock_t md_block_lock; }; #define MIN_JOBS 1024 /* Default values for sysctls */ #define DIRTY_THRESH_MIN 10 #define DIRTY_THRESH_MAX 95 #define DIRTY_THRESH_DEF 20 #define MAX_CLEAN_IOS_SET 2 #define MAX_CLEAN_IOS_TOTAL 4 #define MAX_PIDS 100 #define PID_EXPIRY_SECS 60 #define FALLOW_DELAY (60*15) /* 15 Mins default */ #define FALLOW_SPEED_MIN 1 #define FALLOW_SPEED_MAX 100 #define FALLOW_CLEAN_SPEED 2 #define FLASHCACHE_LRU_HOT_PCT_DEFAULT 50 /* DM async IO mempool sizing */ #define FLASHCACHE_ASYNC_SIZE 1024 enum { FLASHCACHE_WHITELIST=0, FLASHCACHE_BLACKLIST=1, }; struct flashcache_cachectl_pid { pid_t pid; struct flashcache_cachectl_pid *next, *prev; unsigned long expiry; }; struct dbn_index_pair { sector_t dbn; int index; }; /* Error injection flags */ #define READDISK_ERROR 0x00000001 #define READCACHE_ERROR 0x00000002 #define READFILL_ERROR 0x00000004 #define WRITECACHE_ERROR 0x00000008 #define WRITECACHE_MD_ERROR 0x00000010 #define WRITEDISK_MD_ERROR 0x00000020 #define KCOPYD_CALLBACK_ERROR 0x00000040 #define DIRTY_WRITEBACK_JOB_ALLOC_FAIL 0x00000080 #define READ_MISS_JOB_ALLOC_FAIL 0x00000100 #define READ_HIT_JOB_ALLOC_FAIL 0x00000200 #define READ_HIT_PENDING_JOB_ALLOC_FAIL 0x00000400 #define INVAL_PENDING_JOB_ALLOC_FAIL 0x00000800 #define WRITE_HIT_JOB_ALLOC_FAIL 0x00001000 #define WRITE_HIT_PENDING_JOB_ALLOC_FAIL 0x00002000 #define WRITE_MISS_JOB_ALLOC_FAIL 0x00004000 #define WRITES_LIST_ALLOC_FAIL 0x00008000 #define MD_ALLOC_SECTOR_ERROR 0x00010000 /* Inject a 5s delay between syncing blocks and metadata */ #define FLASHCACHE_SYNC_REMOVE_DELAY 5000 #if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) int flashcache_map(struct dm_target *ti, struct bio *bio, union map_info *map_context); #else int flashcache_map(struct dm_target *ti, struct bio *bio); #endif int flashcache_ctr(struct dm_target *ti, unsigned int argc, char **argv); void flashcache_dtr(struct dm_target *ti); struct kcached_job 
*flashcache_alloc_cache_job(void); void flashcache_free_cache_job(struct kcached_job *job); struct pending_job *flashcache_alloc_pending_job(struct cache_c *dmc); void flashcache_free_pending_job(struct pending_job *job); #ifdef FLASHCACHE_DO_CHECKSUMS u_int64_t flashcache_compute_checksum(struct bio *bio); void flashcache_store_checksum(struct kcached_job *job); int flashcache_validate_checksum(struct kcached_job *job); int flashcache_read_compute_checksum(struct cache_c *dmc, int index, void *block); #endif struct kcached_job *pop(struct list_head *jobs); void push(struct list_head *jobs, struct kcached_job *job); #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) void do_work(void *unused); #else void do_work(struct work_struct *unused); #endif struct kcached_job *new_kcached_job(struct cache_c *dmc, struct bio* bio, int index); void push_pending(struct kcached_job *job); void push_io(struct kcached_job *job); void push_md_io(struct kcached_job *job); void push_md_complete(struct kcached_job *job); void push_uncached_io_complete(struct kcached_job *job); int flashcache_pending_empty(void); int flashcache_io_empty(void); int flashcache_md_io_empty(void); int flashcache_md_complete_empty(void); void flashcache_md_write_done(struct kcached_job *job); void flashcache_do_pending(struct kcached_job *job); void flashcache_free_pending_jobs(struct cache_c *dmc, struct cacheblock *cacheblk, int error); void flashcache_md_write(struct kcached_job *job); void flashcache_md_write_kickoff(struct kcached_job *job); void flashcache_do_io(struct kcached_job *job); void flashcache_uncached_io_complete(struct kcached_job *job); void flashcache_clean_set(struct cache_c *dmc, int set, int force_clean_blocks); void flashcache_sync_all(struct cache_c *dmc); void flashcache_reclaim_fifo_get_old_block(struct cache_c *dmc, int start_index, int *index); void flashcache_reclaim_lru_get_old_block(struct cache_c *dmc, int start_index, int *index); void flashcache_reclaim_init_lru_lists(struct cache_c *dmc); void flashcache_lru_accessed(struct cache_c *dmc, int index); void flashcache_reclaim_rebalance_lru(struct cache_c *dmc, int new_lru_hot_pct); void flashcache_merge_writes(struct cache_c *dmc, struct dbn_index_pair *writes_list, struct dbn_index_pair *set_dirty_list, int *nr_writes, int set); #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) int flashcache_dm_io_sync_vm(struct cache_c *dmc, struct io_region *where, int rw, void *data); #else int flashcache_dm_io_sync_vm(struct cache_c *dmc, struct dm_io_region *where, int rw, void *data); #endif void flashcache_update_sync_progress(struct cache_c *dmc); void flashcache_enq_pending(struct cache_c *dmc, struct bio* bio, int index, int action, struct pending_job *job); struct pending_job *flashcache_deq_pending(struct cache_c *dmc, int index); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) int dm_io_async_bvec(unsigned int num_regions, #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) struct dm_io_region *where, #else struct io_region *where, #endif int rw, struct bio *bio, io_notify_fn fn, void *context); #endif void flashcache_detect_fallow(struct cache_c *dmc, int index); void flashcache_clear_fallow(struct cache_c *dmc, int index); void flashcache_bio_endio(struct bio *bio, int error, struct cache_c *dmc, struct timeval *io_start_time); /* procfs */ void flashcache_module_procfs_init(void); void flashcache_module_procfs_release(void); void flashcache_ctr_procfs(struct cache_c *dmc); void flashcache_dtr_procfs(struct cache_c *dmc); void flashcache_hash_init(struct 
cache_c *dmc); void flashcache_hash_destroy(struct cache_c *dmc); void flashcache_hash_remove(struct cache_c *dmc, int index); int flashcache_hash_lookup(struct cache_c *dmc, int set, sector_t dbn); void flashcache_hash_insert(struct cache_c *dmc, int index); void flashcache_invalid_insert(struct cache_c *dmc, int index); void flashcache_invalid_remove(struct cache_c *dmc, int index); int flashcache_invalid_get(struct cache_c *dmc, int set); int flashcache_diskclean_init(struct cache_c *dmc); void flashcache_diskclean_destroy(struct cache_c *dmc); int flashcache_diskclean_alloc(struct cache_c *dmc, struct dbn_index_pair **buf1, struct dbn_index_pair **buf2); void flashcache_diskclean_free(struct cache_c *dmc, struct dbn_index_pair *buf1, struct dbn_index_pair *buf2); unsigned long hash_block(struct cache_c *dmc, sector_t dbn); void flashcache_copy_data(struct cache_c *dmc, struct cache_set *cache_set, int nr_writes, struct dbn_index_pair *writes_list); void push_cleaning_read_complete(struct flashcache_copy_job *job); void push_cleaning_write_complete(struct flashcache_copy_job *job); void flashcache_clean_write_kickoff(struct flashcache_copy_job *job); void flashcache_clean_md_write_kickoff(struct flashcache_copy_job *job); int flashcache_kcopy_init(struct cache_c *dmc); void flashcache_kcopy_destroy(struct cache_c *dmc); #endif /* __KERNEL__ */ #endif flashcache-3.1.3+git20150701/src/flashcache.hook000077500000000000000000000005311254507146700207450ustar00rootroot00000000000000#!/bin/sh -e # mkinitramfs hook for flashcache PREREQ="mdadm" prereqs () { echo "$PREREQ" } case $1 in prereqs) prereqs exit 0 ;; esac . /usr/share/initramfs-tools/hook-functions manual_add_modules flashcache copy_exec /sbin/flashcache_load /sbin copy_exec /sbin/flashcache_create /sbin copy_exec /sbin/flashcache_destroy /sbin exit 0 flashcache-3.1.3+git20150701/src/flashcache_conf.c000066400000000000000000002001401254507146700212270ustar00rootroot00000000000000/**************************************************************************** * flashcache_conf.c * FlashCache: Device mapper target for block-level disk caching * * Copyright 2010 Facebook, Inc. * Author: Mohan Srinivasan (mohan@fb.com) * * Based on DM-Cache: * Copyright (C) International Business Machines Corp., 2006 * Author: Ming Zhao (mingzhao@ufl.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
****************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) #include "dm.h" #include "dm-io.h" #include "dm-bio-list.h" #include "kcopyd.h" #else #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) #include "dm.h" #endif #include #include #include #endif #include "flashcache.h" #include "flashcache_ioctl.h" struct cache_c *cache_list_head = NULL; struct work_struct _kcached_wq; u_int64_t size_hist[33]; struct kmem_cache *_job_cache; mempool_t *_job_pool; struct kmem_cache *_pending_job_cache; mempool_t *_pending_job_pool; atomic_t nr_cache_jobs; atomic_t nr_pending_jobs; extern struct list_head *_pending_jobs; extern struct list_head *_io_jobs; extern struct list_head *_md_io_jobs; extern struct list_head *_md_complete_jobs; struct flashcache_control_s { unsigned long synch_flags; }; struct flashcache_control_s *flashcache_control; /* Bit offsets for wait_on_bit_lock() */ #define FLASHCACHE_UPDATE_LIST 0 static int flashcache_notify_reboot(struct notifier_block *this, unsigned long code, void *x); static void flashcache_sync_for_remove(struct cache_c *dmc); extern char *flashcache_sw_version; #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) static int flashcache_wait_schedule(void *unused) { schedule(); return 0; } #endif static int flashcache_jobs_init(void) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) _job_cache = kmem_cache_create("kcached-jobs", sizeof(struct kcached_job), __alignof__(struct kcached_job), 0, NULL, NULL); #else _job_cache = kmem_cache_create("kcached-jobs", sizeof(struct kcached_job), __alignof__(struct kcached_job), 0, NULL); #endif if (!_job_cache) return -ENOMEM; _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, mempool_free_slab, _job_cache); if (!_job_pool) { kmem_cache_destroy(_job_cache); return -ENOMEM; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) _pending_job_cache = kmem_cache_create("pending-jobs", sizeof(struct pending_job), __alignof__(struct pending_job), 0, NULL, NULL); #else _pending_job_cache = kmem_cache_create("pending-jobs", sizeof(struct pending_job), __alignof__(struct pending_job), 0, NULL); #endif if (!_pending_job_cache) { mempool_destroy(_job_pool); kmem_cache_destroy(_job_cache); return -ENOMEM; } _pending_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, mempool_free_slab, _pending_job_cache); if (!_pending_job_pool) { kmem_cache_destroy(_pending_job_cache); mempool_destroy(_job_pool); kmem_cache_destroy(_job_cache); return -ENOMEM; } return 0; } static void flashcache_jobs_exit(void) { VERIFY(flashcache_pending_empty()); VERIFY(flashcache_io_empty()); VERIFY(flashcache_md_io_empty()); VERIFY(flashcache_md_complete_empty()); mempool_destroy(_job_pool); kmem_cache_destroy(_job_cache); _job_pool = NULL; _job_cache = NULL; mempool_destroy(_pending_job_pool); kmem_cache_destroy(_pending_job_cache); _pending_job_pool = NULL; _pending_job_cache = NULL; } static int flashcache_kcached_init(struct cache_c *dmc) { init_waitqueue_head(&dmc->destroyq); atomic_set(&dmc->nr_jobs, 0); atomic_set(&dmc->remove_in_prog, 0); return 0; } /* * Write out the metadata one sector at a time. * Then dump out the superblock. 
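 *
 * ("One sector at a time" undersells the loop below: slots are packed into
 * METADATA_IO_BLOCKSIZE (256 KB) buffers and written in those units. The
 * superblock goes out last, marked CACHE_MD_STATE_CLEAN when num_dirty is
 * zero and CACHE_MD_STATE_FASTCLEAN otherwise, the same states the load
 * path checks on the next attach.)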
*/ static int flashcache_writeback_md_store(struct cache_c *dmc) { struct flash_cacheblock *meta_data_cacheblock, *next_ptr; struct flash_superblock *header; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) struct io_region where; #else struct dm_io_region where; #endif int i, j; int num_valid = 0, num_dirty = 0; int error; int write_errors = 0; int sectors_written = 0, sectors_expected = 0; /* debug */ int slots_written = 0; /* How many cache slots did we fill in this MD io block ? */ meta_data_cacheblock = (struct flash_cacheblock *)vmalloc(METADATA_IO_BLOCKSIZE); if (!meta_data_cacheblock) { DMERR("flashcache_writeback_md_store: Unable to allocate memory"); DMERR("flashcache_writeback_md_store: Could not write out cache metadata !"); return 1; } where.bdev = dmc->cache_dev->bdev; where.sector = MD_SECTORS_PER_BLOCK(dmc); slots_written = 0; next_ptr = meta_data_cacheblock; j = MD_SLOTS_PER_BLOCK(dmc); for (i = 0 ; i < dmc->size ; i++) { if (dmc->cache[i].cache_state & VALID) num_valid++; if (dmc->cache[i].cache_state & DIRTY) num_dirty++; next_ptr->dbn = dmc->cache[i].dbn; #ifdef FLASHCACHE_DO_CHECKSUMS next_ptr->checksum = dmc->cache[i].checksum; #endif next_ptr->cache_state = dmc->cache[i].cache_state & (INVALID | VALID | DIRTY); next_ptr++; slots_written++; j--; if (j == 0) { /* * Filled the block, write and goto the next metadata block. */ if (slots_written == MD_SLOTS_PER_BLOCK(dmc) * METADATA_IO_NUM_BLOCKS(dmc)) { /* * Wrote out an entire metadata IO block, write the block to the ssd. */ where.count = (slots_written / MD_SLOTS_PER_BLOCK(dmc)) * MD_SECTORS_PER_BLOCK(dmc); slots_written = 0; sectors_written += where.count; /* debug */ error = flashcache_dm_io_sync_vm(dmc, &where, WRITE, meta_data_cacheblock); if (error) { write_errors++; DMERR("flashcache_writeback_md_store: Could not write out cache metadata block %lu error %d !", where.sector, error); } where.sector += where.count; /* Advance offset */ } /* Move next slot pointer into next block */ next_ptr = (struct flash_cacheblock *) ((caddr_t)meta_data_cacheblock + ((slots_written / MD_SLOTS_PER_BLOCK(dmc)) * MD_BLOCK_BYTES(dmc))); j = MD_SLOTS_PER_BLOCK(dmc); } } if (next_ptr != meta_data_cacheblock) { /* Write the remaining last blocks out */ VERIFY(slots_written > 0); where.count = (slots_written / MD_SLOTS_PER_BLOCK(dmc)) * MD_SECTORS_PER_BLOCK(dmc); if (slots_written % MD_SLOTS_PER_BLOCK(dmc)) where.count += MD_SECTORS_PER_BLOCK(dmc); sectors_written += where.count; error = flashcache_dm_io_sync_vm(dmc, &where, WRITE, meta_data_cacheblock); if (error) { write_errors++; DMERR("flashcache_writeback_md_store: Could not write out cache metadata block %lu error %d !", where.sector, error); } } /* Debug Tests */ sectors_expected = (dmc->size / MD_SLOTS_PER_BLOCK(dmc)) * MD_SECTORS_PER_BLOCK(dmc); if (dmc->size % MD_SLOTS_PER_BLOCK(dmc)) sectors_expected += MD_SECTORS_PER_BLOCK(dmc); if (sectors_expected != sectors_written) { printk("flashcache_writeback_md_store" "Sector Mismatch ! 
sectors_expected=%d, sectors_written=%d\n", sectors_expected, sectors_written); panic("flashcache_writeback_md_store: sector mismatch\n"); } vfree((void *)meta_data_cacheblock); header = (struct flash_superblock *)vmalloc(MD_BLOCK_BYTES(dmc)); if (!header) { DMERR("flashcache_writeback_md_store: Unable to allocate memory"); DMERR("flashcache_writeback_md_store: Could not write out cache metadata !"); return 1; } memset(header, 0, MD_BLOCK_BYTES(dmc)); /* Write the header out last */ if (write_errors == 0) { if (num_dirty == 0) header->cache_sb_state = CACHE_MD_STATE_CLEAN; else header->cache_sb_state = CACHE_MD_STATE_FASTCLEAN; } else header->cache_sb_state = CACHE_MD_STATE_UNSTABLE; header->block_size = dmc->block_size; header->md_block_size = dmc->md_block_size; header->size = dmc->size; header->assoc = dmc->assoc; header->disk_assoc = dmc->disk_assoc; strncpy(header->disk_devname, dmc->disk_devname, DEV_PATHLEN); strncpy(header->cache_devname, dmc->dm_vdevname, DEV_PATHLEN); header->cache_devsize = to_sector(dmc->cache_dev->bdev->bd_inode->i_size); header->disk_devsize = to_sector(dmc->disk_dev->bdev->bd_inode->i_size); header->cache_version = dmc->on_ssd_version; header->write_only_cache = dmc->write_only_cache; DPRINTK("Store metadata to disk: block size(%u), md block size(%u), cache size(%llu)" \ "associativity(%u)", header->block_size, header->md_block_size, header->size, header->assoc); where.sector = 0; where.count = dmc->md_block_size; error = flashcache_dm_io_sync_vm(dmc, &where, WRITE, header); if (error) { write_errors++; DMERR("flashcache_writeback_md_store: Could not write out cache metadata superblock %lu error %d !", where.sector, error); } vfree((void *)header); if (write_errors == 0) DMINFO("Cache metadata saved to disk"); else { DMINFO("CRITICAL : There were %d errors in saving cache metadata saved to disk", write_errors); if (num_dirty) DMINFO("CRITICAL : You have likely lost %d dirty blocks", num_dirty); } DMINFO("flashcache_writeback_md_store: valid blocks = %d dirty blocks = %d md_sectors = %d\n", num_valid, num_dirty, dmc->md_blocks * MD_SECTORS_PER_BLOCK(dmc)); return 0; } static int flashcache_writethrough_create(struct cache_c *dmc) { sector_t cache_size, dev_size; sector_t order; int i; /* * Convert size (in sectors) to blocks. 
* Then round size (in blocks now) down to a multiple of associativity */ dmc->size /= dmc->block_size; dmc->size = (dmc->size / dmc->assoc) * dmc->assoc; /* Check cache size against device size */ dev_size = to_sector(dmc->cache_dev->bdev->bd_inode->i_size); cache_size = dmc->size * dmc->block_size; if (cache_size > dev_size) { DMERR("Requested cache size exeeds the cache device's capacity" \ "(%lu>%lu)", cache_size, dev_size); return 1; } order = dmc->size * sizeof(struct cacheblock); DMINFO("Allocate %luKB (%luB per) mem for %lu-entry cache" \ "(capacity:%luMB, associativity:%u, block size:%u " \ "sectors(%uKB))", order >> 10, sizeof(struct cacheblock), dmc->size, cache_size >> (20-SECTOR_SHIFT), dmc->assoc, dmc->block_size, dmc->block_size >> (10-SECTOR_SHIFT)); dmc->cache = (struct cacheblock *)vmalloc(order); if (!dmc->cache) { DMERR("flashcache_writethrough_create: Unable to allocate cache md"); return 1; } memset(dmc->cache, 0, order); /* Initialize the cache structs */ for (i = 0; i < dmc->size ; i++) { dmc->cache[i].dbn = 0; #ifdef FLASHCACHE_DO_CHECKSUMS dmc->cache[i].checksum = 0; #endif dmc->cache[i].cache_state = INVALID; dmc->cache[i].lru_state = 0; dmc->cache[i].nr_queued = 0; } dmc->md_blocks = 0; return 0; } static int flashcache_writeback_create(struct cache_c *dmc, int force) { struct flash_cacheblock *meta_data_cacheblock, *next_ptr; struct flash_superblock *header; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) struct io_region where; #else struct dm_io_region where; #endif int i, j, error; sector_t cache_size, dev_size; sector_t order; int sectors_written = 0, sectors_expected = 0; /* debug */ int slots_written = 0; /* How many cache slots did we fill in this MD io block ? */ header = (struct flash_superblock *)vmalloc(MD_BLOCK_BYTES(dmc)); if (!header) { DMERR("flashcache_writeback_create: Unable to allocate sector"); return 1; } where.bdev = dmc->cache_dev->bdev; where.sector = 0; where.count = dmc->md_block_size; error = flashcache_dm_io_sync_vm(dmc, &where, READ, header); if (error) { vfree((void *)header); DMERR("flashcache_writeback_create: Could not read cache superblock %lu error %d !", where.sector, error); return 1; } if (!force && ((header->cache_sb_state == CACHE_MD_STATE_DIRTY) || (header->cache_sb_state == CACHE_MD_STATE_CLEAN) || (header->cache_sb_state == CACHE_MD_STATE_FASTCLEAN))) { vfree((void *)header); DMERR("flashcache_writeback_create: Existing Cache Detected, use force to re-create"); return 1; } /* Compute the size of the metadata, including header. 
Note dmc->size is in raw sectors */ dmc->md_blocks = INDEX_TO_MD_BLOCK(dmc, dmc->size / dmc->block_size) + 1 + 1; dmc->size -= dmc->md_blocks * MD_SECTORS_PER_BLOCK(dmc); /* total sectors available for cache */ dmc->size /= dmc->block_size; dmc->size = (dmc->size / dmc->assoc) * dmc->assoc; /* Recompute since dmc->size was possibly trunc'ed down */ dmc->md_blocks = INDEX_TO_MD_BLOCK(dmc, dmc->size) + 1 + 1; DMINFO("flashcache_writeback_create: md_blocks = %d, md_sectors = %d\n", dmc->md_blocks, dmc->md_blocks * MD_SECTORS_PER_BLOCK(dmc)); dev_size = to_sector(dmc->cache_dev->bdev->bd_inode->i_size); cache_size = dmc->md_blocks * MD_SECTORS_PER_BLOCK(dmc) + (dmc->size * dmc->block_size); if (cache_size > dev_size) { DMERR("Requested cache size exceeds the cache device's capacity" \ "(%lu>%lu)", cache_size, dev_size); vfree((void *)header); return 1; } order = dmc->size * sizeof(struct cacheblock); DMINFO("Allocate %luKB (%luB per) mem for %lu-entry cache" \ "(capacity:%luMB, associativity:%u, block size:%u " \ "sectors(%uKB))", order >> 10, sizeof(struct cacheblock), dmc->size, cache_size >> (20-SECTOR_SHIFT), dmc->assoc, dmc->block_size, dmc->block_size >> (10-SECTOR_SHIFT)); dmc->cache = (struct cacheblock *)vmalloc(order); if (!dmc->cache) { vfree((void *)header); DMERR("flashcache_writeback_create: Unable to allocate cache md"); return 1; } memset(dmc->cache, 0, order); /* Initialize the cache structs */ for (i = 0; i < dmc->size ; i++) { dmc->cache[i].dbn = 0; #ifdef FLASHCACHE_DO_CHECKSUMS dmc->cache[i].checksum = 0; #endif dmc->cache[i].cache_state = INVALID; dmc->cache[i].lru_state = 0; dmc->cache[i].nr_queued = 0; } meta_data_cacheblock = (struct flash_cacheblock *)vmalloc(METADATA_IO_BLOCKSIZE); if (!meta_data_cacheblock) { DMERR("flashcache_writeback_create: Unable to allocate memory"); DMERR("flashcache_writeback_create: Could not write out cache metadata !"); return 1; } where.sector = MD_SECTORS_PER_BLOCK(dmc); slots_written = 0; next_ptr = meta_data_cacheblock; j = MD_SLOTS_PER_BLOCK(dmc); for (i = 0 ; i < dmc->size ; i++) { next_ptr->dbn = dmc->cache[i].dbn; #ifdef FLASHCACHE_DO_CHECKSUMS next_ptr->checksum = dmc->cache[i].checksum; #endif next_ptr->cache_state = dmc->cache[i].cache_state & (INVALID | VALID | DIRTY); next_ptr++; slots_written++; j--; if (j == 0) { /* * Filled the block, write and goto the next metadata block. */ if (slots_written == MD_SLOTS_PER_BLOCK(dmc) * METADATA_IO_NUM_BLOCKS(dmc)) { /* * Wrote out an entire metadata IO block, write the block to the ssd. 
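 * (With the 4 KB default metadata block and a 16-byte flash_cacheblock,
 * MD_SLOTS_PER_BLOCK is 256 and METADATA_IO_NUM_BLOCKS is 64, so each such
 * write pushes 64 metadata blocks, i.e. 16384 cache slots, to the SSD in
 * one 256 KB IO.)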
*/ where.count = (slots_written / MD_SLOTS_PER_BLOCK(dmc)) * MD_SECTORS_PER_BLOCK(dmc); slots_written = 0; sectors_written += where.count; /* debug */ error = flashcache_dm_io_sync_vm(dmc, &where, WRITE, meta_data_cacheblock); if (error) { vfree((void *)header); vfree((void *)meta_data_cacheblock); vfree(dmc->cache); DMERR("flashcache_writeback_create: Could not write cache metadata block %lu error %d !", where.sector, error); return 1; } where.sector += where.count; /* Advance offset */ } /* Move next slot pointer into next metadata block */ next_ptr = (struct flash_cacheblock *) ((caddr_t)meta_data_cacheblock + ((slots_written / MD_SLOTS_PER_BLOCK(dmc)) * MD_BLOCK_BYTES(dmc))); j = MD_SLOTS_PER_BLOCK(dmc); } } if (next_ptr != meta_data_cacheblock) { /* Write the remaining last blocks out */ VERIFY(slots_written > 0); where.count = (slots_written / MD_SLOTS_PER_BLOCK(dmc)) * MD_SECTORS_PER_BLOCK(dmc); if (slots_written % MD_SLOTS_PER_BLOCK(dmc)) where.count += MD_SECTORS_PER_BLOCK(dmc); sectors_written += where.count; error = flashcache_dm_io_sync_vm(dmc, &where, WRITE, meta_data_cacheblock); if (error) { vfree((void *)header); vfree((void *)meta_data_cacheblock); vfree(dmc->cache); DMERR("flashcache_writeback_create: Could not write cache metadata block %lu error %d !", where.sector, error); return 1; } } /* Debug Tests */ sectors_expected = (dmc->size / MD_SLOTS_PER_BLOCK(dmc)) * MD_SECTORS_PER_BLOCK(dmc); if (dmc->size % MD_SLOTS_PER_BLOCK(dmc)) sectors_expected += MD_SECTORS_PER_BLOCK(dmc); if (sectors_expected != sectors_written) { printk("flashcache_writeback_create" "Sector Mismatch ! sectors_expected=%d, sectors_written=%d\n", sectors_expected, sectors_written); panic("flashcache_writeback_create: sector mismatch\n"); } vfree((void *)meta_data_cacheblock); /* Write the header */ header->cache_sb_state = CACHE_MD_STATE_DIRTY; header->block_size = dmc->block_size; header->md_block_size = dmc->md_block_size; header->size = dmc->size; header->assoc = dmc->assoc; header->disk_assoc = dmc->disk_assoc; strncpy(header->disk_devname, dmc->disk_devname, DEV_PATHLEN); strncpy(header->cache_devname, dmc->dm_vdevname, DEV_PATHLEN); header->cache_devsize = to_sector(dmc->cache_dev->bdev->bd_inode->i_size); header->disk_devsize = to_sector(dmc->disk_dev->bdev->bd_inode->i_size); dmc->on_ssd_version = header->cache_version = FLASHCACHE_VERSION; header->write_only_cache = dmc->write_only_cache; where.sector = 0; where.count = dmc->md_block_size; printk("flashcache-dbg: cachedev check - %s %s", header->cache_devname, dmc->dm_vdevname); error = flashcache_dm_io_sync_vm(dmc, &where, WRITE, header); if (error) { vfree((void *)header); vfree(dmc->cache); DMERR("flashcache_writeback_create: Could not write cache superblock %lu error %d !", where.sector, error); return 1; } vfree((void *)header); return 0; } static int flashcache_writeback_load(struct cache_c *dmc) { struct flash_cacheblock *meta_data_cacheblock, *next_ptr; struct flash_superblock *header; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) struct io_region where; #else struct dm_io_region where; #endif int i, j; u_int64_t size, slots_read; int clean_shutdown; int dirty_loaded = 0; sector_t order, data_size; int num_valid = 0; int error; int sectors_read = 0, sectors_expected = 0; /* Debug */ /* * We don't know what the preferred block size is, just read off * the default md blocksize. 
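 * (Concretely: the superblock is always read as DEFAULT_MD_BLOCK_SIZE (8)
 * sectors, and the md_block_size recorded in it, forced to 1 just below
 * for v1 caches, then governs all further metadata reads.)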
*/ header = (struct flash_superblock *)vmalloc(DEFAULT_MD_BLOCK_SIZE_BYTES); if (!header) { DMERR("flashcache_writeback_load: Unable to allocate memory"); return 1; } where.bdev = dmc->cache_dev->bdev; where.sector = 0; where.count = DEFAULT_MD_BLOCK_SIZE; error = flashcache_dm_io_sync_vm(dmc, &where, READ, header); if (error) { vfree((void *)header); DMERR("flashcache_writeback_load: Could not read cache superblock %lu error %d!", where.sector, error); return 1; } if (header->cache_version == 1) { /* Backwards compatibility, md was 512 bytes always in V1.0 */ header->md_block_size = 1; } else if (header->cache_version > FLASHCACHE_VERSION) { vfree((void *)header); DMERR("flashcache_writeback_load: Unknown version %d found in superblock!", header->cache_version); return 1; } dmc->disk_assoc = header->disk_assoc; dmc->write_only_cache = header->write_only_cache; if (header->cache_version < 3) /* Disk Assoc was introduced in On SSD version 3 */ dmc->disk_assoc = 0; if (dmc->disk_assoc != 0) dmc->disk_assoc_shift = ffs(dmc->disk_assoc) - 1; if (header->cache_version < 4) /* write_only_cache was introduced in On SSD version 4 */ dmc->write_only_cache = 0; dmc->on_ssd_version = header->cache_version; DPRINTK("Loaded cache conf: version(%d), block size(%u), md block size(%u), cache size(%llu), " \ "associativity(%u)", header->cache_version, header->block_size, header->md_block_size, header->size, header->assoc); if (!((header->cache_sb_state == CACHE_MD_STATE_DIRTY) || (header->cache_sb_state == CACHE_MD_STATE_CLEAN) || (header->cache_sb_state == CACHE_MD_STATE_FASTCLEAN))) { vfree((void *)header); DMERR("flashcache_writeback_load: Corrupt Cache Superblock"); return 1; } if (header->cache_sb_state == CACHE_MD_STATE_DIRTY) { DMINFO("Unclean Shutdown Detected"); printk(KERN_ALERT "Only DIRTY blocks exist in cache"); clean_shutdown = 0; } else if (header->cache_sb_state == CACHE_MD_STATE_CLEAN) { DMINFO("Slow (clean) Shutdown Detected"); printk(KERN_ALERT "Only CLEAN blocks exist in cache"); clean_shutdown = 1; } else { DMINFO("Fast (clean) Shutdown Detected"); printk(KERN_ALERT "Both CLEAN and DIRTY blocks exist in cache"); clean_shutdown = 1; } dmc->block_size = header->block_size; dmc->md_block_size = header->md_block_size; dmc->block_shift = ffs(dmc->block_size) - 1; dmc->block_mask = dmc->block_size - 1; dmc->size = header->size; dmc->assoc = header->assoc; dmc->assoc_shift = ffs(dmc->assoc) - 1; dmc->md_blocks = INDEX_TO_MD_BLOCK(dmc, dmc->size) + 1 + 1; DMINFO("flashcache_writeback_load: md_blocks = %d, md_sectors = %d, md_block_size = %d\n", dmc->md_blocks, dmc->md_blocks * MD_SECTORS_PER_BLOCK(dmc), dmc->md_block_size); data_size = dmc->size * dmc->block_size; order = dmc->size * sizeof(struct cacheblock); DMINFO("Allocate %luKB (%ldB per) mem for %lu-entry cache" \ "(capacity:%luMB, associativity:%u, block size:%u " \ "sectors(%uKB))", order >> 10, sizeof(struct cacheblock), dmc->size, (dmc->md_blocks * MD_SECTORS_PER_BLOCK(dmc) + data_size) >> (20-SECTOR_SHIFT), dmc->assoc, dmc->block_size, dmc->block_size >> (10-SECTOR_SHIFT)); dmc->cache = (struct cacheblock *)vmalloc(order); if (!dmc->cache) { DMERR("load_metadata: Unable to allocate memory"); vfree((void *)header); return 1; } memset(dmc->cache, 0, order); /* Read the metadata in large blocks and populate incore state */ meta_data_cacheblock = (struct flash_cacheblock *)vmalloc(METADATA_IO_BLOCKSIZE); if (!meta_data_cacheblock) { vfree((void *)header); vfree(dmc->cache); DMERR("flashcache_writeback_load: Unable to allocate memory"); 
return 1; } where.sector = MD_SECTORS_PER_BLOCK(dmc); size = dmc->size; i = 0; while (size > 0) { slots_read = min(size, (u_int64_t)(MD_SLOTS_PER_BLOCK(dmc) * METADATA_IO_NUM_BLOCKS(dmc))); if (slots_read % MD_SLOTS_PER_BLOCK(dmc)) where.count = (1 + (slots_read / MD_SLOTS_PER_BLOCK(dmc))) * MD_SECTORS_PER_BLOCK(dmc); else where.count = (slots_read / MD_SLOTS_PER_BLOCK(dmc)) * MD_SECTORS_PER_BLOCK(dmc); sectors_read += where.count; /* Debug */ error = flashcache_dm_io_sync_vm(dmc, &where, READ, meta_data_cacheblock); if (error) { vfree((void *)header); vfree(dmc->cache); vfree((void *)meta_data_cacheblock); DMERR("flashcache_writeback_load: Could not read cache metadata block %lu error %d !", where.sector, error); return 1; } where.sector += where.count; next_ptr = meta_data_cacheblock; for (j = 0 ; j < slots_read ; j++) { /* * XXX - Now that we force each on-ssd metadata cache slot to be a ^2, where * we are guaranteed that the slots will exactly fit within a sector (and * a metadata block), we can simplify this logic. We don't need this next test. */ if ((j % MD_SLOTS_PER_BLOCK(dmc)) == 0) { /* Move onto next block */ next_ptr = (struct flash_cacheblock *) ((caddr_t)meta_data_cacheblock + MD_BLOCK_BYTES(dmc) * (j / MD_SLOTS_PER_BLOCK(dmc))); } dmc->cache[i].nr_queued = 0; /* * If unclean shutdown, only the DIRTY blocks are loaded. */ if (clean_shutdown || (next_ptr->cache_state & DIRTY)) { if (next_ptr->cache_state & DIRTY) dirty_loaded++; dmc->cache[i].cache_state = next_ptr->cache_state; VERIFY((dmc->cache[i].cache_state & (VALID | INVALID)) != (VALID | INVALID)); if (dmc->cache[i].cache_state & VALID) num_valid++; dmc->cache[i].dbn = next_ptr->dbn; #ifdef FLASHCACHE_DO_CHECKSUMS if (clean_shutdown) dmc->cache[i].checksum = next_ptr->checksum; else { error = flashcache_read_compute_checksum(dmc, i, block); if (error) { vfree((void *)header); vfree(dmc->cache); vfree((void *)meta_data_cacheblock); DMERR("flashcache_writeback_load: Could not read cache metadata block %lu error %d !", dmc->cache[i].dbn, error); return 1; } } #endif } else { dmc->cache[i].cache_state = INVALID; dmc->cache[i].dbn = 0; #ifdef FLASHCACHE_DO_CHECKSUMS dmc->cache[i].checksum = 0; #endif } next_ptr++; i++; } size -= slots_read; } /* Debug Tests */ sectors_expected = (dmc->size / MD_SLOTS_PER_BLOCK(dmc)) * MD_SECTORS_PER_BLOCK(dmc); if (dmc->size % MD_SLOTS_PER_BLOCK(dmc)) sectors_expected += MD_SECTORS_PER_BLOCK(dmc); if (sectors_expected != sectors_read) { printk("flashcache_writeback_load" "Sector Mismatch ! sectors_expected=%d, sectors_read=%d\n", sectors_expected, sectors_read); panic("flashcache_writeback_load: sector mismatch\n"); } vfree((void *)meta_data_cacheblock); /* * For writing the superblock out, use the preferred blocksize that * we read from the superblock above. 
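 * (The rewrite below stamps the superblock CACHE_MD_STATE_DIRTY before the
 * cache goes live, so a crash from this point on reads as an unclean
 * shutdown on the next load, where only the DIRTY blocks are trusted.)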
*/ if (DEFAULT_MD_BLOCK_SIZE != dmc->md_block_size) { vfree((void *)header); header = (struct flash_superblock *)vmalloc(MD_BLOCK_BYTES(dmc)); if (!header) { DMERR("flashcache_writeback_load: Unable to allocate memory"); return 1; } } /* Before we finish loading, we need to dirty the superblock and write it out */ header->size = dmc->size; header->block_size = dmc->block_size; header->md_block_size = dmc->md_block_size; header->assoc = dmc->assoc; header->disk_assoc = dmc->disk_assoc; header->cache_sb_state = CACHE_MD_STATE_DIRTY; strncpy(header->disk_devname, dmc->disk_devname, DEV_PATHLEN); strncpy(header->cache_devname, dmc->dm_vdevname, DEV_PATHLEN); header->cache_devsize = to_sector(dmc->cache_dev->bdev->bd_inode->i_size); header->disk_devsize = to_sector(dmc->disk_dev->bdev->bd_inode->i_size); header->cache_version = dmc->on_ssd_version; where.sector = 0; where.count = dmc->md_block_size; error = flashcache_dm_io_sync_vm(dmc, &where, WRITE, header); if (error) { vfree((void *)header); vfree(dmc->cache); DMERR("flashcache_writeback_load: Could not write cache superblock %lu error %d !", where.sector, error); return 1; } vfree((void *)header); DMINFO("flashcache_writeback_load: Cache metadata loaded from disk with %d valid %d DIRTY blocks", num_valid, dirty_loaded); return 0; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) static void flashcache_clean_all_sets(void *data) { struct cache_c *dmc = (struct cache_c *)data; #else static void flashcache_clean_all_sets(struct work_struct *work) { struct cache_c *dmc = container_of(work, struct cache_c, delayed_clean.work); #endif int i; for (i = 0 ; i < dmc->num_sets ; i++) flashcache_clean_set(dmc, i, 0); } static int inline flashcache_get_dev(struct dm_target *ti, char *pth, struct dm_dev **dmd, char *dmc_dname, sector_t tilen) { int rc; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34) rc = dm_get_device(ti, pth, dm_table_get_mode(ti->table), dmd); #else #if defined(RHEL_MAJOR) && RHEL_MAJOR == 6 rc = dm_get_device(ti, pth, dm_table_get_mode(ti->table), dmd); #else rc = dm_get_device(ti, pth, 0, tilen, dm_table_get_mode(ti->table), dmd); #endif #endif if (!rc) strncpy(dmc_dname, pth, DEV_PATHLEN); return rc; } /* * Construct a cache mapping. * arg[0]: path to source device * arg[1]: path to cache device * arg[2]: md virtual device name * arg[3]: cache mode (from flashcache.h) * arg[4]: cache persistence (if set, cache conf is loaded from disk) * Cache configuration parameters (if not set, default values are used. 
* arg[5]: cache block size (in sectors) * arg[6]: cache size (in blocks) * arg[7]: cache associativity * arg[8]: md block size (in sectors) */ int flashcache_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct cache_c *dmc; sector_t i, order; int r = -EINVAL; int persistence = 0; if (argc < 3) { ti->error = "flashcache: Need at least 3 arguments"; goto bad; } dmc = kzalloc(sizeof(*dmc), GFP_KERNEL); if (dmc == NULL) { ti->error = "flashcache: Failed to allocate cache context"; r = ENOMEM; goto bad; } dmc->tgt = ti; if ((r = flashcache_get_dev(ti, argv[0], &dmc->disk_dev, dmc->disk_devname, ti->len))) { if (r == -EBUSY) ti->error = "flashcache: Disk device is busy, cannot create cache"; else ti->error = "flashcache: Disk device lookup failed"; goto bad1; } if ((r = flashcache_get_dev(ti, argv[1], &dmc->cache_dev, dmc->cache_devname, 0))) { if (r == -EBUSY) ti->error = "flashcache: Cache device is busy, cannot create cache"; else ti->error = "flashcache: Cache device lookup failed"; goto bad2; } if (sscanf(argv[2], "%s", (char *)&dmc->dm_vdevname) != 1) { ti->error = "flashcache: Virtual device name lookup failed"; goto bad3; } r = flashcache_kcached_init(dmc); if (r) { ti->error = "Failed to initialize kcached"; goto bad3; } if (sscanf(argv[3], "%u", &dmc->cache_mode) != 1) { ti->error = "flashcache: sscanf failed, invalid cache mode"; r = -EINVAL; goto bad3; } if (dmc->cache_mode < FLASHCACHE_WRITE_BACK || dmc->cache_mode > FLASHCACHE_WRITE_AROUND) { DMERR("cache_mode = %d", dmc->cache_mode); ti->error = "flashcache: Invalid cache mode"; r = -EINVAL; goto bad3; } /* * XXX - Persistence is totally ignored for write through and write around. * Maybe this should really be moved to the end of the param list ? */ if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { if (argc >= 5) { if (sscanf(argv[4], "%u", &persistence) != 1) { ti->error = "flashcache: sscanf failed, invalid cache persistence"; r = -EINVAL; goto bad3; } if (persistence < CACHE_RELOAD || persistence > CACHE_FORCECREATE) { DMERR("persistence = %d", persistence); ti->error = "flashcache: Invalid cache persistence"; r = -EINVAL; goto bad3; } } if (persistence == CACHE_RELOAD) { if (flashcache_writeback_load(dmc)) { ti->error = "flashcache: Cache reload failed"; r = -EINVAL; goto bad3; } goto init; /* Skip reading cache parameters from command line */ } } else persistence = CACHE_CREATE; if (argc >= 6) { if (sscanf(argv[5], "%u", &dmc->block_size) != 1) { ti->error = "flashcache: Invalid block size"; r = -EINVAL; goto bad3; } if (!dmc->block_size || (dmc->block_size & (dmc->block_size - 1))) { ti->error = "flashcache: Invalid block size"; r = -EINVAL; goto bad3; } } if (!dmc->block_size) dmc->block_size = DEFAULT_BLOCK_SIZE; dmc->block_shift = ffs(dmc->block_size) - 1; dmc->block_mask = dmc->block_size - 1; /* dmc->size is specified in sectors here, and converted to blocks later */ if (argc >= 7) { if (sscanf(argv[6], "%lu", &dmc->size) != 1) { ti->error = "flashcache: Invalid cache size"; r = -EINVAL; goto bad3; } } if (!dmc->size) dmc->size = to_sector(dmc->cache_dev->bdev->bd_inode->i_size); if (argc >= 8) { if (sscanf(argv[7], "%u", &dmc->assoc) != 1) { ti->error = "flashcache: Invalid cache associativity"; r = -EINVAL; goto bad3; } if (!dmc->assoc || (dmc->assoc & (dmc->assoc - 1)) || dmc->assoc > FLASHCACHE_MAX_ASSOC || dmc->assoc < FLASHCACHE_MIN_ASSOC || dmc->size < dmc->assoc) { ti->error = "flashcache: Invalid cache associativity"; r = -EINVAL; goto bad3; } } if (!dmc->assoc) dmc->assoc = DEFAULT_CACHE_ASSOC; 
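	/*
	 * For orientation, a dmsetup table line for this target might look
	 * like (device names and sizes illustrative; flashcache_create
	 * normally generates it):
	 *
	 *   0 <disk_sectors> flashcache /dev/sdb /dev/sdc cachedev 1 2 8 0 512
	 *
	 * i.e. mode 1 (FLASHCACHE_WRITE_BACK) and persistence 2 (CACHE_CREATE).
	 * Note that, as parsed below, argv[8] is the disk associativity,
	 * argv[9] the write-only-cache flag and argv[10] the metadata block
	 * size; the arg[8] line in the block comment above is stale.
	 */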
dmc->assoc_shift = ffs(dmc->assoc) - 1; if (argc >= 9) { if (sscanf(argv[8], "%u", &dmc->disk_assoc) != 1) { ti->error = "flashcache: Invalid disk associativity"; r = -EINVAL; goto bad3; } /* disk_assoc of 0 is permitted value */ if ((dmc->disk_assoc > 0) && ((!dmc->disk_assoc || (dmc->disk_assoc & (dmc->disk_assoc - 1)) || dmc->disk_assoc > FLASHCACHE_MAX_DISK_ASSOC || dmc->disk_assoc < FLASHCACHE_MIN_DISK_ASSOC || dmc->size < dmc->disk_assoc || (dmc->assoc * dmc->block_shift) < dmc->disk_assoc))) { printk(KERN_ERR "Invalid Disk Assoc assoc %d disk_assoc %d size %ld\n", dmc->assoc, dmc->disk_assoc, dmc->size); ti->error = "flashcache: Invalid disk associativity"; r = -EINVAL; goto bad3; } } if (dmc->disk_assoc != 0) dmc->disk_assoc_shift = ffs(dmc->disk_assoc) - 1; if (argc >= 10) { if (sscanf(argv[9], "%u", &dmc->write_only_cache) != 1) { ti->error = "flashcache: Invalid Write Cache setting"; r = -EINVAL; goto bad3; } if ((dmc->write_only_cache == 1) && (dmc->cache_mode != FLASHCACHE_WRITE_BACK)) { printk(KERN_ERR "Write Cache Setting only valid with WRITE_BACK %d\n", dmc->write_only_cache); ti->error = "flashcache: Invalid Write Cache Setting"; r = -EINVAL; goto bad3; } if (dmc->write_only_cache < 0 || dmc->write_only_cache > 1) { printk(KERN_ERR "Invalid Write Cache Setting %d\n", dmc->write_only_cache); ti->error = "flashcache: Invalid Write Cache Setting"; r = -EINVAL; goto bad3; } } if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { if (argc >= 11) { if (sscanf(argv[10], "%u", &dmc->md_block_size) != 1) { ti->error = "flashcache: Invalid metadata block size"; r = -EINVAL; goto bad3; } if (!dmc->md_block_size || (dmc->md_block_size & (dmc->md_block_size - 1)) || dmc->md_block_size > FLASHCACHE_MAX_MD_BLOCK_SIZE) { ti->error = "flashcache: Invalid metadata block size"; r = -EINVAL; goto bad3; } if (dmc->assoc < (dmc->md_block_size * 512 / sizeof(struct flash_cacheblock))) { ti->error = "flashcache: Please choose a smaller metadata block size or larger assoc"; r = -EINVAL; goto bad3; } } if (!dmc->md_block_size) dmc->md_block_size = DEFAULT_MD_BLOCK_SIZE; if (dmc->md_block_size * 512 < dmc->cache_dev->bdev->bd_block_size) { ti->error = "flashcache: Metadata block size must be >= cache device sector size"; r = -EINVAL; goto bad3; } } if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { if (persistence == CACHE_CREATE) { if (flashcache_writeback_create(dmc, 0)) { ti->error = "flashcache: Cache Create Failed"; r = -EINVAL; goto bad3; } } else { if (flashcache_writeback_create(dmc, 1)) { ti->error = "flashcache: Cache Force Create Failed"; r = -EINVAL; goto bad3; } } } else flashcache_writethrough_create(dmc); init: dmc->num_sets = dmc->size >> dmc->assoc_shift; order = dmc->num_sets * sizeof(struct cache_set); dmc->cache_sets = (struct cache_set *)vmalloc(order); if (!dmc->cache_sets) { ti->error = "Unable to allocate memory"; r = -ENOMEM; vfree((void *)dmc->cache); goto bad3; } memset(dmc->cache_sets, 0, order); for (i = 0 ; i < dmc->num_sets ; i++) { dmc->cache_sets[i].set_fifo_next = i * dmc->assoc; dmc->cache_sets[i].set_clean_next = i * dmc->assoc; dmc->cache_sets[i].fallow_tstamp = jiffies; dmc->cache_sets[i].fallow_next_cleaning = jiffies; dmc->cache_sets[i].hotlist_lru_tail = FLASHCACHE_NULL; dmc->cache_sets[i].hotlist_lru_head = FLASHCACHE_NULL; dmc->cache_sets[i].warmlist_lru_tail = FLASHCACHE_NULL; dmc->cache_sets[i].warmlist_lru_head = FLASHCACHE_NULL; spin_lock_init(&dmc->cache_sets[i].set_spin_lock); } atomic_set(&dmc->hot_list_pct, FLASHCACHE_LRU_HOT_PCT_DEFAULT); 
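	/*
	 * Worked example of the set sizing above (numbers illustrative):
	 * with size = 524288 cache blocks and assoc = 512, assoc_shift =
	 * ffs(512) - 1 = 9, so num_sets = 524288 >> 9 = 1024 sets, each
	 * getting its own FIFO pointers, fallow timestamps, LRU list heads
	 * and spinlock in the loop just above.
	 */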
flashcache_reclaim_init_lru_lists(dmc); flashcache_hash_init(dmc); if (flashcache_diskclean_init(dmc)) { ti->error = "Unable to allocate memory"; r = -ENOMEM; vfree((void *)dmc->cache); vfree((void *)dmc->cache_sets); goto bad3; } if (flashcache_kcopy_init(dmc)) { ti->error = "Unable to allocate memory"; r = -ENOMEM; flashcache_diskclean_destroy(dmc); vfree((void *)dmc->cache); vfree((void *)dmc->cache_sets); goto bad3; } if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { order = (dmc->md_blocks - 1) * sizeof(struct cache_md_block_head); dmc->md_blocks_buf = (struct cache_md_block_head *)vmalloc(order); if (!dmc->md_blocks_buf) { ti->error = "Unable to allocate memory"; r = -ENOMEM; flashcache_kcopy_destroy(dmc); flashcache_diskclean_destroy(dmc); vfree((void *)dmc->cache); vfree((void *)dmc->cache_sets); goto bad3; } for (i = 0 ; i < dmc->md_blocks - 1 ; i++) { dmc->md_blocks_buf[i].nr_in_prog = 0; dmc->md_blocks_buf[i].queued_updates = NULL; dmc->md_blocks_buf[i].md_io_inprog = NULL; spin_lock_init(&dmc->md_blocks_buf[i].md_block_lock); } } atomic_set(&dmc->sync_index, 0); atomic_set(&dmc->clean_inprog, 0); atomic_set(&dmc->nr_dirty, 0); atomic_set(&dmc->cached_blocks, 0); atomic_set(&dmc->pending_jobs_count, 0); spin_lock_init(&dmc->ioctl_lock); spin_lock_init(&dmc->cache_pending_q_spinlock); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ti->split_io = dmc->block_size; #else ti->max_io_len = dmc->block_size; #endif ti->private = dmc; /* Cleaning Thresholds */ dmc->sysctl_dirty_thresh = DIRTY_THRESH_DEF; dmc->dirty_thresh_set = (dmc->assoc * dmc->sysctl_dirty_thresh) / 100; dmc->max_clean_ios_total = MAX_CLEAN_IOS_TOTAL; dmc->max_clean_ios_set = MAX_CLEAN_IOS_SET; /* Other sysctl defaults */ dmc->sysctl_io_latency_hist = 0; dmc->sysctl_do_sync = 0; dmc->sysctl_stop_sync = 0; dmc->sysctl_pid_do_expiry = 0; dmc->sysctl_max_pids = MAX_PIDS; dmc->sysctl_pid_expiry_secs = PID_EXPIRY_SECS; dmc->sysctl_reclaim_policy = FLASHCACHE_FIFO; dmc->sysctl_zerostats = 0; dmc->sysctl_error_inject = 0; dmc->sysctl_fast_remove = 0; dmc->sysctl_cache_all = 1; dmc->sysctl_fallow_clean_speed = FALLOW_CLEAN_SPEED; if (dmc->write_only_cache == 0) /* Don't both fallow cleaning for write only caching */ dmc->sysctl_fallow_delay = FALLOW_DELAY; dmc->sysctl_skip_seq_thresh_kb = SKIP_SEQUENTIAL_THRESHOLD; dmc->sysctl_clean_on_read_miss = 0; dmc->sysctl_clean_on_write_miss = 0; dmc->sysctl_lru_hot_pct = 75; dmc->sysctl_lru_promote_thresh = 2; dmc->sysctl_new_style_write_merge = 0; /* Sequential i/o spotting */ for (i = 0; i < SEQUENTIAL_TRACKER_QUEUE_DEPTH; i++) { dmc->seq_recent_ios[i].most_recent_sector = 0; dmc->seq_recent_ios[i].sequential_count = 0; dmc->seq_recent_ios[i].prev = (struct sequential_io *)NULL; dmc->seq_recent_ios[i].next = (struct sequential_io *)NULL; seq_io_move_to_lruhead(dmc, &dmc->seq_recent_ios[i]); } dmc->seq_io_tail = &dmc->seq_recent_ios[0]; #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) (void)wait_on_bit_lock(&flashcache_control->synch_flags, FLASHCACHE_UPDATE_LIST, flashcache_wait_schedule, TASK_UNINTERRUPTIBLE); #else (void)wait_on_bit_lock(&flashcache_control->synch_flags, FLASHCACHE_UPDATE_LIST, TASK_UNINTERRUPTIBLE); #endif dmc->next_cache = cache_list_head; cache_list_head = dmc; clear_bit(FLASHCACHE_UPDATE_LIST, &flashcache_control->synch_flags); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) smp_mb__after_clear_bit(); #else smp_mb__after_atomic(); #endif wake_up_bit(&flashcache_control->synch_flags, FLASHCACHE_UPDATE_LIST); for (i = 0 ; i < dmc->size ; i++) { 
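		/* Rebuild in-core lookup state from the cache block states:
		 * hash chains for VALID blocks, per-set dirty counts for
		 * DIRTY blocks (reloaded caches), and the invalid lists. */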
dmc->cache[i].hash_prev = FLASHCACHE_NULL; dmc->cache[i].hash_next = FLASHCACHE_NULL; if (dmc->cache[i].cache_state & VALID) { flashcache_hash_insert(dmc, i); atomic_inc(&dmc->cached_blocks); } if (dmc->cache[i].cache_state & DIRTY) { dmc->cache_sets[i / dmc->assoc].nr_dirty++; atomic_inc(&dmc->nr_dirty); } if (dmc->cache[i].cache_state & INVALID) flashcache_invalid_insert(dmc, i); } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) INIT_WORK(&dmc->delayed_clean, flashcache_clean_all_sets, dmc); #else INIT_DELAYED_WORK(&dmc->delayed_clean, flashcache_clean_all_sets); #endif dmc->whitelist_head = NULL; dmc->whitelist_tail = NULL; dmc->blacklist_head = NULL; dmc->blacklist_tail = NULL; dmc->num_whitelist_pids = 0; dmc->num_blacklist_pids = 0; flashcache_ctr_procfs(dmc); return 0; bad3: dm_put_device(ti, dmc->cache_dev); bad2: dm_put_device(ti, dmc->disk_dev); bad1: kfree(dmc); bad: return r; } static void flashcache_dtr_stats_print(struct cache_c *dmc) { int read_hit_pct, write_hit_pct, dirty_write_hit_pct; struct flashcache_stats *stats = &dmc->flashcache_stats; u_int64_t cache_pct, dirty_pct; char *cache_mode; int i; if (stats->reads > 0) read_hit_pct = stats->read_hits * 100 / stats->reads; else read_hit_pct = 0; if (stats->writes > 0) { write_hit_pct = stats->write_hits * 100 / stats->writes; dirty_write_hit_pct = stats->dirty_write_hits * 100 / stats->writes; } else { write_hit_pct = 0; dirty_write_hit_pct = 0; } DMINFO("stats: \n\treads(%lu), writes(%lu)", stats->reads, stats->writes); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { DMINFO("\tread hits(%lu), read hit percent(%d)\n" \ "\twrite hits(%lu) write hit percent(%d)\n" \ "\tdirty write hits(%lu) dirty write hit percent(%d)\n" \ "\treplacement(%lu), write replacement(%lu)\n" \ "\twrite invalidates(%lu), read invalidates(%lu)\n" , stats->read_hits, read_hit_pct, stats->write_hits, write_hit_pct, stats->dirty_write_hits, dirty_write_hit_pct, stats->replace, stats->wr_replace, stats->wr_invalidates, stats->rd_invalidates); #ifdef FLASHCACHE_DO_CHECKSUMS DMINFO("\tchecksum store(%ld), checksum valid(%ld), checksum invalid(%ld)\n", stats->checksum_store, stats->checksum_valid, stats->checksum_invalid); #endif DMINFO("\tpending enqueues(%lu), pending inval(%lu)\n" \ "\tmetadata dirties(%lu), metadata cleans(%lu)\n" \ "\tmetadata batch(%lu) metadata ssd writes(%lu)\n" \ "\tcleanings(%lu) fallow cleanings(%lu)\n" \ "\tno room(%lu) front merge(%lu) back merge(%lu)\n", stats->enqueues, stats->pending_inval, stats->md_write_dirty, stats->md_write_clean, stats->md_write_batch, stats->md_ssd_writes, stats->cleanings, stats->fallow_cleanings, stats->noroom, stats->front_merge, stats->back_merge); } else if (dmc->cache_mode == FLASHCACHE_WRITE_THROUGH) { DMINFO("\tread hits(%lu), read hit percent(%d)\n" \ "\twrite hits(%lu) write hit percent(%d)\n" \ "\treplacement(%lu)\n" \ "\twrite invalidates(%lu), read invalidates(%lu)\n", stats->read_hits, read_hit_pct, stats->write_hits, write_hit_pct, stats->replace, stats->wr_invalidates, stats->rd_invalidates); #ifdef FLASHCACHE_DO_CHECKSUMS DMINFO("\tchecksum store(%ld), checksum valid(%ld), checksum invalid(%ld)\n", stats->checksum_store, stats->checksum_valid, stats->checksum_invalid); #endif DMINFO("\tpending enqueues(%lu), pending inval(%lu)\n" \ "\tno room(%lu)\n", stats->enqueues, stats->pending_inval, stats->noroom); } else { /* WRITE_AROUND */ DMINFO("\tread hits(%lu), read hit percent(%d)\n" \ "\treplacement(%lu)\n" \ "\tinvalidates(%lu)\n", stats->read_hits, read_hit_pct, stats->replace, 
stats->rd_invalidates); #ifdef FLASHCACHE_DO_CHECKSUMS DMINFO("\tchecksum store(%ld), checksum valid(%ld), checksum invalid(%ld)\n", stats->checksum_store, stats->checksum_valid, stats->checksum_invalid); #endif DMINFO("\tpending enqueues(%lu), pending inval(%lu)\n" \ "\tno room(%lu)\n", stats->enqueues, stats->pending_inval, stats->noroom); } /* All modes */ DMINFO("\tdisk reads(%lu), disk writes(%lu) ssd reads(%lu) ssd writes(%lu)\n" \ "\tuncached reads(%lu), uncached writes(%lu), uncached IO requeue(%lu)\n" \ "\tdisk read errors(%d), disk write errors(%d) ssd read errors(%d) ssd write errors(%d)\n" \ "\tuncached sequential reads(%lu), uncached sequential writes(%lu)\n" \ "\tpid_adds(%lu), pid_dels(%lu), pid_drops(%lu) pid_expiry(%lu)", stats->disk_reads, stats->disk_writes, stats->ssd_reads, stats->ssd_writes, stats->uncached_reads, stats->uncached_writes, stats->uncached_io_requeue, dmc->flashcache_errors.disk_read_errors, dmc->flashcache_errors.disk_write_errors, dmc->flashcache_errors.ssd_read_errors, dmc->flashcache_errors.ssd_write_errors, stats->uncached_sequential_reads, stats->uncached_sequential_writes, stats->pid_adds, stats->pid_dels, stats->pid_drops, stats->expiry); if (dmc->size > 0) { dirty_pct = ((u_int64_t)atomic_read(&dmc->nr_dirty) * 100) / dmc->size; cache_pct = ((u_int64_t)atomic_read(&dmc->cached_blocks) * 100) / dmc->size; } else { cache_pct = 0; dirty_pct = 0; } if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) cache_mode = "WRITE_BACK"; else if (dmc->cache_mode == FLASHCACHE_WRITE_THROUGH) cache_mode = "WRITE_THROUGH"; else cache_mode = "WRITE_AROUND"; DMINFO("conf:\n" \ "\tvirt dev (%s), ssd dev (%s), disk dev (%s) cache mode(%s)\n" \ "\tcapacity(%luM), associativity(%u), data block size(%uK) metadata block size(%ub)\n" \ "\tskip sequential thresh(%uK)\n" \ "\ttotal blocks(%lu), cached blocks(%d), cache percent(%d)\n" \ "\tdirty blocks(%d), dirty percent(%d)\n", dmc->dm_vdevname, dmc->cache_devname, dmc->disk_devname, cache_mode, dmc->size*dmc->block_size>>11, dmc->assoc, dmc->block_size>>(10-SECTOR_SHIFT), dmc->md_block_size * 512, dmc->sysctl_skip_seq_thresh_kb, dmc->size, atomic_read(&dmc->cached_blocks), (int)cache_pct, atomic_read(&dmc->nr_dirty), (int)dirty_pct); DMINFO("\tnr_queued(%d)\n", atomic_read(&dmc->pending_jobs_count)); DMINFO("Size Hist: "); for (i = 1 ; i <= 32 ; i++) { if (size_hist[i] > 0) DMINFO("%d:%llu ", i*512, size_hist[i]); } } /* * Destroy the cache mapping. 
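 * For a write-back cache this first syncs dirty blocks to disk (unless
 * sysctl fast_remove is set) and writes the cache metadata out, prints
 * the final stats, unlinks the cache from the global cache_list_head
 * list, and frees the hash, diskclean, kcopy, pid-list and per-set
 * structures.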
*/ void flashcache_dtr(struct dm_target *ti) { struct cache_c *dmc = (struct cache_c *) ti->private; struct cache_c **nodepp; int i; int nr_queued = 0; flashcache_dtr_procfs(dmc); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { flashcache_sync_for_remove(dmc); flashcache_writeback_md_store(dmc); } if (!dmc->sysctl_fast_remove && atomic_read(&dmc->nr_dirty) > 0) DMERR("Could not sync %d blocks to disk, cache still dirty", atomic_read(&dmc->nr_dirty)); DMINFO("cache jobs %d, pending jobs %d", atomic_read(&nr_cache_jobs), atomic_read(&nr_pending_jobs)); for (i = 0 ; i < dmc->size ; i++) nr_queued += dmc->cache[i].nr_queued; DMINFO("cache queued jobs %d", nr_queued); flashcache_dtr_stats_print(dmc); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) (void)wait_on_bit_lock(&flashcache_control->synch_flags, FLASHCACHE_UPDATE_LIST, flashcache_wait_schedule, TASK_UNINTERRUPTIBLE); #else (void)wait_on_bit_lock(&flashcache_control->synch_flags, FLASHCACHE_UPDATE_LIST, TASK_UNINTERRUPTIBLE); #endif nodepp = &cache_list_head; while (*nodepp != NULL) { if (*nodepp == dmc) { *nodepp = dmc->next_cache; break; } nodepp = &((*nodepp)->next_cache); } clear_bit(FLASHCACHE_UPDATE_LIST, &flashcache_control->synch_flags); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) smp_mb__after_clear_bit(); #else smp_mb__after_atomic(); #endif wake_up_bit(&flashcache_control->synch_flags, FLASHCACHE_UPDATE_LIST); flashcache_hash_destroy(dmc); flashcache_diskclean_destroy(dmc); flashcache_kcopy_destroy(dmc); vfree((void *)dmc->cache); vfree((void *)dmc->cache_sets); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) vfree((void *)dmc->md_blocks_buf); flashcache_del_all_pids(dmc, FLASHCACHE_WHITELIST, 1); flashcache_del_all_pids(dmc, FLASHCACHE_BLACKLIST, 1); VERIFY(dmc->num_whitelist_pids == 0); VERIFY(dmc->num_blacklist_pids == 0); dm_put_device(ti, dmc->disk_dev); dm_put_device(ti, dmc->cache_dev); kfree(dmc); } void flashcache_status_info(struct cache_c *dmc, status_type_t type, char *result, unsigned int maxlen) { int read_hit_pct, write_hit_pct, dirty_write_hit_pct; int sz = 0; /* DMEMIT */ struct flashcache_stats *stats = &dmc->flashcache_stats; if (stats->reads > 0) read_hit_pct = stats->read_hits * 100 / stats->reads; else read_hit_pct = 0; if (stats->writes > 0) { write_hit_pct = stats->write_hits * 100 / stats->writes; dirty_write_hit_pct = stats->dirty_write_hits * 100 / stats->writes; } else { write_hit_pct = 0; dirty_write_hit_pct = 0; } DMEMIT("stats: \n\treads(%lu), writes(%lu)\n", stats->reads, stats->writes); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { DMEMIT("\tread hits(%lu), read hit percent(%d)\n" \ "\twrite hits(%lu) write hit percent(%d)\n" \ "\tdirty write hits(%lu) dirty write hit percent(%d)\n" \ "\treplacement(%lu), write replacement(%lu)\n" \ "\twrite invalidates(%lu), read invalidates(%lu)\n", stats->read_hits, read_hit_pct, stats->write_hits, write_hit_pct, stats->dirty_write_hits, dirty_write_hit_pct, stats->replace, stats->wr_replace, stats->wr_invalidates, stats->rd_invalidates); #ifdef FLASHCACHE_DO_CHECKSUMS DMEMIT("\tchecksum store(%ld), checksum valid(%ld), checksum invalid(%ld)\n", stats->checksum_store, stats->checksum_valid, stats->checksum_invalid); #endif DMEMIT("\tpending enqueues(%lu), pending inval(%lu)\n" \ "\tmetadata dirties(%lu), metadata cleans(%lu)\n" \ "\tmetadata batch(%lu) metadata ssd writes(%lu)\n" \ "\tcleanings(%lu) fallow cleanings(%lu)\n" \ "\tno room(%lu) front merge(%lu) back merge(%lu)\n" \ "\tforce_clean_block(%lu)\n", stats->enqueues, stats->pending_inval, 
stats->md_write_dirty, stats->md_write_clean, stats->md_write_batch, stats->md_ssd_writes, stats->cleanings, stats->fallow_cleanings, stats->noroom, stats->front_merge, stats->back_merge, stats->force_clean_block); } else if (dmc->cache_mode == FLASHCACHE_WRITE_THROUGH) { DMEMIT("\tread hits(%lu), read hit percent(%d)\n" \ "\twrite hits(%lu) write hit percent(%d)\n" \ "\treplacement(%lu), write replacement(%lu)\n" \ "\twrite invalidates(%lu), read invalidates(%lu)\n", stats->read_hits, read_hit_pct, stats->write_hits, write_hit_pct, stats->replace, stats->wr_replace, stats->wr_invalidates, stats->rd_invalidates); #ifdef FLASHCACHE_DO_CHECKSUMS DMEMIT("\tchecksum store(%ld), checksum valid(%ld), checksum invalid(%ld)\n", stats->checksum_store, stats->checksum_valid, stats->checksum_invalid); #endif DMEMIT("\tpending enqueues(%lu), pending inval(%lu)\n" \ "\tno room(%lu)\n", stats->enqueues, stats->pending_inval, stats->noroom); } else { /* WRITE_AROUND */ DMEMIT("\tread hits(%lu), read hit percent(%d)\n" \ "\treplacement(%lu), write replacement(%lu)\n" \ "\tinvalidates(%lu)\n", stats->read_hits, read_hit_pct, stats->replace, stats->wr_replace, stats->rd_invalidates); #ifdef FLASHCACHE_DO_CHECKSUMS DMEMIT("\tchecksum store(%ld), checksum valid(%ld), checksum invalid(%ld)\n", stats->checksum_store, stats->checksum_valid, stats->checksum_invalid); #endif DMEMIT("\tpending enqueues(%lu), pending inval(%lu)\n" \ "\tno room(%lu)\n", stats->enqueues, stats->pending_inval, stats->noroom); } /* All modes */ DMEMIT("\tdisk reads(%lu), disk writes(%lu) ssd reads(%lu) ssd writes(%lu)\n" \ "\tuncached reads(%lu), uncached writes(%lu), uncached IO requeue(%lu)\n" \ "\tdisk read errors(%d), disk write errors(%d) ssd read errors(%d) ssd write errors(%d)\n" \ "\tuncached sequential reads(%lu), uncached sequential writes(%lu)\n" \ "\tpid_adds(%lu), pid_dels(%lu), pid_drops(%lu) pid_expiry(%lu)\n" \ "\tlru hot blocks(%d), lru warm blocks(%d)\n" \ "\tlru promotions(%lu), lru demotions(%lu)", stats->disk_reads, stats->disk_writes, stats->ssd_reads, stats->ssd_writes, stats->uncached_reads, stats->uncached_writes, stats->uncached_io_requeue, dmc->flashcache_errors.disk_read_errors, dmc->flashcache_errors.disk_write_errors, dmc->flashcache_errors.ssd_read_errors, dmc->flashcache_errors.ssd_write_errors, stats->uncached_sequential_reads, stats->uncached_sequential_writes, stats->pid_adds, stats->pid_dels, stats->pid_drops, stats->expiry, dmc->lru_hot_blocks, dmc->lru_warm_blocks, stats->lru_promotions, stats->lru_demotions); if (dmc->sysctl_io_latency_hist) { int i; DMEMIT("\nIO Latency Histogram: \n"); for (i = 1 ; i <= IO_LATENCY_BUCKETS ; i++) { DMEMIT("< %d\tusecs : %lu\n", i * IO_LATENCY_GRAN_USECS, dmc->latency_hist[i - 1]); } DMEMIT("> 10\tmsecs : %lu", dmc->latency_hist_10ms); } } static void flashcache_status_table(struct cache_c *dmc, status_type_t type, char *result, unsigned int maxlen) { u_int64_t cache_pct, dirty_pct; int i; int sz = 0; /* DMEMIT */ char *cache_mode; if (dmc->size > 0) { dirty_pct = ((u_int64_t)atomic_read(&dmc->nr_dirty) * 100) / dmc->size; cache_pct = ((u_int64_t)atomic_read(&dmc->cached_blocks) * 100) / dmc->size; } else { cache_pct = 0; dirty_pct = 0; } if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { if (dmc->write_only_cache) cache_mode = "WRITE_CACHE"; else cache_mode = "WRITE_BACK"; } else if (dmc->cache_mode == FLASHCACHE_WRITE_THROUGH) cache_mode = "WRITE_THROUGH"; else cache_mode = "WRITE_AROUND"; DMEMIT("conf:\n"); DMEMIT("\tssd dev (%s), disk dev (%s) cache mode(%s)\n", 
dmc->cache_devname, dmc->disk_devname, cache_mode); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { DMEMIT("\tcapacity(%luM), associativity(%u), data block size(%uK) metadata block size(%ub)\n", dmc->size*dmc->block_size>>11, dmc->assoc, dmc->block_size>>(10-SECTOR_SHIFT), dmc->md_block_size * 512); } else { DMEMIT("\tcapacity(%luM), associativity(%u), data block size(%uK)\n", dmc->size*dmc->block_size>>11, dmc->assoc, dmc->block_size>>(10-SECTOR_SHIFT)); } DMEMIT("\tdisk assoc(%uK)\n", dmc->disk_assoc >> (10 - SECTOR_SHIFT)); DMEMIT("\tskip sequential thresh(%uK)\n", dmc->sysctl_skip_seq_thresh_kb); DMEMIT("\ttotal blocks(%lu), cached blocks(%d), cache percent(%d)\n", dmc->size, atomic_read(&dmc->cached_blocks), (int)cache_pct); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { DMEMIT("\tdirty blocks(%d), dirty percent(%d)\n", atomic_read(&dmc->nr_dirty), (int)dirty_pct); } DMEMIT("\tnr_queued(%d)\n", atomic_read(&dmc->pending_jobs_count)); DMEMIT("Size Hist: "); for (i = 1 ; i <= 32 ; i++) { if (size_hist[i] > 0) DMEMIT("%d:%llu ", i*512, size_hist[i]); } #if 0 DMEMIT("\n"); DMEMIT("Write Clustering Hist: "); for (i = 0 ; i < FLASHCACHE_WRITE_CLUST_HIST_SIZE ; i++) { if (dmc->write_clust_hist[i] > 0) DMEMIT("%d:%llu ", i, dmc->write_clust_hist[i]); } DMEMIT(">=128:%llu ", dmc->write_clust_hist_ovf); #endif } /* * Report cache status: * Output cache stats upon request of device status; * Output cache configuration upon request of table status. */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) void flashcache_status(struct dm_target *ti, status_type_t type, unsigned int unused_status_flags, char *result, unsigned int maxlen) #elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) int flashcache_status(struct dm_target *ti, status_type_t type, unsigned int unused_status_flags, char *result, unsigned int maxlen) #else int flashcache_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) #endif { struct cache_c *dmc = (struct cache_c *) ti->private; switch (type) { case STATUSTYPE_INFO: flashcache_status_info(dmc, type, result, maxlen); break; case STATUSTYPE_TABLE: flashcache_status_table(dmc, type, result, maxlen); break; } #if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) return 0; #endif } static int flashcache_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct cache_c *dmc = (struct cache_c *) ti->private; int ret = 0; ret = fn(ti, dmc->cache_dev, 0, to_sector(dmc->cache_dev->bdev->bd_inode->i_size), data); if (!ret) ret = fn(ti, dmc->disk_dev, 0, ti->len, data); return ret; } static struct target_type flashcache_target = { .name = "flashcache", .version= {1, 0, 4}, .module = THIS_MODULE, .ctr = flashcache_ctr, .dtr = flashcache_dtr, .map = flashcache_map, .status = flashcache_status, .ioctl = flashcache_ioctl, .iterate_devices = flashcache_iterate_devices, }; static void flashcache_sync_for_remove(struct cache_c *dmc) { do { atomic_set(&dmc->remove_in_prog, SLOW_REMOVE); /* Stop cleaning of sets */ if (!dmc->sysctl_fast_remove) { /* * Kick off cache cleaning. client_destroy will wait for cleanings * to finish. 
*/ printk(KERN_ALERT "Cleaning %d blocks please WAIT", atomic_read(&dmc->nr_dirty)); /* Tune up the cleaning parameters to clean very aggressively */ dmc->max_clean_ios_total = 20; dmc->max_clean_ios_set = 10; flashcache_sync_all(dmc); } else { /* Needed to abort any in-progress cleanings, leave blocks DIRTY */ atomic_set(&dmc->remove_in_prog, FAST_REMOVE); printk(KERN_ALERT "Fast flashcache remove Skipping cleaning of %d blocks", atomic_read(&dmc->nr_dirty)); } /* * We've prevented new cleanings from starting (for the fast remove case) * and we will wait for all in progress cleanings to exit. * Wait a few seconds for everything to quiesce before writing out the * cache metadata. */ msleep(FLASHCACHE_SYNC_REMOVE_DELAY); /* Wait for all the dirty blocks to get written out, and any other IOs */ wait_event(dmc->destroyq, !atomic_read(&dmc->nr_jobs)); cancel_delayed_work(&dmc->delayed_clean); flush_scheduled_work(); } while (!dmc->sysctl_fast_remove && atomic_read(&dmc->nr_dirty) > 0); } static int flashcache_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { struct cache_c *dmc; #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) (void)wait_on_bit_lock(&flashcache_control->synch_flags, FLASHCACHE_UPDATE_LIST, flashcache_wait_schedule, TASK_UNINTERRUPTIBLE); #else (void)wait_on_bit_lock(&flashcache_control->synch_flags, FLASHCACHE_UPDATE_LIST, TASK_UNINTERRUPTIBLE); #endif for (dmc = cache_list_head ; dmc != NULL ; dmc = dmc->next_cache) { if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { flashcache_sync_for_remove(dmc); flashcache_writeback_md_store(dmc); dm_put_device(dmc->tgt, dmc->cache_dev); dm_put_device(dmc->tgt, dmc->disk_dev); } } clear_bit(FLASHCACHE_UPDATE_LIST, &flashcache_control->synch_flags); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) smp_mb__after_clear_bit(); #else smp_mb__after_atomic(); #endif wake_up_bit(&flashcache_control->synch_flags, FLASHCACHE_UPDATE_LIST); return NOTIFY_DONE; } /* * The notifiers are registered in descending order of priority and * executed in descending order or priority. We should be run before * any notifiers of ssd's or other block devices. Typically, devices * use a priority of 0. * XXX - If in the future we happen to use a md device as the cache * block device, we have a problem because md uses a priority of * INT_MAX as well. But we want to run before the md's reboot notifier ! */ static struct notifier_block flashcache_notifier = { .notifier_call = flashcache_notify_reboot, .next = NULL, .priority = INT_MAX, /* should be > ssd pri's and disk dev pri's */ }; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) struct dm_kcopyd_client *flashcache_kcp_client; /* Kcopyd client for writing back data */ #else struct kcopyd_client *flashcache_kcp_client; /* Kcopyd client for writing back data */ #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) struct dm_io_client *flashcache_io_client; /* Client memory pool*/ #endif /* * Initiate a cache target. 
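 * Sets up the job pools, creates the dm_io and kcopyd clients through
 * the kernel-version-specific APIs below, registers the "flashcache"
 * device mapper target plus the reboot notifier, and creates the
 * module procfs entries.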
*/ int __init flashcache_init(void) { int r; r = flashcache_jobs_init(); if (r) return r; atomic_set(&nr_cache_jobs, 0); atomic_set(&nr_pending_jobs, 0); #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) r = dm_io_get(FLASHCACHE_ASYNC_SIZE); if (r) { DMERR("flashcache_init: Could not size dm io pool"); return r; } r = kcopyd_client_create(FLASHCACHE_COPY_PAGES, &flashcache_kcp_client); if (r) { DMERR("flashcache_init: Failed to initialize kcopyd client"); dm_io_put(FLASHCACHE_ASYNC_SIZE); return r; } #else /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) */ #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0)) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= 1538)) flashcache_io_client = dm_io_client_create(); #else flashcache_io_client = dm_io_client_create(FLASHCACHE_COPY_PAGES); #endif if (IS_ERR(flashcache_io_client)) { DMERR("flashcache_init: Failed to initialize DM IO client"); return r; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) r = kcopyd_client_create(FLASHCACHE_COPY_PAGES, &flashcache_kcp_client); #elif ((LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0)) && (LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= 1538) && (RHEL_RELEASE_CODE <= 1540)) flashcache_kcp_client = dm_kcopyd_client_create(); if ((r = IS_ERR(flashcache_kcp_client))) { r = PTR_ERR(flashcache_kcp_client); } #elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0)) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= 1541)) flashcache_kcp_client = dm_kcopyd_client_create(NULL); if ((r = IS_ERR(flashcache_kcp_client))) { r = PTR_ERR(flashcache_kcp_client); } #else /* .26 <= VERSION < 3.0.0 */ r = dm_kcopyd_client_create(FLASHCACHE_COPY_PAGES, &flashcache_kcp_client); #endif /* .26 <= VERSION < 3.0.0 */ if (r) { dm_io_client_destroy(flashcache_io_client); DMERR("flashcache_init: Failed to initialize kcopyd client"); return r; } #endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) INIT_WORK(&_kcached_wq, do_work, NULL); #else INIT_WORK(&_kcached_wq, do_work); #endif for (r = 0 ; r < 33 ; r++) size_hist[r] = 0; r = dm_register_target(&flashcache_target); if (r < 0) { DMERR("cache: register failed %d", r); } printk("flashcache: %s initialized\n", flashcache_sw_version); flashcache_module_procfs_init(); flashcache_control = (struct flashcache_control_s *) kmalloc(sizeof(struct flashcache_control_s), GFP_KERNEL); flashcache_control->synch_flags = 0; register_reboot_notifier(&flashcache_notifier); return r; } /* * Destroy a cache target. 
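 * Tears init down in reverse: unregisters the target, destroys the
 * kcopyd and dm_io clients, unregisters the reboot notifier and
 * releases the job pools, procfs entries and the control structure.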
*/ void __exit flashcache_exit(void) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) int r = dm_unregister_target(&flashcache_target); if (r < 0) DMERR("cache: unregister failed %d", r); #else dm_unregister_target(&flashcache_target); #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) kcopyd_client_destroy(flashcache_kcp_client); #else dm_kcopyd_client_destroy(flashcache_kcp_client); #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) dm_io_client_destroy(flashcache_io_client); #else dm_io_put(FLASHCACHE_ASYNC_SIZE); #endif unregister_reboot_notifier(&flashcache_notifier); flashcache_jobs_exit(); flashcache_module_procfs_release(); kfree(flashcache_control); } module_init(flashcache_init); module_exit(flashcache_exit); EXPORT_SYMBOL(flashcache_writeback_load); EXPORT_SYMBOL(flashcache_writeback_create); EXPORT_SYMBOL(flashcache_writeback_md_store); MODULE_DESCRIPTION(DM_NAME " Facebook flash cache target"); MODULE_AUTHOR("Mohan - based on code by Ming"); MODULE_LICENSE("GPL"); flashcache-3.1.3+git20150701/src/flashcache_ioctl.c000066400000000000000000000413671254507146700214320ustar00rootroot00000000000000/**************************************************************************** * flashcache_ioctl.c * FlashCache: Device mapper target for block-level disk caching * * Copyright 2010 Facebook, Inc. * Author: Mohan Srinivasan (mohan@fb.com) * * Based on DM-Cache: * Copyright (C) International Business Machines Corp., 2006 * Author: Ming Zhao (mingzhao@ufl.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . ****************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) #include "dm.h" #include "dm-io.h" #include "dm-bio-list.h" #include "kcopyd.h" #else #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) #include "dm.h" #endif #include #include #include #endif #include "flashcache.h" #include "flashcache_ioctl.h" static int flashcache_find_pid_locked(struct cache_c *dmc, pid_t pid, int which_list); static void flashcache_del_pid_locked(struct cache_c *dmc, pid_t pid, int which_list); static int flashcache_find_pid_locked(struct cache_c *dmc, pid_t pid, int which_list) { struct flashcache_cachectl_pid *pid_list; pid_list = ((which_list == FLASHCACHE_WHITELIST) ? 
dmc->whitelist_head : dmc->blacklist_head); for ( ; pid_list != NULL ; pid_list = pid_list->next) { if (pid_list->pid == pid) return 1; } return 0; } static void flashcache_drop_pids(struct cache_c *dmc, int which_list) { if (which_list == FLASHCACHE_WHITELIST) { while (dmc->num_whitelist_pids >= dmc->sysctl_max_pids) { VERIFY(dmc->whitelist_head != NULL); flashcache_del_pid_locked(dmc, dmc->whitelist_tail->pid, which_list); dmc->flashcache_stats.pid_drops++; } } else { while (dmc->num_blacklist_pids >= dmc->sysctl_max_pids) { VERIFY(dmc->blacklist_head != NULL); flashcache_del_pid_locked(dmc, dmc->blacklist_tail->pid, which_list); dmc->flashcache_stats.pid_drops++; } } } static void flashcache_add_pid(struct cache_c *dmc, pid_t pid, int which_list) { struct flashcache_cachectl_pid *new; unsigned long flags; new = kmalloc(sizeof(struct flashcache_cachectl_pid), GFP_KERNEL); new->pid = pid; new->next = NULL; new->expiry = jiffies + dmc->sysctl_pid_expiry_secs * HZ; spin_lock_irqsave(&dmc->ioctl_lock, flags); if (which_list == FLASHCACHE_WHITELIST) { if (dmc->num_whitelist_pids > dmc->sysctl_max_pids) flashcache_drop_pids(dmc, which_list); } else { if (dmc->num_blacklist_pids > dmc->sysctl_max_pids) flashcache_drop_pids(dmc, which_list); } if (flashcache_find_pid_locked(dmc, pid, which_list) == 0) { struct flashcache_cachectl_pid **head, **tail; if (which_list == FLASHCACHE_WHITELIST) { head = &dmc->whitelist_head; tail = &dmc->whitelist_tail; } else { head = &dmc->blacklist_head; tail = &dmc->blacklist_tail; } /* Add the new pid to the tail */ new->prev = *tail; if (*head == NULL) { VERIFY(*tail == NULL); *head = new; } else { VERIFY(*tail != NULL); (*tail)->next = new; } *tail = new; if (which_list == FLASHCACHE_WHITELIST) dmc->num_whitelist_pids++; else dmc->num_blacklist_pids++; dmc->flashcache_stats.pid_adds++; /* When adding the first entry to list, set expiry check timeout */ if (*head == new) dmc->pid_expire_check = jiffies + ((dmc->sysctl_pid_expiry_secs + 1) * HZ); } else kfree(new); spin_unlock_irqrestore(&dmc->ioctl_lock, flags); return; } static void flashcache_del_pid_locked(struct cache_c *dmc, pid_t pid, int which_list) { struct flashcache_cachectl_pid *node; struct flashcache_cachectl_pid **head, **tail; if (which_list == FLASHCACHE_WHITELIST) { head = &dmc->whitelist_head; tail = &dmc->whitelist_tail; } else { head = &dmc->blacklist_head; tail = &dmc->blacklist_tail; } for (node = *tail ; node != NULL ; node = node->prev) { if (which_list == FLASHCACHE_WHITELIST) VERIFY(dmc->num_whitelist_pids > 0); else VERIFY(dmc->num_blacklist_pids > 0); if (node->pid == pid) { if (node->prev == NULL) { *head = node->next; if (node->next) node->next->prev = NULL; } else node->prev->next = node->next; if (node->next == NULL) { *tail = node->prev; if (node->prev) node->prev->next = NULL; } else node->next->prev = node->prev; kfree(node); dmc->flashcache_stats.pid_dels++; if (which_list == FLASHCACHE_WHITELIST) dmc->num_whitelist_pids--; else dmc->num_blacklist_pids--; return; } } } static void flashcache_del_pid(struct cache_c *dmc, pid_t pid, int which_list) { unsigned long flags; spin_lock_irqsave(&dmc->ioctl_lock, flags); flashcache_del_pid_locked(dmc, pid, which_list); spin_unlock_irqrestore(&dmc->ioctl_lock, flags); } /* * This removes all "dead" pids. Pids that may have not cleaned up. 
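 * With force set (the cache-teardown path) every entry is dropped; with
 * force == 0 (the "del all" ioctls) a pid whose task can still be found
 * is skipped, so a rogue "delete all" cannot strip live threads off the
 * list.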
*/ void flashcache_del_all_pids(struct cache_c *dmc, int which_list, int force) { struct flashcache_cachectl_pid *node, **tail; unsigned long flags; if (which_list == FLASHCACHE_WHITELIST) tail = &dmc->whitelist_tail; else tail = &dmc->blacklist_tail; rcu_read_lock(); spin_lock_irqsave(&dmc->ioctl_lock, flags); node = *tail; while (node != NULL) { #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31)) || (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,38)) if (force == 0) { struct task_struct *task; #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) task = find_task_by_pid_type(PIDTYPE_PID, node->pid); #elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) task = find_task_by_vpid(node->pid); #else task = pid_task(find_vpid(node->pid), PIDTYPE_PID); #endif /* * If that task was found, don't remove it ! * This prevents a rogue "delete all" from removing * every thread from the list. */ if (task) { node = node->prev; continue; } } #endif flashcache_del_pid_locked(dmc, node->pid, which_list); node = *tail; } spin_unlock_irqrestore(&dmc->ioctl_lock, flags); rcu_read_unlock(); } static void flashcache_pid_expiry_list_locked(struct cache_c *dmc, int which_list) { struct flashcache_cachectl_pid **head, **tail, *node; if (which_list == FLASHCACHE_WHITELIST) { head = &dmc->whitelist_head; tail = &dmc->whitelist_tail; } else { head = &dmc->blacklist_head; tail = &dmc->blacklist_tail; } for (node = *head ; node != NULL ; node = node->next) { if (which_list == FLASHCACHE_WHITELIST) VERIFY(dmc->num_whitelist_pids > 0); else VERIFY(dmc->num_blacklist_pids > 0); if (time_after(node->expiry, jiffies)) continue; if (node->prev == NULL) { *head = node->next; if (node->next) node->next->prev = NULL; } else node->prev->next = node->next; if (node->next == NULL) { *tail = node->prev; if (node->prev) node->prev->next = NULL; } else node->next->prev = node->prev; kfree(node); if (which_list == FLASHCACHE_WHITELIST) dmc->num_whitelist_pids--; else dmc->num_blacklist_pids--; dmc->flashcache_stats.expiry++; } } void flashcache_pid_expiry_all_locked(struct cache_c *dmc) { if (likely(time_before(jiffies, dmc->pid_expire_check))) return; flashcache_pid_expiry_list_locked(dmc, FLASHCACHE_WHITELIST); flashcache_pid_expiry_list_locked(dmc, FLASHCACHE_BLACKLIST); dmc->pid_expire_check = jiffies + (dmc->sysctl_pid_expiry_secs + 1) * HZ; } /* * Is the IO cacheable, depending on global cacheability and the white/black * lists ? This function is a bit confusing because we want to support inheritance * of cacheability across pthreads (so we use the tgid). But when an entire thread * group is added to the white/black list, we want to provide for exceptions for * individual threads as well. * The Rules (in decreasing order of priority) : * 1) Check the pid (thread id) against the list. * 2) Check the tgid against the list, then check for exceptions within the tgid. * 3) Possibly don't cache sequential i/o. */ int flashcache_uncacheable(struct cache_c *dmc, struct bio *bio) { int dontcache; if (unlikely(dmc->bypass_cache)) { dontcache = 1; goto out; } if (dmc->sysctl_cache_all) { /* If the tid has been blacklisted, we don't cache at all. This overrides everything else */ dontcache = flashcache_find_pid_locked(dmc, current->pid, FLASHCACHE_BLACKLIST); if (dontcache) goto out; /* Is the tgid in the blacklist ? */ dontcache = flashcache_find_pid_locked(dmc, current->tgid, FLASHCACHE_BLACKLIST); /* * If we found the tgid in the blacklist, is there a whitelist * exception entered for this thread ? 
*/ if (dontcache) { if (flashcache_find_pid_locked(dmc, current->pid, FLASHCACHE_WHITELIST)) { dontcache = 0; goto out; } } /* Finally, if we are neither in a whitelist or a blacklist, * do a final check to see if this is sequential i/o. If * the relevant sysctl is set, we will skip it. */ if (!dontcache) dontcache = skip_sequential_io(dmc, bio); } else { /* cache nothing */ /* If the tid has been whitelisted, we cache This overrides everything else */ dontcache = !flashcache_find_pid_locked(dmc, current->pid, FLASHCACHE_WHITELIST); if (!dontcache) goto out; /* Is the tgid in the whitelist ? */ dontcache = !flashcache_find_pid_locked(dmc, current->tgid, FLASHCACHE_WHITELIST); /* * If we found the tgid in the whitelist, is there a black list * exception entered for this thread ? */ if (!dontcache) { if (flashcache_find_pid_locked(dmc, current->pid, FLASHCACHE_BLACKLIST)) dontcache = 1; } /* No sequential handling here. If we add to the whitelist, * everything is cached, sequential or not. */ } out: return dontcache; } /* Below 2 functions manage the LRU cache of recent IO 'flows'. * A sequential IO will only take up one slot (we keep updating the * last sector seen) but random IO will quickly fill multiple slots. * We allocate the LRU cache from a small fixed sized buffer at startup. */ void seq_io_remove_from_lru(struct cache_c *dmc, struct sequential_io *seqio) { if (seqio->prev != NULL) seqio->prev->next = seqio->next; else { VERIFY(dmc->seq_io_head == seqio); dmc->seq_io_head = seqio->next; } if (seqio->next != NULL) seqio->next->prev = seqio->prev; else { VERIFY(dmc->seq_io_tail == seqio); dmc->seq_io_tail = seqio->prev; } } void seq_io_move_to_lruhead(struct cache_c *dmc, struct sequential_io *seqio) { if (likely(seqio->prev != NULL || seqio->next != NULL)) seq_io_remove_from_lru(dmc, seqio); /* Add it to LRU head */ if (dmc->seq_io_head != NULL) dmc->seq_io_head->prev = seqio; seqio->next = dmc->seq_io_head; seqio->prev = NULL; dmc->seq_io_head = seqio; } /* Look for and maybe skip sequential i/o. * * Since performance(SSD) >> performance(HDD) for random i/o, * but performance(SSD) ~= performance(HDD) for sequential i/o, * it may be optimal to save (presumably expensive) SSD cache space for random i/o only. * * We don't know whether a single request is part of a big sequential read/write. * So all we can do is monitor a few requests, and try to spot if they are * continuations of a recent 'flow' of i/o. After several contiguous blocks we consider * it sequential. * * You can tune the threshold with the sysctl skip_seq_thresh_kb (e.g. 64 = 64kb), * or cache all i/o (without checking whether random or sequential) with skip_seq_thresh_kb = 0. */ int skip_sequential_io(struct cache_c *dmc, struct bio *bio) { struct sequential_io *seqio; int sequential = 0; /* Saw > 1 in a row? */ int skip = 0; /* Enough sequential to hit the threshold */ /* sysctl skip sequential threshold = 0 : disable, cache all sequential and random i/o. * This is the default. */ if (dmc->sysctl_skip_seq_thresh_kb == 0) return 0; /* Is it a continuation of recent i/o? Try to find a match. */ DPRINTK("skip_sequential_io: searching for %ld", bio->bi_sector); /* search the list in LRU order so single sequential flow hits first slot */ VERIFY(spin_is_locked(&dmc->ioctl_lock)); for (seqio = dmc->seq_io_head; seqio != NULL && sequential == 0; seqio = seqio->next) { if (bio->bi_sector == seqio->most_recent_sector) { /* Reread or write same sector again. 
Ignore but move to head */ DPRINTK("skip_sequential_io: repeat"); sequential = 1; if (dmc->seq_io_head != seqio) seq_io_move_to_lruhead(dmc, seqio); } /* i/o to one block more than the previous i/o = sequential */ else if (bio->bi_sector == seqio->most_recent_sector + dmc->block_size) { DPRINTK("skip_sequential_io: sequential found"); /* Update stats. */ seqio->most_recent_sector = bio->bi_sector; seqio->sequential_count++; sequential = 1; /* And move to head, if not head already */ if (dmc->seq_io_head != seqio) seq_io_move_to_lruhead(dmc, seqio); /* Is it now sequential enough to be sure? (threshold expressed in kb) */ if (to_bytes(seqio->sequential_count * dmc->block_size) > dmc->sysctl_skip_seq_thresh_kb * 1024) { DPRINTK("skip_sequential_io: Sequential i/o detected, seq count now %lu", seqio->sequential_count); /* Sufficiently sequential */ skip = 1; } } } if (!sequential) { /* Record the start of some new i/o, maybe we'll spot it as * sequential soon. */ DPRINTK("skip_sequential_io: concluded that its random i/o"); seqio = dmc->seq_io_tail; seq_io_move_to_lruhead(dmc, seqio); DPRINTK("skip_sequential_io: fill in data"); /* Fill in data */ seqio->most_recent_sector = bio->bi_sector; seqio->sequential_count = 1; } DPRINTK("skip_sequential_io: complete."); if (skip) { if (bio_data_dir(bio) == READ) dmc->flashcache_stats.uncached_sequential_reads++; else dmc->flashcache_stats.uncached_sequential_writes++; } return skip; } /* * Add/del pids whose IOs should be non-cacheable. * We limit this number to 100 (arbitrary and sysctl'able). * We also add an expiry to each entry (defaluts at 60 sec, * arbitrary and sysctlable). * This is needed because Linux lacks an "at_exit()" hook * that modules can supply to do any cleanup on process * exit, for cases where the process dies after marking itself * non-cacheable. 
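 *
 * Hypothetical userspace sketch (command names from flashcache_ioctl.h,
 * device path illustrative, error handling trimmed) that blacklists the
 * calling process (tgid):
 *
 *	pid_t pid = getpid();
 *	int fd = open("/dev/mapper/cachedev", O_RDONLY);
 *	if (fd < 0 || ioctl(fd, FLASHCACHEADDBLACKLIST, &pid) < 0)
 *		perror("FLASHCACHEADDBLACKLIST");
 *	close(fd);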
*/ int #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) flashcache_ioctl(struct dm_target *ti, struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) #else flashcache_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) #endif { struct cache_c *dmc = (struct cache_c *) ti->private; struct block_device *bdev = dmc->disk_dev->bdev; struct file fake_file = {}; struct dentry fake_dentry = {}; pid_t pid; switch(cmd) { case FLASHCACHEADDBLACKLIST: if (copy_from_user(&pid, (pid_t *)arg, sizeof(pid_t))) return -EFAULT; flashcache_add_pid(dmc, pid, FLASHCACHE_BLACKLIST); return 0; case FLASHCACHEDELBLACKLIST: if (copy_from_user(&pid, (pid_t *)arg, sizeof(pid_t))) return -EFAULT; flashcache_del_pid(dmc, pid, FLASHCACHE_BLACKLIST); return 0; case FLASHCACHEDELALLBLACKLIST: flashcache_del_all_pids(dmc, FLASHCACHE_BLACKLIST, 0); return 0; case FLASHCACHEADDWHITELIST: if (copy_from_user(&pid, (pid_t *)arg, sizeof(pid_t))) return -EFAULT; flashcache_add_pid(dmc, pid, FLASHCACHE_WHITELIST); return 0; case FLASHCACHEDELWHITELIST: if (copy_from_user(&pid, (pid_t *)arg, sizeof(pid_t))) return -EFAULT; flashcache_del_pid(dmc, pid, FLASHCACHE_WHITELIST); return 0; case FLASHCACHEDELALLWHITELIST: flashcache_del_all_pids(dmc, FLASHCACHE_WHITELIST, 0); return 0; default: fake_file.f_mode = dmc->disk_dev->mode; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) fake_file.f_dentry = &fake_dentry; #else fake_file.f_path.dentry = &fake_dentry; #endif fake_dentry.d_inode = bdev->bd_inode; #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg); #else return __blkdev_driver_ioctl(dmc->disk_dev->bdev, dmc->disk_dev->mode, cmd, arg); #endif } } flashcache-3.1.3+git20150701/src/flashcache_ioctl.h000066400000000000000000000053611254507146700214310ustar00rootroot00000000000000/**************************************************************************** * flashcache_ioctl.h * FlashCache: Device mapper target for block-level disk caching * * Copyright 2010 Facebook, Inc. * Author: Mohan Srinivasan (mohan@fb.com) * * Based on DM-Cache: * Copyright (C) International Business Machines Corp., 2006 * Author: Ming Zhao (mingzhao@ufl.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
****************************************************************************/ #ifndef FLASHCACHE_IOCTL_H #define FLASHCACHE_IOCTL_H #include #define FLASHCACHE_IOCTL 0xfe enum { FLASHCACHEADDNCPID_CMD=200, FLASHCACHEDELNCPID_CMD, FLASHCACHEDELNCALL_CMD, FLASHCACHEADDWHITELIST_CMD, FLASHCACHEDELWHITELIST_CMD, FLASHCACHEDELWHITELISTALL_CMD, }; #define FLASHCACHEADDNCPID _IOW(FLASHCACHE_IOCTL, FLASHCACHEADDNCPID_CMD, pid_t) #define FLASHCACHEDELNCPID _IOW(FLASHCACHE_IOCTL, FLASHCACHEDELNCPID_CMD, pid_t) #define FLASHCACHEDELNCALL _IOW(FLASHCACHE_IOCTL, FLASHCACHEDELNCALL_CMD, pid_t) #define FLASHCACHEADDBLACKLIST FLASHCACHEADDNCPID #define FLASHCACHEDELBLACKLIST FLASHCACHEDELNCPID #define FLASHCACHEDELALLBLACKLIST FLASHCACHEDELNCALL #define FLASHCACHEADDWHITELIST _IOW(FLASHCACHE_IOCTL, FLASHCACHEADDWHITELIST_CMD, pid_t) #define FLASHCACHEDELWHITELIST _IOW(FLASHCACHE_IOCTL, FLASHCACHEDELWHITELIST_CMD, pid_t) #define FLASHCACHEDELALLWHITELIST _IOW(FLASHCACHE_IOCTL, FLASHCACHEDELWHITELISTALL_CMD, pid_t) #ifdef __KERNEL__ #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) int flashcache_ioctl(struct dm_target *ti, struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); #else int flashcache_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg); #endif void flashcache_pid_expiry_all_locked(struct cache_c *dmc); int flashcache_uncacheable(struct cache_c *dmc, struct bio *bio); void seq_io_remove_from_lru(struct cache_c *dmc, struct sequential_io *seqio); void seq_io_move_to_lruhead(struct cache_c *dmc, struct sequential_io *seqio); int skip_sequential_io(struct cache_c *dmc, struct bio *bio); void flashcache_del_all_pids(struct cache_c *dmc, int which_list, int force); #endif /* __KERNEL__ */ #endif flashcache-3.1.3+git20150701/src/flashcache_kcopy.c000066400000000000000000000367371254507146700214520ustar00rootroot00000000000000/**************************************************************************** * flashcache_kcopy.c * FlashCache: Device mapper target for block-level disk caching * * Copyright 2010 Facebook, Inc. * Author: Mohan Srinivasan (mohan@fb.com) * * Based on DM-Cache: * Copyright (C) International Business Machines Corp., 2006 * Author: Ming Zhao (mingzhao@ufl.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
****************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,21) #include #include #endif #include "dm.h" #include "dm-io.h" #include "dm-bio-list.h" #include "kcopyd.h" #else #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) #include "dm.h" #endif #include #include #include #include #endif #include "flashcache.h" #include "flashcache_ioctl.h" #ifndef DM_MAPIO_SUBMITTED #define DM_MAPIO_SUBMITTED 0 #endif extern struct work_struct _kcached_wq; extern atomic_t nr_cache_jobs; /* * We do the kcopy'ing ourselves from flash to disk to get better * disk write clustering by kicking off all the reads from flash * first and then doing one very large disk write. */ /* * There are some subtle bugs in this code where we leak copy jobs. * Until we fix that, disable this. * To re-enable this, * 1) Enable the flashcache_copy_data() call in flashcache_clean_set(). * 2) Enable the code in _init and _destroy below. */ #define NUM_KCOPY_JOBS 32 int flashcache_kcopy_init(struct cache_c *dmc) { #if 0 struct flashcache_copy_job *job; int i; dmc->kcopy_jobs_head = NULL; spin_lock_init(&dmc->kcopy_job_alloc_lock); /* Allocate the kcopy jobs and push them onto the list */ for (i = 0 ; i < NUM_KCOPY_JOBS ; i++) { job = kmalloc(sizeof(struct flashcache_copy_job), GFP_NOIO); if (unlikely(job == NULL)) return 1; job->pl_base = vmalloc(dmc->assoc * sizeof(struct page_list)); if (unlikely(job->pl_base == NULL)) { kfree(job); flashcache_kcopy_destroy(dmc); return 1; } job->page_base = vmalloc(dmc->assoc * sizeof(struct page *)); if (unlikely(job->page_base == NULL)) { vfree(job->pl_base); kfree(job); flashcache_kcopy_destroy(dmc); return 1; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) job->job_io_regions.cache = vmalloc(dmc->assoc * sizeof(struct io_region)); #else job->job_io_regions.cache = vmalloc(dmc->assoc * sizeof(struct dm_io_region)); #endif if (unlikely(job->job_io_regions.cache == NULL)) { vfree(job->pl_base); vfree(job->page_base); kfree(job); flashcache_kcopy_destroy(dmc); return 1; } job->job_base = vmalloc(dmc->assoc * sizeof(struct kcached_job *)); if (unlikely(job->job_base == NULL)) { vfree(job->pl_base); vfree(job->page_base); vfree(job->job_io_regions.cache); kfree(job); flashcache_kcopy_destroy(dmc); return 1; } job->next = dmc->kcopy_jobs_head; dmc->kcopy_jobs_head = job; } #else dmc->kcopy_jobs_head = NULL; #endif return 0; } void flashcache_kcopy_destroy(struct cache_c *dmc) { struct flashcache_copy_job *job, *next; for (job = dmc->kcopy_jobs_head ; job != NULL ; job = next) { next = job->next; vfree(job->pl_base); vfree(job->page_base); vfree(job->job_io_regions.cache); vfree(job->job_base); kfree(job); } } static struct flashcache_copy_job * alloc_flashcache_copy_job(struct cache_c *dmc) { unsigned long flags; struct flashcache_copy_job *job; spin_lock_irqsave(&dmc->kcopy_job_alloc_lock, flags); job = dmc->kcopy_jobs_head; if (job != NULL) dmc->kcopy_jobs_head = job->next; spin_unlock_irqrestore(&dmc->kcopy_job_alloc_lock, flags); if (job != NULL) atomic_inc(&nr_cache_jobs); return job; } /* * Important : This does NOT free the kcached jobs here. * They will get freed separately, when metadata writes complete or when * pending IOs complete. 
If you have not kicked off any of these things where * the kcached_job will get freed later, you need to free those before calling * into this ! * * In the pre-allocated copy_jobs scheme, we free the pages we allocated for * this copy, we added back the copy_job to the preallocated pool. */ static void free_flashcache_copy_job(struct cache_c *dmc, struct flashcache_copy_job *job) { unsigned long flags; int i; for (i = 0 ; i < job->nr_writes ; i++) __free_page(job->page_base[i]); spin_lock_irqsave(&dmc->kcopy_job_alloc_lock, flags); job->next = dmc->kcopy_jobs_head; dmc->kcopy_jobs_head = job; spin_unlock_irqrestore(&dmc->kcopy_job_alloc_lock, flags); atomic_dec(&nr_cache_jobs); } struct flashcache_copy_job * new_flashcache_copy_job(struct cache_c *dmc, int nr_writes, struct dbn_index_pair *writes_list) { struct flashcache_copy_job *job; int i, j; job = alloc_flashcache_copy_job(dmc); if (unlikely(job == NULL)) return NULL; job->dmc = dmc; job->nr_writes = nr_writes; job->reads_completed = 0; job->write_kickoff = 0; job->error = 0; job->pl_list_head = NULL; for (i = 0 ; i < nr_writes ; i++) { job->page_base[i] = alloc_page(GFP_NOIO); if (unlikely(job->page_base[i] == NULL)) { for (j = 0 ; j < i ; j++) __free_page(job->page_base[j]); goto nomem; } job->job_base[i] = new_kcached_job(dmc, NULL, writes_list[i].index); atomic_inc(&dmc->nr_jobs); if (unlikely(job->job_base[i] == NULL)) { for (j = 0 ; j <= i ; j++) __free_page(job->page_base[j]); for (j = 0 ; j < i ; j++) { flashcache_free_cache_job(job->job_base[i]); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); } goto nomem; } } /* * Stuff the pages into the page_list structures. * Null terminate each page_list entry, because we want to do * the individial reads first. */ for (i = 0 ; i < nr_writes ; i++) { job->pl_base[i].next = NULL; job->pl_base[i].page = job->page_base[i]; } spin_lock_init(&job->copy_job_spinlock); for (i = 0 ; i < nr_writes ; i++) { job->job_io_regions.cache[i].bdev = dmc->cache_dev->bdev; job->job_io_regions.cache[i].sector = INDEX_TO_CACHE_ADDR(dmc, writes_list[i].index); job->job_io_regions.cache[i].count = dmc->block_size; } job->job_io_regions.disk.bdev = dmc->disk_dev->bdev; job->job_io_regions.disk.sector = writes_list[0].dbn; job->job_io_regions.disk.count = dmc->block_size * nr_writes; return job; nomem: free_flashcache_copy_job(dmc, job); return NULL; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) extern struct dm_io_client *flashcache_io_client; /* Client memory pool*/ #endif static int dm_io_async_pagelist_IO(struct flashcache_copy_job *job, unsigned int num_regions, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) struct io_region *where, #else struct dm_io_region *where, #endif io_notify_fn fn, int rw, struct page_list *pl) { struct dm_io_request iorq; iorq.bi_rw = rw; iorq.mem.type = DM_IO_PAGE_LIST; iorq.mem.ptr.pl = pl; iorq.mem.offset = 0; iorq.notify.fn = fn; iorq.notify.context = (void *)job; iorq.client = flashcache_io_client; return dm_io(&iorq, num_regions, where, NULL); } void flashcache_handle_read_write_error(struct flashcache_copy_job *job) { struct kcached_job *io_error_job; struct cache_c *dmc = job->dmc; int set; struct cache_set *cache_set; int i, index; DMERR("flashcache: Disk writeback failed ! 
read/write error %lu", job->job_io_regions.disk.sector); index = CACHE_ADDR_TO_INDEX(dmc, job->job_io_regions.cache[0].sector); set = index / dmc->assoc; cache_set = &dmc->cache_sets[set]; for (i = 0 ; i < job->nr_writes ; i++) { index = CACHE_ADDR_TO_INDEX(dmc, job->job_io_regions.cache[i].sector); io_error_job = job->job_base[i]; io_error_job->action = WRITEDISK; spin_lock_irq(&cache_set->set_spin_lock); VERIFY(dmc->cache[index].cache_state & (DISKWRITEINPROG | VALID | DIRTY)); VERIFY(cache_set->clean_inprog > 0); cache_set->clean_inprog--; VERIFY(atomic_read(&dmc->clean_inprog) > 0); atomic_dec(&dmc->clean_inprog); spin_unlock_irq(&cache_set->set_spin_lock); io_error_job->error = -EIO; flashcache_do_pending(io_error_job); } free_flashcache_copy_job(dmc, job); flashcache_clean_set(dmc, set, 0); /* Kick off more cleanings */ dmc->flashcache_stats.cleanings++; } void flashcache_clean_md_write_kickoff(struct flashcache_copy_job *job) { struct kcached_job *io_complete_job; struct cache_c *dmc = job->dmc; int set; struct cache_set *cache_set; int i, index; /* If the write errored, clean up */ if (unlikely(job->error)) flashcache_handle_read_write_error(job); else { index = CACHE_ADDR_TO_INDEX(dmc, job->job_io_regions.cache[0].sector); set = index / dmc->assoc; cache_set = &dmc->cache_sets[set]; for (i = 0 ; i < job->nr_writes ; i++) { index = CACHE_ADDR_TO_INDEX(dmc, job->job_io_regions.cache[i].sector); io_complete_job = job->job_base[i]; io_complete_job->action = WRITEDISK; spin_lock_irq(&cache_set->set_spin_lock); VERIFY(dmc->cache[index].cache_state & (DISKWRITEINPROG | VALID | DIRTY)); spin_unlock_irq(&cache_set->set_spin_lock); flashcache_md_write(io_complete_job); } free_flashcache_copy_job(dmc, job); } } void flashcache_copy_data_write_callback(unsigned long error, void *context) { struct flashcache_copy_job *job = (struct flashcache_copy_job *)context; if (error) job->dmc->flashcache_errors.disk_write_errors++; job->error = error; push_cleaning_write_complete(job); schedule_work(&_kcached_wq); } void flashcache_clean_write_kickoff(struct flashcache_copy_job *job) { int i; /* * If any of the reads errored, DO NOT kick off the write at all. * Do cleanup here instead ! */ if (unlikely(job->error)) flashcache_handle_read_write_error(job); else { /* * Need to kick off the write. * First chain all of the pages in the page linked list. */ for (i = 0 ; i < job->nr_writes - 1 ; i++) job->pl_base[i].next = &job->pl_base[i + 1]; job->pl_list_head = &job->pl_base[0]; (void)dm_io_async_pagelist_IO(job, 1, &job->job_io_regions.disk, flashcache_copy_data_write_callback, WRITE, job->pl_list_head); } } /* * Handle single read completion. * When all of the reads complete, we kick off the write */ void flashcache_copy_data_read_callback(unsigned long error, void *context) { struct flashcache_copy_job *job = (struct flashcache_copy_job *)context; unsigned long flags; int do_write = 0; spin_lock_irqsave(&job->copy_job_spinlock, flags); VERIFY(job->reads_completed < job->nr_writes); job->reads_completed++; if ((job->reads_completed == job->nr_writes) && (job->write_kickoff == 0)) { do_write = 1; job->write_kickoff = 1; } /* * If any of the reads return an error, we abort the entire cleaning * operation. Stick the error in the job and let the write handle it. * We let ALL of the reads complete and then handle the error when the * last read completes. 
*/ if (error) { job->dmc->flashcache_errors.ssd_read_errors++; job->error = error; } spin_unlock_irqrestore(&job->copy_job_spinlock, flags); if (do_write) { push_cleaning_read_complete(job); schedule_work(&_kcached_wq); } } static void flashcache_verify_chain(struct cache_c *dmc, int nr_writes, struct dbn_index_pair *writes_list) { int i; for (i = 0 ; i < nr_writes - 1 ; i++) if (writes_list[i].dbn + dmc->block_size != writes_list[i+1].dbn) panic("flashcache_verify_chain: chain not contig\n"); } int flashcache_copy_data_one_chain(struct cache_c *dmc, struct cache_set *cache_set, int nr_writes, struct dbn_index_pair *writes_list) { struct flashcache_copy_job *job; int i, index; struct cacheblock *cacheblk; int device_removal = 0; flashcache_verify_chain(dmc, nr_writes, writes_list); job = new_flashcache_copy_job(dmc, nr_writes, writes_list); if (unlikely(atomic_read(&dmc->remove_in_prog) == FAST_REMOVE)) { DMERR("flashcache: Set cleaning aborted for device removal"); if (job) { /* Free the individual kcached jobs first */ for (i = 0 ; i < nr_writes ; i++) { flashcache_free_cache_job(job->job_base[i]); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); } free_flashcache_copy_job(dmc, job); } job = NULL; device_removal = 1; } if (unlikely(job == NULL)) { dmc->flashcache_errors.memory_alloc_errors++; spin_lock_irq(&cache_set->set_spin_lock); for (i = 0 ; i < nr_writes ; i++) { index = writes_list[i].index; cacheblk = &dmc->cache[index]; flashcache_free_pending_jobs(dmc, cacheblk, -EIO); cacheblk->cache_state &= ~(BLOCK_IO_INPROG); } spin_unlock_irq(&cache_set->set_spin_lock); if (device_removal == 0) DMERR("flashcache: Dirty Writeback (for sync) failed ! Can't allocate memory"); return 1; } /* need to kick off all the reads */ for (i = 0 ; i < nr_writes ; i++) { index = writes_list[i].index; cacheblk = &dmc->cache[index]; spin_lock_irq(&cache_set->set_spin_lock); VERIFY((cacheblk->cache_state & BLOCK_IO_INPROG) == DISKWRITEINPROG); VERIFY(cacheblk->cache_state & DIRTY); cache_set->clean_inprog++; atomic_inc(&dmc->clean_inprog); spin_unlock_irq(&cache_set->set_spin_lock); dmc->flashcache_stats.ssd_reads++; dmc->flashcache_stats.disk_writes++; /* Kick off DM Read */ dm_io_async_pagelist_IO(job, 1, &job->job_io_regions.cache[i], flashcache_copy_data_read_callback, READ, &job->pl_base[i]); /* XXX - Should we do something with error DM returns ? 
* We don't check for DM errors elsewhere */ } return 0; } static void flashcache_verify_sorted(struct cache_c *dmc, int nr_writes, struct dbn_index_pair *writes_list) { int i; for (i = 0 ; i < nr_writes - 1 ; i++) if (writes_list[i].dbn >= writes_list[i+1].dbn) panic("flashcache_verify_sorted: writes_list not sorted\n"); } void flashcache_copy_data(struct cache_c *dmc, struct cache_set *cache_set, int nr_writes, struct dbn_index_pair *writes_list) { int i, start_index; flashcache_verify_sorted(dmc, nr_writes, writes_list); start_index = 0; while (start_index < nr_writes) { i = start_index; while ((i < (nr_writes - 1)) && (writes_list[i+1].dbn == writes_list[i].dbn + dmc->block_size)) i++; /* We don't check for an error return from this call * because cleanup happens within copy_data_one_chain */ (void)flashcache_copy_data_one_chain(dmc, cache_set, (i - start_index) + 1, &writes_list[start_index]); /* Kick off cleanings for next chain */ start_index = i + 1; } } flashcache-3.1.3+git20150701/src/flashcache_main.c000066400000000000000000002372111254507146700212370ustar00rootroot00000000000000/**************************************************************************** * flashcache_main.c * FlashCache: Device mapper target for block-level disk caching * * Copyright 2010 Facebook, Inc. * Author: Mohan Srinivasan (mohan@fb.com) * * Based on DM-Cache: * Copyright (C) International Business Machines Corp., 2006 * Author: Ming Zhao (mingzhao@ufl.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ #include <asm/atomic.h> #include <linux/module.h> #include <linux/init.h> #include <linux/list.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/pagemap.h> #include <linux/random.h> #include <linux/hardirq.h> #include <linux/sysctl.h> #include <linux/version.h> #include <linux/pid.h> #include <linux/jhash.h> #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,21) #include <linux/device-mapper.h> #include <linux/bio.h> #endif #include "dm.h" #include "dm-io.h" #include "dm-bio-list.h" #include "kcopyd.h" #else #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) #include "dm.h" #endif #include <linux/device-mapper.h> #include <linux/bio.h> #include <linux/dm-kcopyd.h> #endif #include "flashcache.h" #include "flashcache_ioctl.h" #ifndef DM_MAPIO_SUBMITTED #define DM_MAPIO_SUBMITTED 0 #endif /* * TODO List : * 1) Management of non cache pids : Needs improvement. Remove registration * on process exits (with a pseudo filesystem'ish approach perhaps) ? * 2) Breaking up the cache spinlock : Right now contention on the spinlock * is not a problem. Might need change in future. * 3) Use the standard linked list manipulation macros instead of rolling our own. * 4) Fix a security hole : A malicious process with 'ro' access to a file can * potentially corrupt file data. This can be fixed by copying the data on a * cache read miss.
*/ #define FLASHCACHE_SW_VERSION "flashcache-3.1.1" char *flashcache_sw_version = FLASHCACHE_SW_VERSION; static void flashcache_read_miss(struct cache_c *dmc, struct bio* bio, int index); static void flashcache_write(struct cache_c *dmc, struct bio* bio); static int flashcache_inval_blocks(struct cache_c *dmc, struct bio *bio); static void flashcache_dirty_writeback(struct cache_c *dmc, int index); void flashcache_sync_blocks(struct cache_c *dmc); static void flashcache_start_uncached_io(struct cache_c *dmc, struct bio *bio); static void flashcache_setlocks_multiget(struct cache_c *dmc, struct bio *bio); static void flashcache_setlocks_multidrop(struct cache_c *dmc, struct bio *bio); extern struct work_struct _kcached_wq; extern u_int64_t size_hist[]; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) extern struct dm_kcopyd_client *flashcache_kcp_client; /* Kcopyd client for writing back data */ #else extern struct kcopyd_client *flashcache_kcp_client; /* Kcopyd client for writing back data */ #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) extern struct dm_io_client *flashcache_io_client; /* Client memory pool */ #endif int dm_io_async_bvec_pl(unsigned int num_regions, #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) struct dm_io_region *where, #else struct io_region *where, #endif int rw, struct page_list *pl, io_notify_fn fn, void *context) { struct dm_io_request iorq; iorq.bi_rw = rw; iorq.mem.type = DM_IO_PAGE_LIST; iorq.mem.ptr.pl = pl; iorq.mem.offset = 0; iorq.notify.fn = fn; iorq.notify.context = context; iorq.client = flashcache_io_client; return dm_io(&iorq, num_regions, where, NULL); } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) int dm_io_async_bvec(unsigned int num_regions, #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) struct dm_io_region *where, #else struct io_region *where, #endif int rw, struct bio *bio, io_notify_fn fn, void *context) { struct dm_io_request iorq; iorq.bi_rw = rw; #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) iorq.mem.type = DM_IO_BIO; iorq.mem.ptr.bio = bio; #else iorq.mem.type = DM_IO_BVEC; iorq.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx; #endif iorq.notify.fn = fn; iorq.notify.context = context; iorq.client = flashcache_io_client; return dm_io(&iorq, num_regions, where, NULL); } #endif /* * A simple 2-hand clock-like algorithm is used to identify dirty blocks * that lie fallow in the cache and thus are candidates for cleaning. * Note that we could have such fallow blocks in sets where the number of * dirty blocks is under the configured threshold. * The hands are spaced fallow_delay seconds apart (one sweep runs every * fallow_delay seconds). The interval is configurable via a sysctl. * Blocks are moved to DIRTY_FALLOW_1; if they are then found to be in DIRTY_FALLOW_1 * for fallow_delay seconds or more, they are moved to DIRTY_FALLOW_1 | DIRTY_FALLOW_2, * at which point they are eligible for cleaning. Of course any intervening use * of the block within the interval turns off these 2 bits. * * Cleaning of these blocks happens from the flashcache_clean_set() function.
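* * A condensed sketch of the per-block state transitions, assuming no intervening IO (illustrative only; the real logic is in flashcache_detect_fallow() and flashcache_clear_fallow() below) : * sweep N : DIRTY, no IO in prog -> DIRTY | DIRTY_FALLOW_1 * sweep N+1 : -> DIRTY | DIRTY_FALLOW_1 | DIRTY_FALLOW_2 (now a cleaning candidate) * any use of the block : both DIRTY_FALLOW bits cleared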
*/ void flashcache_detect_fallow(struct cache_c *dmc, int index) { struct cacheblock *cacheblk = &dmc->cache[index]; if (dmc->cache_mode != FLASHCACHE_WRITE_BACK) return; if ((cacheblk->cache_state & DIRTY) && ((cacheblk->cache_state & BLOCK_IO_INPROG) == 0)) { if ((cacheblk->cache_state & DIRTY_FALLOW_1) == 0) cacheblk->cache_state |= DIRTY_FALLOW_1; else if ((cacheblk->cache_state & DIRTY_FALLOW_2) == 0) { dmc->cache_sets[index / dmc->assoc].dirty_fallow++; cacheblk->cache_state |= DIRTY_FALLOW_2; } } } void flashcache_clear_fallow(struct cache_c *dmc, int index) { struct cacheblock *cacheblk = &dmc->cache[index]; int set = index / dmc->assoc; if (dmc->cache_mode != FLASHCACHE_WRITE_BACK) return; if (cacheblk->cache_state & FALLOW_DOCLEAN) { if (cacheblk->cache_state & DIRTY_FALLOW_2) { VERIFY(dmc->cache_sets[set].dirty_fallow > 0); dmc->cache_sets[set].dirty_fallow--; } cacheblk->cache_state &= ~FALLOW_DOCLEAN; } } void flashcache_io_callback(unsigned long error, void *context) { struct kcached_job *job = (struct kcached_job *) context; struct cache_c *dmc = job->dmc; struct bio *bio; unsigned long flags; int index = job->index; struct cacheblock *cacheblk = &dmc->cache[index]; unsigned long disk_error = 0; struct cache_set *cache_set = &dmc->cache_sets[index / dmc->assoc]; VERIFY(index != -1); bio = job->bio; VERIFY(bio != NULL); if (unlikely(error)) { error = -EIO; DMERR("flashcache_io_callback: io error %ld block %lu action %d", error, job->job_io_regions.disk.sector, job->action); if (!dmc->bypass_cache && dmc->cache_mode != FLASHCACHE_WRITE_BACK) { DMERR("flashcache_io_callback: switching %s to BYPASS mode", dmc->cache_devname); dmc->bypass_cache = 1; } } job->error = error; switch (job->action) { case READDISK: DPRINTK("flashcache_io_callback: READDISK %d", index); spin_lock_irqsave(&cache_set->set_spin_lock, flags); if (unlikely(dmc->sysctl_error_inject & READDISK_ERROR)) { job->error = error = -EIO; dmc->sysctl_error_inject &= ~READDISK_ERROR; } VERIFY(cacheblk->cache_state & DISKREADINPROG); spin_unlock_irqrestore(&cache_set->set_spin_lock, flags); if (likely(error == 0)) { /* Kick off the write to the cache */ job->action = READFILL; push_io(job); schedule_work(&_kcached_wq); return; } else { disk_error = -EIO; dmc->flashcache_errors.disk_read_errors++; } break; case READCACHE: DPRINTK("flashcache_io_callback: READCACHE %d", index); spin_lock_irqsave(&cache_set->set_spin_lock, flags); if (unlikely(dmc->sysctl_error_inject & READCACHE_ERROR)) { job->error = error = -EIO; dmc->sysctl_error_inject &= ~READCACHE_ERROR; } VERIFY(cacheblk->cache_state & CACHEREADINPROG); spin_unlock_irqrestore(&cache_set->set_spin_lock, flags); if (unlikely(error)) dmc->flashcache_errors.ssd_read_errors++; #ifdef FLASHCACHE_DO_CHECKSUMS if (likely(error == 0)) { if (flashcache_validate_checksum(job)) { DMERR("flashcache_io_callback: Checksum mismatch at disk offset %lu", job->job_io_regions.disk.sector); error = -EIO; } } #endif break; case READFILL: DPRINTK("flashcache_io_callback: READFILL %d", index); spin_lock_irqsave(&cache_set->set_spin_lock, flags); if (unlikely(dmc->sysctl_error_inject & READFILL_ERROR)) { job->error = error = -EIO; dmc->sysctl_error_inject &= ~READFILL_ERROR; } if (unlikely(error)) dmc->flashcache_errors.ssd_write_errors++; VERIFY(cacheblk->cache_state & DISKREADINPROG); spin_unlock_irqrestore(&cache_set->set_spin_lock, flags); break; case WRITECACHE: DPRINTK("flashcache_io_callback: WRITECACHE %d", index); if (unlikely(dmc->sysctl_error_inject & WRITECACHE_ERROR)) { 
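/* Error injection via sysctl : fake an ssd write failure once, then clear the one-shot flag */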
job->error = error = -EIO; dmc->sysctl_error_inject &= ~WRITECACHE_ERROR; } spin_lock_irqsave(&cache_set->set_spin_lock, flags); VERIFY(cacheblk->cache_state & CACHEWRITEINPROG); spin_unlock_irqrestore(&cache_set->set_spin_lock, flags); if (likely(error == 0)) { if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { #ifdef FLASHCACHE_DO_CHECKSUMS dmc->flashcache_stats.checksum_store++; flashcache_store_checksum(job); /* * We need to update the metadata on a DIRTY->DIRTY as well * since we save the checksums. */ flashcache_md_write(job); return; #else /* Only do cache metadata update on a non-DIRTY->DIRTY transition */ if ((cacheblk->cache_state & DIRTY) == 0) { flashcache_md_write(job); return; } #endif } else { /* cache_mode == WRITE_THROUGH */ /* Writes to both disk and cache completed */ VERIFY(dmc->cache_mode == FLASHCACHE_WRITE_THROUGH); #ifdef FLASHCACHE_DO_CHECKSUMS flashcache_store_checksum(job); job->dmc->flashcache_stats.checksum_store++; #endif } } else { dmc->flashcache_errors.ssd_write_errors++; if (dmc->cache_mode == FLASHCACHE_WRITE_THROUGH) { /* * We don't know if the IO failed because of a ssd write * error or a disk write error. Bump up both. * XXX - TO DO. We could check the error bits and allow * the IO to succeed as long as the disk write succeeded, * and invalidate the cache block. */ disk_error = -EIO; dmc->flashcache_errors.disk_write_errors++; } } break; } /* * If we get an error in write through or write around modes, * we try the disk directly, after invalidating the cached block. * See flashcache_do_pending_error(). * XXX - We can do the same for writeback as well. But that is more * work. (a) we cannot fall back to disk when a ssd read of a dirty * cacheblock fails (b) we'd need to handle ssd metadata write * failures as well and fall back to disk in those cases as well. * * We track disk errors separately. If we get a disk error (in * writethru or writearound modes) end the IO right here. */ if (likely(error == 0) || (dmc->cache_mode == FLASHCACHE_WRITE_BACK) || disk_error != 0) { flashcache_bio_endio(bio, error, dmc, &job->io_start_time); job->bio = NULL; } /* * The INPROG flag is still set. We cannot turn that off until all the pending requests * are processed. We need to loop the pending requests back to a workqueue. We have the job, * add it to the pending req queue. */ spin_lock_irqsave(&cache_set->set_spin_lock, flags); if (unlikely(error || cacheblk->nr_queued > 0)) { spin_unlock_irqrestore(&cache_set->set_spin_lock, flags); push_pending(job); schedule_work(&_kcached_wq); } else { cacheblk->cache_state &= ~BLOCK_IO_INPROG; spin_unlock_irqrestore(&cache_set->set_spin_lock, flags); flashcache_free_cache_job(job); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); } } void flashcache_free_pending_jobs(struct cache_c *dmc, struct cacheblock *cacheblk, int error) { struct pending_job *pending_job, *freelist = NULL; int index = cacheblk - &dmc->cache[0]; struct cache_set *cache_set = &dmc->cache_sets[index / dmc->assoc]; VERIFY(spin_is_locked(&cache_set->set_spin_lock)); freelist = flashcache_deq_pending(dmc, index); while (freelist != NULL) { pending_job = freelist; freelist = pending_job->next; VERIFY(cacheblk->nr_queued > 0); cacheblk->nr_queued--; flashcache_bio_endio(pending_job->bio, error, dmc, NULL); flashcache_free_pending_job(pending_job); } VERIFY(cacheblk->nr_queued == 0); } /* * Common error handling for everything. * 1) If the block isn't dirty, invalidate it. * 2) De-link all pending IOs that totally or partly overlap this block.
* 3) If it was an SSD error (bio != NULL), issue the invalidated block IO and other de-linked pending IOs uncached to disk. * 4) Free the job. */ static void flashcache_do_pending_error(struct kcached_job *job) { struct cache_c *dmc = job->dmc; struct cacheblock *cacheblk = &dmc->cache[job->index]; struct bio *bio = job->bio; int error = job->error; struct pending_job *pjob_list = NULL, *pjob = NULL; struct cache_set *cache_set = &dmc->cache_sets[job->index / dmc->assoc]; if (!dmc->bypass_cache) { DMERR("flashcache_do_pending_error: error %d block %lu action %d", job->error, job->job_io_regions.disk.sector, job->action); } spin_lock_irq(&cache_set->set_spin_lock); VERIFY(cacheblk->cache_state & VALID); /* Invalidate block if possible */ if ((cacheblk->cache_state & DIRTY) == 0) { atomic_dec(&dmc->cached_blocks); dmc->flashcache_stats.pending_inval++; flashcache_hash_remove(dmc, job->index); cacheblk->cache_state &= ~VALID; cacheblk->cache_state |= INVALID; } else VERIFY(dmc->cache_mode == FLASHCACHE_WRITE_BACK); cacheblk->cache_state &= ~(BLOCK_IO_INPROG); if ((cacheblk->cache_state & DIRTY) == 0) flashcache_invalid_insert(dmc, job->index); /* * In case of an error in writethrough or writearound modes, if there * are pending jobs, de-link them from the cacheblock so we can issue disk * IOs below. */ if (bio != NULL) { VERIFY(dmc->cache_mode != FLASHCACHE_WRITE_BACK); pjob_list = flashcache_deq_pending(dmc, cacheblk - &dmc->cache[0]); for (pjob = pjob_list ; pjob != NULL ; pjob = pjob->next) { VERIFY(cacheblk->nr_queued > 0); cacheblk->nr_queued--; } VERIFY(cacheblk->nr_queued == 0); } else flashcache_free_pending_jobs(dmc, cacheblk, job->error); spin_unlock_irq(&cache_set->set_spin_lock); if (bio != NULL) { /* * Cache (read/write) error in write through or write around * mode. Issue the IO directly to disk. We've already invalidated * the cache block above. */ if (!dmc->bypass_cache) /* suppress massive console output */ DMERR("flashcache_do_pending_error: Re-launching errored IO " "to disk, after io error %d block %lu", error, bio->bi_sector); flashcache_start_uncached_io(dmc, bio); while (pjob_list != NULL) { pjob = pjob_list; pjob_list = pjob->next; flashcache_start_uncached_io(dmc, pjob->bio); flashcache_free_pending_job(pjob); } } flashcache_free_cache_job(job); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); } static void flashcache_do_pending_noerror(struct kcached_job *job) { struct cache_c *dmc = job->dmc; int index = job->index; struct pending_job *pending_job, *freelist; int queued; struct cacheblock *cacheblk = &dmc->cache[index]; struct cache_set *cache_set = &dmc->cache_sets[index / dmc->assoc]; spin_lock_irq(&cache_set->set_spin_lock); if (cacheblk->cache_state & DIRTY) { VERIFY(dmc->cache_mode == FLASHCACHE_WRITE_BACK); cacheblk->cache_state &= ~(BLOCK_IO_INPROG); cacheblk->cache_state |= DISKWRITEINPROG; flashcache_clear_fallow(dmc, index); spin_unlock_irq(&cache_set->set_spin_lock); flashcache_dirty_writeback(dmc, index); goto out; } DPRINTK("flashcache_do_pending: Index %d %lx", index, cacheblk->cache_state); VERIFY(cacheblk->cache_state & VALID); atomic_dec(&dmc->cached_blocks); dmc->flashcache_stats.pending_inval++; flashcache_hash_remove(dmc, index); cacheblk->cache_state &= ~VALID; cacheblk->cache_state |= INVALID; /* * The block is in limbo right now. It is not VALID, but the IO_INPROG * bits are set, so it cannot be reused. So it is safe to drop the * cache set lock here.
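* (The slot is also not yet back on the per-set invalid list; the flashcache_invalid_insert() below only runs after the pending queue has been drained.)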
*/ spin_unlock_irq(&cache_set->set_spin_lock); freelist = flashcache_deq_pending(dmc, index); while (freelist != NULL) { pending_job = freelist; freelist = pending_job->next; flashcache_setlocks_multiget(dmc, pending_job->bio); VERIFY(!(cacheblk->cache_state & DIRTY)); VERIFY(cacheblk->nr_queued > 0); cacheblk->nr_queued--; if (pending_job->action == INVALIDATE) { DPRINTK("flashcache_do_pending: INVALIDATE %llu", pending_job->bio->bi_sector); VERIFY(pending_job->bio != NULL); queued = flashcache_inval_blocks(dmc, pending_job->bio); if (queued) { flashcache_setlocks_multidrop(dmc, pending_job->bio); if (unlikely(queued < 0)) { /* * Memory allocation failure inside inval_blocks. * Fail this io. */ flashcache_bio_endio(pending_job->bio, -EIO, dmc, NULL); } flashcache_free_pending_job(pending_job); continue; } } flashcache_setlocks_multidrop(dmc, pending_job->bio); DPRINTK("flashcache_do_pending: Sending down IO %llu", pending_job->bio->bi_sector); /* Start uncached IO */ flashcache_start_uncached_io(dmc, pending_job->bio); flashcache_free_pending_job(pending_job); } spin_lock_irq(&cache_set->set_spin_lock); VERIFY(cacheblk->nr_queued == 0); cacheblk->cache_state &= ~(BLOCK_IO_INPROG); flashcache_invalid_insert(dmc, index); spin_unlock_irq(&cache_set->set_spin_lock); out: flashcache_free_cache_job(job); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); } void flashcache_do_pending(struct kcached_job *job) { if (job->error) flashcache_do_pending_error(job); else flashcache_do_pending_noerror(job); } void flashcache_do_io(struct kcached_job *job) { struct bio *bio = job->bio; int r = 0; VERIFY(job->action == READFILL); #ifdef FLASHCACHE_DO_CHECKSUMS flashcache_store_checksum(job); job->dmc->flashcache_stats.checksum_store++; #endif /* Write to cache device */ job->dmc->flashcache_stats.ssd_writes++; r = dm_io_async_bvec(1, &job->job_io_regions.cache, WRITE, bio, flashcache_io_callback, job); VERIFY(r == 0); /* In our case, dm_io_async_bvec() must always return 0 */ } /* * Map a block from the source device to a block in the cache device. */ unsigned long hash_block(struct cache_c *dmc, sector_t dbn) { unsigned long set_number, value; int num_cache_sets = dmc->size >> dmc->assoc_shift; /* * Starting in Flashcache SSD Version 3 : * We map a sequential cluster of disk_assoc blocks onto a given set. * But each disk_assoc cluster can be randomly placed in any set. * However, if we are running on an older on-ssd cache, we preserve the * old behavior. */ if (dmc->on_ssd_version < 3 || dmc->disk_assoc == 0) { value = (unsigned long) (dbn >> (dmc->block_shift + dmc->assoc_shift)); } else { /* Shift out the low disk_assoc bits */ value = (unsigned long) (dbn >> dmc->disk_assoc_shift); /* Then place it in a random set */ value = jhash_1word(value, 0xbeef); } set_number = value % num_cache_sets; DPRINTK("Hash: %llu(%lu)->%lu", dbn, value, set_number); return set_number; } static void find_valid_dbn(struct cache_c *dmc, sector_t dbn, int start_index, int *index) { *index = flashcache_hash_lookup(dmc, start_index / dmc->assoc, dbn); if (*index == -1) return; if (dmc->sysctl_reclaim_policy == FLASHCACHE_LRU && ((dmc->cache[*index].cache_state & BLOCK_IO_INPROG) == 0)) flashcache_lru_accessed(dmc, *index); /* * If the block was DIRTY and earmarked for cleaning because it was old, make * the block young again.
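* (flashcache_clear_fallow() below clears the fallow-tracking bits and, if DIRTY_FALLOW_2 was set, decrements the set's dirty_fallow count.)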
*/ flashcache_clear_fallow(dmc, *index); } static int find_invalid_dbn(struct cache_c *dmc, int set) { int index = flashcache_invalid_get(dmc, set); if (index != -1) { if (dmc->sysctl_reclaim_policy == FLASHCACHE_LRU) flashcache_lru_accessed(dmc, index); VERIFY((dmc->cache[index].cache_state & FALLOW_DOCLEAN) == 0); } return index; } /* Search for a slot that we can reclaim */ static void find_reclaim_dbn(struct cache_c *dmc, int start_index, int *index) { if (dmc->sysctl_reclaim_policy == FLASHCACHE_FIFO) flashcache_reclaim_fifo_get_old_block(dmc, start_index, index); else /* flashcache_reclaim_policy == FLASHCACHE_LRU */ flashcache_reclaim_lru_get_old_block(dmc, start_index, index); } /* * dbn is the starting sector, io_size is the number of sectors. */ static int flashcache_lookup(struct cache_c *dmc, struct bio *bio, int *index) { sector_t dbn = bio->bi_sector; #if DMC_DEBUG int io_size = to_sector(bio->bi_size); #endif unsigned long set_number = hash_block(dmc, dbn); int invalid, oldest_clean = -1; int start_index; start_index = dmc->assoc * set_number; DPRINTK("Cache lookup : dbn %llu(%lu), set = %d", dbn, io_size, set_number); find_valid_dbn(dmc, dbn, start_index, index); if (*index >= 0) { DPRINTK("Cache lookup HIT: Block %llu(%lu): VALID index %d", dbn, io_size, *index); /* We found the exact range of blocks we are looking for */ return VALID; } invalid = find_invalid_dbn(dmc, set_number); if (invalid == -1) { /* We didn't find an invalid entry, search for oldest valid entry */ find_reclaim_dbn(dmc, start_index, &oldest_clean); } /* * Cache miss : * We can't choose an entry marked INPROG, but choose the oldest * INVALID or the oldest VALID entry. */ *index = start_index + dmc->assoc; if (invalid != -1) { DPRINTK("Cache lookup MISS (INVALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d", dbn, io_size, set_number, invalid, start_index); *index = invalid; } else if (oldest_clean != -1) { DPRINTK("Cache lookup MISS (VALID): dbn %llu(%lu), set = %d, index = %d, start_index = %d", dbn, io_size, set_number, oldest_clean, start_index); *index = oldest_clean; } else { DPRINTK_LITE("Cache read lookup MISS (NOROOM): dbn %llu(%lu), set = %d", dbn, io_size, set_number); } if (*index < (start_index + dmc->assoc)) return INVALID; else { dmc->flashcache_stats.noroom++; return -1; } } /* * Cache Metadata Update functions */ void flashcache_md_write_callback(unsigned long error, void *context) { struct kcached_job *job = (struct kcached_job *)context; if (unlikely(error)) job->error = -EIO; else job->error = 0; push_md_complete(job); schedule_work(&_kcached_wq); } static int flashcache_alloc_md_sector(struct kcached_job *job) { struct page *page = NULL; struct cache_c *dmc = job->dmc; unsigned long addr = 0; if (likely((dmc->sysctl_error_inject & MD_ALLOC_SECTOR_ERROR) == 0)) { /* Get physically consecutive pages */ addr = __get_free_pages(GFP_NOIO, get_order(MD_BLOCK_BYTES(job->dmc))); if (addr) page = virt_to_page(addr); } else dmc->sysctl_error_inject &= ~MD_ALLOC_SECTOR_ERROR; if (unlikely(page == NULL)) { job->dmc->flashcache_errors.memory_alloc_errors++; return -ENOMEM; } else { job->pl_base[0].page = page; job->pl_base[0].next = NULL; job->md_block = (struct flash_cacheblock *)addr; return 0; } } static void flashcache_free_md_sector(struct kcached_job *job) { if (job->pl_base[0].page != NULL) __free_pages(job->pl_base[0].page, get_order(MD_BLOCK_BYTES(job->dmc))); job->pl_base[0].page = NULL; } void flashcache_md_write_kickoff(struct kcached_job *job) { struct cache_c *dmc = 
job->dmc; struct flash_cacheblock *md_block; int md_block_ix; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) struct io_region where; #else struct dm_io_region where; #endif int i; struct cache_md_block_head *md_block_head; struct kcached_job *orig_job = job; struct cache_set *cache_set = &dmc->cache_sets[job->index / dmc->assoc]; if (flashcache_alloc_md_sector(job)) { DMERR("flashcache: %d: Cache metadata write failed, cannot alloc page ! block %lu", job->action, job->job_io_regions.disk.sector); flashcache_md_write_callback(-EIO, job); return; } /* * Transfer whatever is on the pending queue to the md_io_inprog queue. */ md_block_head = &dmc->md_blocks_buf[INDEX_TO_MD_BLOCK(dmc, job->index)]; spin_lock_irq(&cache_set->set_spin_lock); spin_lock(&md_block_head->md_block_lock); md_block_head->md_io_inprog = md_block_head->queued_updates; md_block_head->queued_updates = NULL; md_block = job->md_block; md_block_ix = INDEX_TO_MD_BLOCK(dmc, job->index) * MD_SLOTS_PER_BLOCK(dmc); /* First copy out the entire md block */ for (i = 0 ; i < MD_SLOTS_PER_BLOCK(dmc) && md_block_ix < dmc->size ; i++, md_block_ix++) { md_block[i].dbn = dmc->cache[md_block_ix].dbn; #ifdef FLASHCACHE_DO_CHECKSUMS md_block[i].checksum = dmc->cache[md_block_ix].checksum; #endif md_block[i].cache_state = dmc->cache[md_block_ix].cache_state & (VALID | INVALID | DIRTY); } /* Then set/clear the DIRTY bit for the "current" index */ if (job->action == WRITECACHE) { /* DIRTY the cache block */ md_block[INDEX_TO_MD_BLOCK_OFFSET(dmc, job->index)].cache_state = (VALID | DIRTY); } else { /* job->action == WRITEDISK* */ /* un-DIRTY the cache block */ md_block[INDEX_TO_MD_BLOCK_OFFSET(dmc, job->index)].cache_state = VALID; } for (job = md_block_head->md_io_inprog ; job != NULL ; job = job->next) { dmc->flashcache_stats.md_write_batch++; if (job->action == WRITECACHE) { /* DIRTY the cache block */ md_block[INDEX_TO_MD_BLOCK_OFFSET(dmc, job->index)].cache_state = (VALID | DIRTY); } else { /* job->action == WRITEDISK* */ /* un-DIRTY the cache block */ md_block[INDEX_TO_MD_BLOCK_OFFSET(dmc, job->index)].cache_state = VALID; } } spin_unlock(&md_block_head->md_block_lock); spin_unlock_irq(&cache_set->set_spin_lock); where.bdev = dmc->cache_dev->bdev; where.count = MD_SECTORS_PER_BLOCK(dmc); where.sector = (1 + INDEX_TO_MD_BLOCK(dmc, orig_job->index)) * MD_SECTORS_PER_BLOCK(dmc); dmc->flashcache_stats.ssd_writes++; dmc->flashcache_stats.md_ssd_writes++; dm_io_async_bvec_pl(1, &where, WRITE, &orig_job->pl_base[0], flashcache_md_write_callback, orig_job); } void flashcache_md_write_done(struct kcached_job *job) { struct cache_c *dmc = job->dmc; struct cache_md_block_head *md_block_head; int index, orig_index = job->index; struct kcached_job *job_list; int error = job->error; struct kcached_job *next; struct cacheblock *cacheblk; int set; struct cache_set *cache_set; VERIFY(!in_interrupt()); VERIFY(job->action == WRITEDISK || job->action == WRITECACHE || job->action == WRITEDISK_SYNC); flashcache_free_md_sector(job); job->md_block = NULL; md_block_head = &dmc->md_blocks_buf[INDEX_TO_MD_BLOCK(dmc, job->index)]; job_list = job; spin_lock_irq(&md_block_head->md_block_lock); job->next = md_block_head->md_io_inprog; md_block_head->md_io_inprog = NULL; spin_unlock_irq(&md_block_head->md_block_lock); for (job = job_list ; job != NULL ; job = next) { next = job->next; job->error = error; index = job->index; set = index / dmc->assoc; cache_set = &dmc->cache_sets[set]; cacheblk = &dmc->cache[index]; spin_lock_irq(&cache_set->set_spin_lock); if (job->action 
== WRITECACHE) { if (unlikely(dmc->sysctl_error_inject & WRITECACHE_MD_ERROR)) { job->error = -EIO; dmc->sysctl_error_inject &= ~WRITECACHE_MD_ERROR; } if (likely(job->error == 0)) { if ((cacheblk->cache_state & DIRTY) == 0) { cache_set->nr_dirty++; atomic_inc(&dmc->nr_dirty); } dmc->flashcache_stats.md_write_dirty++; cacheblk->cache_state |= DIRTY; } else dmc->flashcache_errors.ssd_write_errors++; flashcache_bio_endio(job->bio, job->error, dmc, &job->io_start_time); if (job->error || cacheblk->nr_queued > 0) { if (job->error) { DMERR("flashcache: WRITE: Cache metadata write failed ! error %d block %lu", job->error, cacheblk->dbn); } spin_unlock_irq(&cache_set->set_spin_lock); flashcache_do_pending(job); } else { cacheblk->cache_state &= ~BLOCK_IO_INPROG; spin_unlock_irq(&cache_set->set_spin_lock); flashcache_free_cache_job(job); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); } } else { int action = job->action; if (unlikely(dmc->sysctl_error_inject & WRITEDISK_MD_ERROR)) { job->error = -EIO; dmc->sysctl_error_inject &= ~WRITEDISK_MD_ERROR; } /* * If we have an error on a WRITEDISK*, no choice but to preserve the * dirty block in cache. Fail any IOs for this block that occurred while * the block was being cleaned. */ if (likely(job->error == 0)) { dmc->flashcache_stats.md_write_clean++; cacheblk->cache_state &= ~DIRTY; VERIFY(cache_set->nr_dirty > 0); VERIFY(atomic_read(&dmc->nr_dirty) > 0); cache_set->nr_dirty--; atomic_dec(&dmc->nr_dirty); } else dmc->flashcache_errors.ssd_write_errors++; VERIFY(cache_set->clean_inprog > 0); VERIFY(atomic_read(&dmc->clean_inprog) > 0); cache_set->clean_inprog--; atomic_dec(&dmc->clean_inprog); if (job->error || cacheblk->nr_queued > 0) { if (job->error) { DMERR("flashcache: CLEAN: Cache metadata write failed ! error %d block %lu", job->error, cacheblk->dbn); } spin_unlock_irq(&cache_set->set_spin_lock); flashcache_do_pending(job); } else { cacheblk->cache_state &= ~BLOCK_IO_INPROG; spin_unlock_irq(&cache_set->set_spin_lock); flashcache_free_cache_job(job); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); } /* Kick off more cleanings */ if (action == WRITEDISK) flashcache_clean_set(dmc, set, 0); else flashcache_sync_blocks(dmc); dmc->flashcache_stats.cleanings++; if (action == WRITEDISK_SYNC) flashcache_update_sync_progress(dmc); } } cache_set = &dmc->cache_sets[orig_index / dmc->assoc]; spin_lock_irq(&cache_set->set_spin_lock); spin_lock(&md_block_head->md_block_lock); if (md_block_head->queued_updates != NULL) { /* peel off the first job from the pending queue and kick that off */ job = md_block_head->queued_updates; md_block_head->queued_updates = job->next; spin_unlock(&md_block_head->md_block_lock); job->next = NULL; spin_unlock_irq(&cache_set->set_spin_lock); VERIFY(job->action == WRITEDISK || job->action == WRITECACHE || job->action == WRITEDISK_SYNC); flashcache_md_write_kickoff(job); } else { md_block_head->nr_in_prog = 0; spin_unlock(&md_block_head->md_block_lock); spin_unlock_irq(&cache_set->set_spin_lock); } } /* * Kick off a cache metadata update (called from workqueue). * Cache metadata update IOs to a given metadata sector are serialized using the * nr_in_prog bit in the md sector bufhead. * If a metadata IO is already in progress, we queue up incoming metadata updates * on the pending_jobs list of the md sector bufhead. 
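* Example : while the md write for slot A is in flight, WRITECACHE/WRITEDISK updates for slots B and C that map to the same md block queue up here, and are later folded into a single ssd write of that block.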
When kicking off an IO, we * cluster all these pending updates and do all of them as 1 flash write (that * logic is in md_write_kickoff), where it switches out the entire pending_jobs * list and does all of those updates as 1 ssd write. */ void flashcache_md_write(struct kcached_job *job) { struct cache_c *dmc = job->dmc; struct cache_md_block_head *md_block_head; unsigned long flags; VERIFY(job->action == WRITEDISK || job->action == WRITECACHE || job->action == WRITEDISK_SYNC); md_block_head = &dmc->md_blocks_buf[INDEX_TO_MD_BLOCK(dmc, job->index)]; spin_lock_irqsave(&md_block_head->md_block_lock, flags); /* If a write is in progress for this metadata sector, queue this update up */ if (md_block_head->nr_in_prog != 0) { struct kcached_job **nodepp; /* A MD update is already in progress, queue this one up for later */ nodepp = &md_block_head->queued_updates; while (*nodepp != NULL) nodepp = &((*nodepp)->next); job->next = NULL; *nodepp = job; spin_unlock_irqrestore(&md_block_head->md_block_lock, flags); } else { md_block_head->nr_in_prog = 1; spin_unlock_irqrestore(&md_block_head->md_block_lock, flags); /* * Always push to a worker thread. If the driver has * a completion thread, we could end up deadlocking even * if the context would be safe enough to write from. * This could be executed from the context of an IO * completion thread. Kicking off the write from that * context could result in the IO completion thread * blocking (eg on memory allocation). That can easily * deadlock. */ push_md_io(job); schedule_work(&_kcached_wq); } } static void flashcache_kcopyd_callback(int read_err, unsigned int write_err, void *context) { struct kcached_job *job = (struct kcached_job *)context; struct cache_c *dmc = job->dmc; int index = job->index; int set = index / dmc->assoc; struct cache_set *cache_set = &dmc->cache_sets[set]; VERIFY(!in_interrupt()); DPRINTK("kcopyd_callback: Index %d", index); VERIFY(job->bio == NULL); spin_lock_irq(&cache_set->set_spin_lock); VERIFY(dmc->cache[index].cache_state & (DISKWRITEINPROG | VALID | DIRTY)); if (unlikely(dmc->sysctl_error_inject & KCOPYD_CALLBACK_ERROR)) { read_err = -EIO; dmc->sysctl_error_inject &= ~KCOPYD_CALLBACK_ERROR; } if (likely(read_err == 0 && write_err == 0)) { spin_unlock_irq(&cache_set->set_spin_lock); flashcache_md_write(job); } else { if (read_err) read_err = -EIO; if (write_err) write_err = -EIO; /* Disk write failed. We can not purge this block from flash */ DMERR("flashcache: Disk writeback failed ! 
read error %d write error %d block %lu", -read_err, -write_err, job->job_io_regions.disk.sector); VERIFY(cache_set->clean_inprog > 0); cache_set->clean_inprog--; VERIFY(atomic_read(&dmc->clean_inprog) > 0); atomic_dec(&dmc->clean_inprog); spin_unlock_irq(&cache_set->set_spin_lock); /* Set the error in the job and let do_pending() handle the error */ if (read_err) { dmc->flashcache_errors.ssd_read_errors++; job->error = read_err; } else { dmc->flashcache_errors.disk_write_errors++; job->error = write_err; } flashcache_do_pending(job); flashcache_clean_set(dmc, set, 0); /* Kick off more cleanings */ dmc->flashcache_stats.cleanings++; } } static void flashcache_dirty_writeback(struct cache_c *dmc, int index) { struct kcached_job *job; struct cacheblock *cacheblk = &dmc->cache[index]; int device_removal = 0; int set = index / dmc->assoc; struct cache_set *cache_set = &dmc->cache_sets[set]; DPRINTK("flashcache_dirty_writeback: Index %d", index); spin_lock_irq(&cache_set->set_spin_lock); VERIFY((cacheblk->cache_state & BLOCK_IO_INPROG) == DISKWRITEINPROG); VERIFY(cacheblk->cache_state & DIRTY); cache_set->clean_inprog++; atomic_inc(&dmc->clean_inprog); spin_unlock_irq(&cache_set->set_spin_lock); job = new_kcached_job(dmc, NULL, index); if (unlikely(dmc->sysctl_error_inject & DIRTY_WRITEBACK_JOB_ALLOC_FAIL)) { if (job) flashcache_free_cache_job(job); job = NULL; dmc->sysctl_error_inject &= ~DIRTY_WRITEBACK_JOB_ALLOC_FAIL; } /* * If the device is being removed, do not kick off any more cleanings. */ if (unlikely(atomic_read(&dmc->remove_in_prog))) { DMERR("flashcache: Dirty Writeback (for set cleaning) aborted for device removal, block %lu", cacheblk->dbn); if (job) flashcache_free_cache_job(job); job = NULL; device_removal = 1; } if (unlikely(job == NULL)) { spin_lock_irq(&cache_set->set_spin_lock); cache_set->clean_inprog--; atomic_dec(&dmc->clean_inprog); flashcache_free_pending_jobs(dmc, cacheblk, -EIO); cacheblk->cache_state &= ~(BLOCK_IO_INPROG); spin_unlock_irq(&cache_set->set_spin_lock); if (device_removal == 0) DMERR("flashcache: Dirty Writeback (for set cleaning) failed ! Can't allocate memory, block %lu", cacheblk->dbn); } else { job->bio = NULL; job->action = WRITEDISK; atomic_inc(&dmc->nr_jobs); dmc->flashcache_stats.ssd_reads++; dmc->flashcache_stats.disk_writes++; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) kcopyd_copy(flashcache_kcp_client, &job->job_io_regions.cache, 1, &job->job_io_regions.disk, 0, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) flashcache_kcopyd_callback, #else (kcopyd_notify_fn) flashcache_kcopyd_callback, #endif job); #else dm_kcopyd_copy(flashcache_kcp_client, &job->job_io_regions.cache, 1, &job->job_io_regions.disk, 0, (dm_kcopyd_notify_fn) flashcache_kcopyd_callback, (void *)job); #endif } } /* * This function encodes the background disk cleaning logic. * Background disk cleaning is triggered for 2 reasons. A) Dirty blocks are lying fallow in the set, making them good candidates for being cleaned. B) This set has dirty blocks over the configured threshold for a set. * (A) takes precedence over (B). Fallow dirty blocks are cleaned * first. * The cleaning of disk blocks is subject to the write limits per * set and across the cache, which this function enforces. * * 1) Select the n blocks that we want to clean (per whatever reclaim policy * is in effect), sort them. * 2) Then sweep the entire set looking for other DIRTY blocks that can be * tacked onto any of these blocks to form larger contiguous writes.
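* For example : if step 1 picked dbns {512, 4096} and the sweep finds idle DIRTY blocks at dbns 520 and 528 (4KB blocks are 8 sectors), flashcache_merge_writes() yields the chain {512, 520, 528} plus the singleton {4096}, cleaning 4 blocks with 2 contiguous disk writes.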
* The idea here is that if you are going to do a write anyway, then we * might as well opportunistically write out any contiguous blocks for * free. */ /* Are we under the limits for disk cleaning ? */ static inline int flashcache_can_clean(struct cache_c *dmc, struct cache_set *cache_set, int nr_writes) { return ((cache_set->clean_inprog + nr_writes) < dmc->max_clean_ios_set && (nr_writes + atomic_read(&dmc->clean_inprog)) < dmc->max_clean_ios_total); } void flashcache_clean_set(struct cache_c *dmc, int set, int force_clean_blocks) { int threshold_clean = 0; struct dbn_index_pair *writes_list = NULL; struct dbn_index_pair *set_dirty_list = NULL; int nr_writes = 0, i; int start_index = set * dmc->assoc; int end_index = start_index + dmc->assoc; struct cache_set *cache_set = &dmc->cache_sets[set]; struct cacheblock *cacheblk; int do_delayed_clean = 0; int scanned = 0; if (dmc->cache_mode != FLASHCACHE_WRITE_BACK) return; if (dmc->sysctl_reclaim_policy == FLASHCACHE_FIFO) /* * We only do force cleaning on a cache miss if reclaim policy * is LRU. */ force_clean_blocks = 0; /* * If a removal of this device is in progress, don't kick off * any more cleanings. This isn't sufficient though. We still need to * stop cleanings inside flashcache_dirty_writeback() because a device * remove could have started after we tested it here. */ if (atomic_read(&dmc->remove_in_prog)) return; if (flashcache_diskclean_alloc(dmc, &writes_list, &set_dirty_list)) { dmc->flashcache_errors.memory_alloc_errors++; return; } spin_lock_irq(&cache_set->set_spin_lock); /* * Before we try to clean any blocks, check the last time the fallow block * detection was done. If it has been more than "fallow_delay" seconds, make * a sweep through the set to detect (mark) fallow blocks. */ if (dmc->sysctl_fallow_delay && time_after(jiffies, cache_set->fallow_tstamp)) { for (i = start_index ; i < end_index ; i++) flashcache_detect_fallow(dmc, i); cache_set->fallow_tstamp = jiffies + dmc->sysctl_fallow_delay * HZ; } /* If there are any dirty fallow blocks, clean them first */ for (i = start_index ; (dmc->sysctl_fallow_delay > 0 && cache_set->dirty_fallow > 0 && time_after(jiffies, cache_set->fallow_next_cleaning) && i < end_index) ; i++) { cacheblk = &dmc->cache[i]; if (!(cacheblk->cache_state & DIRTY_FALLOW_2)) continue; if (!flashcache_can_clean(dmc, cache_set, nr_writes)) { /* * There are fallow blocks that need cleaning, but we * can't clean them this pass, schedule delayed cleaning * later. */ do_delayed_clean = 1; goto out; } VERIFY(cacheblk->cache_state & DIRTY); VERIFY((cacheblk->cache_state & BLOCK_IO_INPROG) == 0); cacheblk->cache_state |= DISKWRITEINPROG; flashcache_clear_fallow(dmc, i); writes_list[nr_writes].dbn = cacheblk->dbn; writes_list[nr_writes].index = i; dmc->flashcache_stats.fallow_cleanings++; nr_writes++; } if (nr_writes > 0) cache_set->fallow_next_cleaning = jiffies + HZ / dmc->sysctl_fallow_clean_speed; /* * In the miss path, we try to clean at least one block so the cache set does not * fill up with dirty fallow blocks. */ if (force_clean_blocks == 0) { if (cache_set->nr_dirty < dmc->dirty_thresh_set || !flashcache_can_clean(dmc, cache_set, nr_writes)) goto out; /* * We picked up all the dirty fallow blocks we can. We can still clean more to * remain under the dirty threshold. Clean some more blocks.
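* (e.g. with dirty_thresh_set == 16 and nr_dirty == 20, threshold_clean below comes out to 4.)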
*/ threshold_clean = cache_set->nr_dirty - dmc->dirty_thresh_set; } else if (cache_set->nr_dirty > 0) { /* We want to clean at least 1 block - miss path */ if (cache_set->nr_dirty > dmc->dirty_thresh_set) { /* We can definitely clean some based on thresholds */ threshold_clean = cache_set->nr_dirty - dmc->dirty_thresh_set; force_clean_blocks = 0; } else if (nr_writes == 0) { /* XXX - Should be nr_writes < force_clean_blocks */ dmc->flashcache_stats.force_clean_block++; threshold_clean = force_clean_blocks; } } if (dmc->sysctl_reclaim_policy == FLASHCACHE_FIFO) { i = cache_set->set_clean_next; DPRINTK("flashcache_clean_set: Set %d", set); while (scanned < dmc->assoc && flashcache_can_clean(dmc, cache_set, nr_writes) && nr_writes < threshold_clean) { cacheblk = &dmc->cache[i]; if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) { cacheblk->cache_state |= DISKWRITEINPROG; flashcache_clear_fallow(dmc, i); writes_list[nr_writes].dbn = cacheblk->dbn; writes_list[nr_writes].index = i; nr_writes++; } scanned++; i++; if (i == end_index) i = start_index; } cache_set->set_clean_next = i; } else { /* reclaim_policy == FLASHCACHE_LRU */ int lru_rel_index; int iter; for (iter = 0 ; iter < 2 ; iter++) { if (iter == 0) lru_rel_index = cache_set->warmlist_lru_head; else lru_rel_index = cache_set->hotlist_lru_head; while (lru_rel_index != FLASHCACHE_NULL && flashcache_can_clean(dmc, cache_set, nr_writes) && nr_writes < threshold_clean) { cacheblk = &dmc->cache[lru_rel_index + start_index]; if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) { cacheblk->cache_state |= DISKWRITEINPROG; flashcache_clear_fallow(dmc, lru_rel_index + start_index); writes_list[nr_writes].dbn = cacheblk->dbn; writes_list[nr_writes].index = cacheblk - &dmc->cache[0]; nr_writes++; } scanned++; /* * If we are forced to clean on replacement, only clean blocks at * the tail end of the LRU list ! */ if (force_clean_blocks > 0 && scanned == force_clean_blocks) goto out; lru_rel_index = cacheblk->lru_next; } } } out: if (nr_writes > 0) { flashcache_merge_writes(dmc, writes_list, set_dirty_list, &nr_writes, set); dmc->flashcache_stats.clean_set_ios += nr_writes; if (nr_writes < FLASHCACHE_WRITE_CLUST_HIST_SIZE) dmc->write_clust_hist[nr_writes]++; else dmc->write_clust_hist_ovf++; spin_unlock_irq(&cache_set->set_spin_lock); /* * XXX - There are some subtle bugs in the flashcache_kcopy code * (leaked copy jobs). Until we fix those, revert to the original * logic of using the kernel kcopyd code. If you enable * flashcache_kcopy, enable the code in flashcache_kcopy_init(). 
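* (Concretely : with the "#if 1" below, cleaning goes through per-block flashcache_dirty_writeback() calls and the batched flashcache_copy_data() path stays compiled out.)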
*/ #if 1 for (i = 0 ; i < nr_writes ; i++) flashcache_dirty_writeback(dmc, writes_list[i].index); #else flashcache_copy_data(dmc, cache_set, nr_writes, writes_list); #endif } else { if (cache_set->nr_dirty > dmc->dirty_thresh_set) do_delayed_clean = 1; spin_unlock_irq(&cache_set->set_spin_lock); if (do_delayed_clean) schedule_delayed_work(&dmc->delayed_clean, 1*HZ); } flashcache_diskclean_free(dmc, writes_list, set_dirty_list); } static void flashcache_read_hit(struct cache_c *dmc, struct bio* bio, int index) { struct cacheblock *cacheblk; struct pending_job *pjob; int set = index / dmc->assoc; cacheblk = &dmc->cache[index]; /* If block is busy, queue IO pending completion of in-progress IO */ if (!(cacheblk->cache_state & BLOCK_IO_INPROG) && (cacheblk->nr_queued == 0)) { struct kcached_job *job; cacheblk->cache_state |= CACHEREADINPROG; dmc->flashcache_stats.read_hits++; flashcache_setlocks_multidrop(dmc, bio); DPRINTK("Cache read: Block %llu(%lu), index = %d:%s", bio->bi_sector, bio->bi_size, index, "CACHE HIT"); job = new_kcached_job(dmc, bio, index); if (unlikely(dmc->sysctl_error_inject & READ_HIT_JOB_ALLOC_FAIL)) { if (job) flashcache_free_cache_job(job); job = NULL; dmc->sysctl_error_inject &= ~READ_HIT_JOB_ALLOC_FAIL; } if (unlikely(job == NULL)) { /* * We have a read hit, and can't allocate a job. * Since we dropped the spinlock, we have to drain any * pending jobs. */ DMERR("flashcache: Read (hit) failed ! Can't allocate memory for cache IO, block %lu", cacheblk->dbn); flashcache_bio_endio(bio, -EIO, dmc, NULL); spin_lock_irq(&dmc->cache_sets[set].set_spin_lock); flashcache_free_pending_jobs(dmc, cacheblk, -EIO); cacheblk->cache_state &= ~(BLOCK_IO_INPROG); spin_unlock_irq(&dmc->cache_sets[set].set_spin_lock); } else { job->action = READCACHE; /* Fetch data from cache */ atomic_inc(&dmc->nr_jobs); dmc->flashcache_stats.ssd_reads++; dm_io_async_bvec(1, &job->job_io_regions.cache, READ, bio, flashcache_io_callback, job); } } else { pjob = flashcache_alloc_pending_job(dmc); if (unlikely(dmc->sysctl_error_inject & READ_HIT_PENDING_JOB_ALLOC_FAIL)) { if (pjob) { flashcache_free_pending_job(pjob); pjob = NULL; } dmc->sysctl_error_inject &= ~READ_HIT_PENDING_JOB_ALLOC_FAIL; } if (pjob == NULL) flashcache_bio_endio(bio, -EIO, dmc, NULL); else flashcache_enq_pending(dmc, bio, index, READCACHE, pjob); flashcache_setlocks_multidrop(dmc, bio); } } static void flashcache_read_miss(struct cache_c *dmc, struct bio* bio, int index) { struct kcached_job *job; struct cacheblock *cacheblk = &dmc->cache[index]; int set = index / dmc->assoc; struct cache_set *cache_set = &dmc->cache_sets[set]; job = new_kcached_job(dmc, bio, index); if (unlikely(dmc->sysctl_error_inject & READ_MISS_JOB_ALLOC_FAIL)) { if (job) flashcache_free_cache_job(job); job = NULL; dmc->sysctl_error_inject &= ~READ_MISS_JOB_ALLOC_FAIL; } if (unlikely(job == NULL)) { /* * We have a read miss, and can't allocate a job. * Since we dropped the spinlock, we have to drain any * pending jobs. */ DMERR("flashcache: Read (miss) failed ! 
Can't allocate memory for cache IO, block %lu", cacheblk->dbn); flashcache_bio_endio(bio, -EIO, dmc, NULL); atomic_dec(&dmc->cached_blocks); spin_lock_irq(&cache_set->set_spin_lock); flashcache_hash_remove(dmc, index); cacheblk->cache_state &= ~VALID; cacheblk->cache_state |= INVALID; flashcache_free_pending_jobs(dmc, cacheblk, -EIO); cacheblk->cache_state &= ~(BLOCK_IO_INPROG); flashcache_invalid_insert(dmc, index); spin_unlock_irq(&cache_set->set_spin_lock); } else { job->action = READDISK; /* Fetch data from the source device */ atomic_inc(&dmc->nr_jobs); dmc->flashcache_stats.disk_reads++; dm_io_async_bvec(1, &job->job_io_regions.disk, READ, bio, flashcache_io_callback, job); flashcache_clean_set(dmc, set, dmc->sysctl_clean_on_read_miss); } } static void flashcache_read(struct cache_c *dmc, struct bio *bio) { int index; int res; struct cacheblock *cacheblk; int queued; unsigned long flags; DPRINTK("Got a %s for %llu (%u bytes)", (bio_rw(bio) == READ ? "READ":"READA"), bio->bi_sector, bio->bi_size); flashcache_setlocks_multiget(dmc, bio); res = flashcache_lookup(dmc, bio, &index); /* Cache Read Hit case */ if (res > 0) { cacheblk = &dmc->cache[index]; if ((cacheblk->cache_state & VALID) && (cacheblk->dbn == bio->bi_sector)) { flashcache_read_hit(dmc, bio, index); return; } } /* * In all cases except for a cache hit (and VALID), test for potential * invalidations that we need to do. */ queued = flashcache_inval_blocks(dmc, bio); if (queued) { if (unlikely(queued < 0)) flashcache_bio_endio(bio, -EIO, dmc, NULL); if ((res > 0) && (dmc->cache[index].cache_state == INVALID)) /* * If we happened to pick up an INVALID block, put it back on the * per cache-set invalid list */ flashcache_invalid_insert(dmc, index); flashcache_setlocks_multidrop(dmc, bio); return; } /* * Locking Note : * We take the ioctl_lock while holding the cache set multilocks. * The ioctl lock is held for very short durations, and we do not * (and should not) try to acquire any other locks holding the ioctl * lock. */ spin_lock_irqsave(&dmc->ioctl_lock, flags); if (res == -1 || dmc->write_only_cache || flashcache_uncacheable(dmc, bio)) { spin_unlock_irqrestore(&dmc->ioctl_lock, flags); /* No room, non-cacheable or sequential i/o means not wanted in cache */ if ((res > 0) && (dmc->cache[index].cache_state == INVALID)) /* * If we happened to pick up an INVALID block, put it back on the * per cache-set invalid list */ flashcache_invalid_insert(dmc, index); flashcache_setlocks_multidrop(dmc, bio); DPRINTK("Cache read: Block %llu(%lu):%s", bio->bi_sector, bio->bi_size, "CACHE MISS & NO ROOM"); if (res == -1) flashcache_clean_set(dmc, hash_block(dmc, bio->bi_sector), 0); /* Start uncached IO */ flashcache_start_uncached_io(dmc, bio); return; } else spin_unlock_irqrestore(&dmc->ioctl_lock, flags); /* * (res == INVALID) Cache Miss * And we found cache blocks to replace * Claim the cache blocks before giving up the spinlock */ if (dmc->cache[index].cache_state & VALID) { dmc->flashcache_stats.replace++; /* * We are switching the block's identity. Remove it from * the existing hash queue and re-insert it into a new one * below, after switching it to the new identity.
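* (The sequence below, all under the set locks taken above : flashcache_hash_remove() the old dbn, overwrite cache_state and dbn, then flashcache_hash_insert() under the new dbn.)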
*/ flashcache_hash_remove(dmc, index); } else atomic_inc(&dmc->cached_blocks); dmc->cache[index].cache_state = VALID | DISKREADINPROG; dmc->cache[index].dbn = bio->bi_sector; flashcache_hash_insert(dmc, index); flashcache_setlocks_multidrop(dmc, bio); DPRINTK("Cache read: Block %llu(%lu), index = %d:%s", bio->bi_sector, bio->bi_size, index, "CACHE MISS & REPLACE"); flashcache_read_miss(dmc, bio, index); } /* * Invalidation might require grabbing locks on 2 cache sets. * To prevent Lock Order Reversals (and deadlocks), always grab * the cache set locks in ascending order. */ static void flashcache_setlocks_multiget(struct cache_c *dmc, struct bio *bio) { int start_set = hash_block(dmc, bio->bi_sector); int end_set = hash_block(dmc, bio->bi_sector + (to_sector(bio->bi_size) - 1)); VERIFY(!in_interrupt()); spin_lock_irq(&dmc->cache_sets[start_set].set_spin_lock); if (start_set != end_set) spin_lock(&dmc->cache_sets[end_set].set_spin_lock); } static void flashcache_setlocks_multidrop(struct cache_c *dmc, struct bio *bio) { int start_set = hash_block(dmc, bio->bi_sector); int end_set = hash_block(dmc, bio->bi_sector + (to_sector(bio->bi_size) - 1)); VERIFY(!in_interrupt()); if (start_set != end_set) spin_unlock(&dmc->cache_sets[end_set].set_spin_lock); spin_unlock_irq(&dmc->cache_sets[start_set].set_spin_lock); } /* * Invalidate any colliding blocks if they are !BUSY and !DIRTY. If the colliding * block is DIRTY, we need to kick off a write. In both cases, we need to wait * until the underlying IO is finished, and then proceed with the invalidation. */ static int flashcache_inval_block_set(struct cache_c *dmc, int set, struct bio *bio, int rw, struct pending_job *pjob) { sector_t io_start = bio->bi_sector; sector_t io_end = bio->bi_sector + (to_sector(bio->bi_size) - 1); int start_index, end_index, i; struct cacheblock *cacheblk; start_index = dmc->assoc * set; end_index = start_index + dmc->assoc; for (i = start_index ; i < end_index ; i++) { sector_t start_dbn = dmc->cache[i].dbn; sector_t end_dbn = start_dbn + dmc->block_size; cacheblk = &dmc->cache[i]; if (cacheblk->cache_state & INVALID) continue; if ((io_start >= start_dbn && io_start < end_dbn) || (io_end >= start_dbn && io_end < end_dbn)) { /* We have a match */ if (rw == WRITE) dmc->flashcache_stats.wr_invalidates++; else dmc->flashcache_stats.rd_invalidates++; if (!(cacheblk->cache_state & (BLOCK_IO_INPROG | DIRTY)) && (cacheblk->nr_queued == 0)) { atomic_dec(&dmc->cached_blocks); DPRINTK("Cache invalidate (!BUSY): Block %llu %lx", start_dbn, cacheblk->cache_state); flashcache_hash_remove(dmc, i); cacheblk->cache_state = INVALID; flashcache_invalid_insert(dmc, i); continue; } /* * The conflicting block has either IO in progress or is * Dirty. In all cases, we need to add ourselves to the * pending queue. Then if the block is dirty, we kick off * an IO to clean the block. * Note that if the block is dirty and IO is in progress * on it, the do_pending handler will clean the block * and then process the pending queue. */ flashcache_enq_pending(dmc, bio, i, INVALIDATE, pjob); if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) { /* * Kick off block write. * We can't kick off the write under the spinlock. * Instead, we mark the slot DISKWRITEINPROG, drop * the spinlock and kick off the write. A block marked * DISKWRITEINPROG cannot change underneath us, and any * new conflicting IO will simply enqueue itself onto * its pending queue.
* * XXX - The dropping of the lock here can be avoided if * we punt the cleaning of the block to the worker thread, * at the cost of a context switch. */ cacheblk->cache_state |= DISKWRITEINPROG; flashcache_clear_fallow(dmc, i); flashcache_setlocks_multidrop(dmc, bio); flashcache_dirty_writeback(dmc, i); /* Must inc nr_jobs */ flashcache_setlocks_multiget(dmc, bio); } return 1; } } return 0; } #if 0 static int flashcache_inval_block_set_v3_checks(struct cache_c *dmc, int set, struct bio *bio) { sector_t io_start = bio->bi_sector; sector_t io_end = bio->bi_sector + (to_sector(bio->bi_size) - 1); int start_index, end_index, i; struct cacheblock *cacheblk; start_index = dmc->assoc * set; end_index = start_index + dmc->assoc; for (i = start_index ; i < end_index ; i++) { sector_t start_dbn; sector_t end_dbn; cacheblk = &dmc->cache[i]; start_dbn = cacheblk->dbn; end_dbn = start_dbn + dmc->block_size; if (cacheblk->cache_state & INVALID) continue; if ((io_start >= start_dbn && io_start < end_dbn) || (io_end >= start_dbn && io_end < end_dbn)) { return i; } } return -1; } #endif static int flashcache_inval_block_set_v3(struct cache_c *dmc, int set, struct bio *bio, struct pending_job *pjob) { int index; struct cacheblock *cacheblk; int rw = bio_data_dir(bio); sector_t io_start; sector_t mask; mask = ~((1 << dmc->block_shift) - 1); io_start = bio->bi_sector & mask; /* Check in per-set hash to see if the overlapping block exists in cache */ index = flashcache_hash_lookup(dmc, set, io_start); if (index == -1) { #if 0 index = flashcache_inval_block_set_v3_checks(dmc, set, bio); if (index != -1) { printk(KERN_ERR "Invalidate: Did not find block on hash " "but found in set %d\n", index); printk(KERN_ERR "io_start = %lu bi_sector = %lu bi_end = %lu\n", io_start, bio->bi_sector, bio->bi_sector + (to_sector(bio->bi_size) - 1)); printk(KERN_ERR "cache_state = %x hash_state = %x cacheblk->dbn = %lu\n", dmc->cache[index].cache_state, dmc->cache[index].hash_state, dmc->cache[index].dbn); VERIFY(0); } #endif return 0; } cacheblk = &dmc->cache[index]; VERIFY(cacheblk->cache_state & VALID); /* We have a match */ if (rw == WRITE) { dmc->flashcache_stats.wr_invalidates++; } else { dmc->flashcache_stats.rd_invalidates++; } if (!(cacheblk->cache_state & (BLOCK_IO_INPROG | DIRTY)) && (cacheblk->nr_queued == 0)) { atomic_dec(&dmc->cached_blocks); DPRINTK("Cache invalidate (!BUSY): Block %llu %lx", cacheblk->dbn, cacheblk->cache_state); flashcache_hash_remove(dmc, index); cacheblk->cache_state = INVALID; flashcache_invalid_insert(dmc, index); return 0; } /* * The conflicting block has either IO in progress or is * Dirty. In all cases, we need to add ourselves to the * pending queue. Then if the block is dirty, we kick off * an IO to clean the block. * Note that if the block is dirty and IO is in progress * on it, the do_pending handler will clean the block * and then process the pending queue. */ flashcache_enq_pending(dmc, bio, index, INVALIDATE, pjob); if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) { /* * Kick off block write. * We can't kick off the write under the spinlock. * Instead, we mark the slot DISKWRITEINPROG, drop * the spinlock and kick off the write. A block marked * DISKWRITEINPROG cannot change underneath us, and any * new conflicting IO will simply enqueue itself onto * its pending queue. * * XXX - The dropping of the lock here can be avoided if * we punt the cleaning of the block to the worker thread, * at the cost of a context switch.
*/ cacheblk->cache_state |= DISKWRITEINPROG; flashcache_clear_fallow(dmc, index); flashcache_setlocks_multidrop(dmc, bio); flashcache_dirty_writeback(dmc, index); /* Must inc nr_jobs */ flashcache_setlocks_multiget(dmc, bio); } return 1; } static int flashcache_inval_blocks(struct cache_c *dmc, struct bio *bio) { sector_t io_start; sector_t io_end; int start_set, end_set; int queued; struct pending_job *pjob1, *pjob2; sector_t mask; pjob1 = flashcache_alloc_pending_job(dmc); if (unlikely(dmc->sysctl_error_inject & INVAL_PENDING_JOB_ALLOC_FAIL)) { if (pjob1) { flashcache_free_pending_job(pjob1); pjob1 = NULL; } dmc->sysctl_error_inject &= ~INVAL_PENDING_JOB_ALLOC_FAIL; } if (pjob1 == NULL) { queued = -ENOMEM; goto out; } /* If the on-ssd cache version is < 3, we revert to old style invalidations ! */ if (dmc->on_ssd_version < 3) { pjob2 = flashcache_alloc_pending_job(dmc); if (pjob2 == NULL) { flashcache_free_pending_job(pjob1); queued = -ENOMEM; goto out; } io_start = bio->bi_sector; io_end = (bio->bi_sector + (to_sector(bio->bi_size) - 1)); start_set = hash_block(dmc, io_start); end_set = hash_block(dmc, io_end); VERIFY(spin_is_locked(&dmc->cache_sets[start_set].set_spin_lock)); if (start_set != end_set) VERIFY(spin_is_locked(&dmc->cache_sets[end_set].set_spin_lock)); queued = flashcache_inval_block_set(dmc, start_set, bio, bio_data_dir(bio), pjob1); if (queued) { flashcache_free_pending_job(pjob2); goto out; } else flashcache_free_pending_job(pjob1); if (start_set != end_set) { queued = flashcache_inval_block_set(dmc, end_set, bio, bio_data_dir(bio), pjob2); if (!queued) flashcache_free_pending_job(pjob2); } else flashcache_free_pending_job(pjob2); } else { /* * Assume a 4KB blocksize. * Knowns : * 1) DM will break up IOs at 4KB boundaries. * 2) Flashcache will only cache *exactly* 4KB IOs. * Conclusion : * Flashcache will only cache an IO that begins exactly at a 4KB * boundary and at a 4KB length ! * The incoming IO might be a smaller than 4KB IO, where bi_sector * is NOT 4KB aligned or bi_size < 4KB * To check for overlaps, we simply need to check if the 4KB block * that [bi_sector, bi_sector + bi_size] overlaps with a block that * is in the cache. */ mask = ~((1 << dmc->block_shift) - 1); io_start = bio->bi_sector & mask; start_set = hash_block(dmc, io_start); VERIFY(spin_is_locked(&dmc->cache_sets[start_set].set_spin_lock)); queued = flashcache_inval_block_set_v3(dmc, start_set, bio, pjob1); if (queued) { goto out; } else flashcache_free_pending_job(pjob1); } out: return queued; } static void flashcache_write_miss(struct cache_c *dmc, struct bio *bio, int index) { struct cacheblock *cacheblk; struct kcached_job *job; int queued; int set = index / dmc->assoc; struct cache_set *cache_set = &dmc->cache_sets[set]; cacheblk = &dmc->cache[index]; queued = flashcache_inval_blocks(dmc, bio); if (queued) { if (cacheblk->cache_state == INVALID) /* * If happened to pick up an INVALID block, put it back on the * per cache-set invalid list */ flashcache_invalid_insert(dmc, index); flashcache_setlocks_multidrop(dmc, bio); if (unlikely(queued < 0)) flashcache_bio_endio(bio, -EIO, dmc, NULL); return; } if (cacheblk->cache_state & VALID) { dmc->flashcache_stats.wr_replace++; /* * We are switching the block's identity. Remove it from * the existing hash queue and re-insert it into a new one * below, after switching it to the new identity. 
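		 * In outline: flashcache_hash_remove(dmc, index), then reset
		 * cacheblk->dbn to the new bio's sector, then
		 * flashcache_hash_insert(dmc, index), all under the set locks.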
*/ flashcache_hash_remove(dmc, index); } else atomic_inc(&dmc->cached_blocks); cacheblk->cache_state = VALID | CACHEWRITEINPROG; cacheblk->dbn = bio->bi_sector; flashcache_hash_insert(dmc, index); flashcache_setlocks_multidrop(dmc, bio); job = new_kcached_job(dmc, bio, index); if (unlikely(dmc->sysctl_error_inject & WRITE_MISS_JOB_ALLOC_FAIL)) { if (job) flashcache_free_cache_job(job); job = NULL; dmc->sysctl_error_inject &= ~WRITE_MISS_JOB_ALLOC_FAIL; } if (unlikely(job == NULL)) { /* * We have a write miss, and can't allocate a job. * Since we dropped the spinlock, we have to drain any * pending jobs. */ DMERR("flashcache: Write (miss) failed ! Can't allocate memory for cache IO, block %lu", cacheblk->dbn); flashcache_bio_endio(bio, -EIO, dmc, NULL); atomic_dec(&dmc->cached_blocks); spin_lock_irq(&cache_set->set_spin_lock); flashcache_hash_remove(dmc, index); cacheblk->cache_state &= ~VALID; cacheblk->cache_state |= INVALID; flashcache_free_pending_jobs(dmc, cacheblk, -EIO); cacheblk->cache_state &= ~(BLOCK_IO_INPROG); flashcache_invalid_insert(dmc, index); spin_unlock_irq(&cache_set->set_spin_lock); } else { atomic_inc(&dmc->nr_jobs); dmc->flashcache_stats.ssd_writes++; job->action = WRITECACHE; if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { /* Write data to the cache */ dm_io_async_bvec(1, &job->job_io_regions.cache, WRITE, bio, flashcache_io_callback, job); } else { VERIFY(dmc->cache_mode == FLASHCACHE_WRITE_THROUGH); /* Write data to both disk and cache */ dm_io_async_bvec(2, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) (struct io_region *)&job->job_io_regions, #else (struct dm_io_region *)&job->job_io_regions, #endif WRITE, bio, flashcache_io_callback, job); } flashcache_clean_set(dmc, set, dmc->sysctl_clean_on_write_miss); } } static void flashcache_write_hit(struct cache_c *dmc, struct bio *bio, int index) { struct cacheblock *cacheblk; struct pending_job *pjob; struct kcached_job *job; int set = index / dmc->assoc; struct cache_set *cache_set = &dmc->cache_sets[set]; cacheblk = &dmc->cache[index]; if (!(cacheblk->cache_state & BLOCK_IO_INPROG) && (cacheblk->nr_queued == 0)) { if (cacheblk->cache_state & DIRTY) dmc->flashcache_stats.dirty_write_hits++; dmc->flashcache_stats.write_hits++; cacheblk->cache_state |= CACHEWRITEINPROG; flashcache_setlocks_multidrop(dmc, bio); job = new_kcached_job(dmc, bio, index); if (unlikely(dmc->sysctl_error_inject & WRITE_HIT_JOB_ALLOC_FAIL)) { if (job) flashcache_free_cache_job(job); job = NULL; dmc->sysctl_error_inject &= ~WRITE_HIT_JOB_ALLOC_FAIL; } if (unlikely(job == NULL)) { /* * We have a write hit, and can't allocate a job. * Since we dropped the spinlock, we have to drain any * pending jobs. */ DMERR("flashcache: Write (hit) failed ! 
Can't allocate memory for cache IO, block %lu", cacheblk->dbn); flashcache_bio_endio(bio, -EIO, dmc, NULL); spin_lock_irq(&cache_set->set_spin_lock); flashcache_free_pending_jobs(dmc, cacheblk, -EIO); cacheblk->cache_state &= ~(BLOCK_IO_INPROG); spin_unlock_irq(&cache_set->set_spin_lock); } else { DPRINTK("Queue job for %llu", bio->bi_sector); atomic_inc(&dmc->nr_jobs); dmc->flashcache_stats.ssd_writes++; job->action = WRITECACHE; if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { /* Write data to the cache */ dm_io_async_bvec(1, &job->job_io_regions.cache, WRITE, bio, flashcache_io_callback, job); flashcache_clean_set(dmc, index / dmc->assoc, 0); } else { VERIFY(dmc->cache_mode == FLASHCACHE_WRITE_THROUGH); /* Write data to both disk and cache */ dmc->flashcache_stats.disk_writes++; dm_io_async_bvec(2, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) (struct io_region *)&job->job_io_regions, #else (struct dm_io_region *)&job->job_io_regions, #endif WRITE, bio, flashcache_io_callback, job); } } } else { pjob = flashcache_alloc_pending_job(dmc); if (unlikely(dmc->sysctl_error_inject & WRITE_HIT_PENDING_JOB_ALLOC_FAIL)) { if (pjob) { flashcache_free_pending_job(pjob); pjob = NULL; } dmc->sysctl_error_inject &= ~WRITE_HIT_PENDING_JOB_ALLOC_FAIL; } if (unlikely(pjob == NULL)) flashcache_bio_endio(bio, -EIO, dmc, NULL); else flashcache_enq_pending(dmc, bio, index, WRITECACHE, pjob); flashcache_setlocks_multidrop(dmc, bio); } } static void flashcache_write(struct cache_c *dmc, struct bio *bio) { int index; int res; struct cacheblock *cacheblk; int queued; flashcache_setlocks_multiget(dmc, bio); res = flashcache_lookup(dmc, bio, &index); if (res != -1) { /* Cache Hit */ cacheblk = &dmc->cache[index]; if ((cacheblk->cache_state & VALID) && (cacheblk->dbn == bio->bi_sector)) { /* Cache Hit */ flashcache_write_hit(dmc, bio, index); } else { /* Cache Miss, found block to recycle */ flashcache_write_miss(dmc, bio, index); } return; } /* * No room in the set. We cannot write to the cache and have to * send the request to disk. Before we do that, we must check * for potential invalidations ! */ queued = flashcache_inval_blocks(dmc, bio); flashcache_setlocks_multidrop(dmc, bio); if (queued) { if (unlikely(queued < 0)) flashcache_bio_endio(bio, -EIO, dmc, NULL); return; } /* Start uncached IO */ flashcache_start_uncached_io(dmc, bio); flashcache_clean_set(dmc, hash_block(dmc, bio->bi_sector), 0); } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) #else #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) #define bio_barrier(bio) ((bio)->bi_rw & REQ_HARDBARRIER) #else #define bio_barrier(bio) ((bio)->bi_rw & REQ_FLUSH) #endif #endif #endif static void flashcache_do_block_checks(struct cache_c *dmc, struct bio *bio) { sector_t mask; sector_t io_start; sector_t io_end; VERIFY(to_sector(bio->bi_size) <= dmc->block_size); mask = ~((1 << dmc->block_shift) - 1); io_start = bio->bi_sector & mask; io_end = (bio->bi_sector + (to_sector(bio->bi_size) - 1)) & mask; /* The incoming bio must NOT straddle a blocksize boundary */ VERIFY(io_start == io_end); } /* * Decide the mapping and perform necessary cache operations for a bio request. 
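 *
 * In outline (see the body below):
 *   - barrier/flush bios are failed with -EOPNOTSUPP
 *   - a bio is uncacheable if bypass_cache is set, if it is not exactly
 *     one cache block in size, or if it is a WRITE in WRITE_AROUND mode or
 *     one that fails the pid cacheability check (flashcache_uncacheable());
 *     such bios go to disk after invalidating any cached overlap
 *   - everything else is handed to flashcache_read() / flashcache_write()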
 */
int
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
flashcache_map(struct dm_target *ti, struct bio *bio,
	       union map_info *map_context)
#else
flashcache_map(struct dm_target *ti, struct bio *bio)
#endif
{
	struct cache_c *dmc = (struct cache_c *) ti->private;
	int sectors = to_sector(bio->bi_size);
	int queued;
	int uncacheable;
	unsigned long flags;

	if (sectors <= 32)
		size_hist[sectors]++;

	if (bio_barrier(bio))
		return -EOPNOTSUPP;

	/*
	 * Basic check to make sure blocks coming in are as we
	 * expect them to be.
	 */
	flashcache_do_block_checks(dmc, bio);

	if (bio_data_dir(bio) == READ)
		dmc->flashcache_stats.reads++;
	else
		dmc->flashcache_stats.writes++;

	spin_lock_irqsave(&dmc->ioctl_lock, flags);
	if (unlikely(dmc->sysctl_pid_do_expiry &&
		     (dmc->whitelist_head || dmc->blacklist_head)))
		flashcache_pid_expiry_all_locked(dmc);
	uncacheable = (unlikely(dmc->bypass_cache) ||
		       (to_sector(bio->bi_size) != dmc->block_size) ||
		       /*
			* If the op is a READ, we serve it out of cache whenever possible,
			* regardless of cacheability
			*/
		       (bio_data_dir(bio) == WRITE &&
			((dmc->cache_mode == FLASHCACHE_WRITE_AROUND) ||
			 flashcache_uncacheable(dmc, bio))));
	spin_unlock_irqrestore(&dmc->ioctl_lock, flags);
	if (uncacheable) {
		flashcache_setlocks_multiget(dmc, bio);
		queued = flashcache_inval_blocks(dmc, bio);
		flashcache_setlocks_multidrop(dmc, bio);
		if (queued) {
			if (unlikely(queued < 0))
				flashcache_bio_endio(bio, -EIO, dmc, NULL);
		} else {
			/* Start uncached IO */
			flashcache_start_uncached_io(dmc, bio);
		}
	} else {
		if (bio_data_dir(bio) == READ)
			flashcache_read(dmc, bio);
		else
			flashcache_write(dmc, bio);
	}
	return DM_MAPIO_SUBMITTED;
}

/* Block sync support functions */
static void
flashcache_kcopyd_callback_sync(int read_err, unsigned int write_err, void *context)
{
	struct kcached_job *job = (struct kcached_job *)context;
	struct cache_c *dmc = job->dmc;
	int index = job->index;
	struct cache_set *cache_set = &dmc->cache_sets[index / dmc->assoc];

	VERIFY(!in_interrupt());
	DPRINTK("kcopyd_callback_sync: Index %d", index);
	VERIFY(job->bio == NULL);
	spin_lock_irq(&cache_set->set_spin_lock);
	VERIFY(dmc->cache[index].cache_state & (DISKWRITEINPROG | VALID | DIRTY));
	if (likely(read_err == 0 && write_err == 0)) {
		spin_unlock_irq(&cache_set->set_spin_lock);
		flashcache_md_write(job);
	} else {
		if (read_err)
			read_err = -EIO;
		if (write_err)
			write_err = -EIO;
		/* Disk write failed. We cannot purge this block from flash */
		DMERR("flashcache: Disk writeback failed ! "
		      "read error %d write error %d block %lu",
		      -read_err, -write_err, job->job_io_regions.disk.sector);
		VERIFY(cache_set->clean_inprog > 0);
		VERIFY(atomic_read(&dmc->clean_inprog) > 0);
		cache_set->clean_inprog--;
		atomic_dec(&dmc->clean_inprog);
		spin_unlock_irq(&cache_set->set_spin_lock);
		/* Set the error in the job and let do_pending() handle the error */
		if (read_err) {
			dmc->flashcache_errors.ssd_read_errors++;
			job->error = read_err;
		} else {
			dmc->flashcache_errors.disk_write_errors++;
			job->error = write_err;
		}
		flashcache_do_pending(job);
		flashcache_sync_blocks(dmc);	/* Kick off more cleanings */
		dmc->flashcache_stats.cleanings++;
	}
}

static void
flashcache_dirty_writeback_sync(struct cache_c *dmc, int index)
{
	struct kcached_job *job;
	struct cacheblock *cacheblk = &dmc->cache[index];
	int device_removal = 0;
	int set = index / dmc->assoc;
	struct cache_set *cache_set = &dmc->cache_sets[set];

	VERIFY((cacheblk->cache_state & FALLOW_DOCLEAN) == 0);
	DPRINTK("flashcache_dirty_writeback_sync: Index %d", index);
	spin_lock_irq(&cache_set->set_spin_lock);
	VERIFY((cacheblk->cache_state & BLOCK_IO_INPROG) == DISKWRITEINPROG);
	VERIFY(cacheblk->cache_state & DIRTY);
	cache_set->clean_inprog++;
	atomic_inc(&dmc->clean_inprog);
	spin_unlock_irq(&cache_set->set_spin_lock);
	job = new_kcached_job(dmc, NULL, index);
	/*
	 * If the device is being (fast) removed, do not kick off any more cleanings.
	 */
	if (unlikely(atomic_read(&dmc->remove_in_prog) == FAST_REMOVE)) {
		DMERR("flashcache: Dirty Writeback (for set cleaning) aborted for device removal, block %lu",
		      cacheblk->dbn);
		if (job)
			flashcache_free_cache_job(job);
		job = NULL;
		device_removal = 1;
	}
	if (unlikely(job == NULL)) {
		spin_lock_irq(&cache_set->set_spin_lock);
		cache_set->clean_inprog--;
		atomic_dec(&dmc->clean_inprog);
		flashcache_free_pending_jobs(dmc, cacheblk, -EIO);
		cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
		spin_unlock_irq(&cache_set->set_spin_lock);
		if (device_removal == 0)
			DMERR("flashcache: Dirty Writeback (for sync) failed ! Can't allocate memory, block %lu",
			      cacheblk->dbn);
	} else {
		job->bio = NULL;
		job->action = WRITEDISK_SYNC;
		atomic_inc(&dmc->nr_jobs);
		dmc->flashcache_stats.ssd_reads++;
		dmc->flashcache_stats.disk_writes++;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
		kcopyd_copy(flashcache_kcp_client, &job->job_io_regions.cache, 1,
			    &job->job_io_regions.disk, 0,
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
			    flashcache_kcopyd_callback_sync,
#else
			    (kcopyd_notify_fn) flashcache_kcopyd_callback_sync,
#endif
			    job);
#else
		dm_kcopyd_copy(flashcache_kcp_client, &job->job_io_regions.cache, 1,
			       &job->job_io_regions.disk, 0,
			       (dm_kcopyd_notify_fn)flashcache_kcopyd_callback_sync,
			       (void *)job);
#endif
	}
}

/*
 * Sync all dirty blocks. We pick off dirty blocks, sort them, merge them with
 * any contiguous blocks we can within the set and fire off the writes.
 */
void
flashcache_sync_blocks(struct cache_c *dmc)
{
	int index;
	struct dbn_index_pair *writes_list = NULL;
	struct dbn_index_pair *set_dirty_list = NULL;
	int nr_writes;
	int i, set;
	struct cacheblock *cacheblk;
	struct cache_set *cache_set;

	/*
	 * If a (fast) removal of this device is in progress, don't kick off
	 * any more cleanings. This isn't sufficient though. We still need to
	 * stop cleanings inside flashcache_dirty_writeback_sync() because we
	 * could have started a device remove after we tested this here.
	 */
	if ((atomic_read(&dmc->remove_in_prog) == FAST_REMOVE) ||
	    dmc->sysctl_stop_sync)
		return;
	if (atomic_read(&dmc->nr_dirty) == 0 ||
	    !(atomic_read(&dmc->sync_index) < dmc->size))
		/* Processed everything ?
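		 * (sync_index is a persistent cursor: each call resumes the
		 * scan where the previous one stopped, and flashcache_sync_all()
		 * resets it to 0 before kicking the scan off again.)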
		 */
		return;
	if (flashcache_diskclean_alloc(dmc, &writes_list, &set_dirty_list)) {
		dmc->flashcache_errors.memory_alloc_errors++;
		return;
	}
	nr_writes = 0;
	set = -1;
	index = atomic_read(&dmc->sync_index);
	set = index / dmc->assoc;
	cache_set = &dmc->cache_sets[set];
	spin_lock_irq(&cache_set->set_spin_lock);
	while (index < dmc->size &&
	       (nr_writes + atomic_read(&dmc->clean_inprog)) < dmc->max_clean_ios_total) {
		VERIFY(nr_writes <= dmc->assoc);
		if ((index % dmc->assoc) == 0) {
			if (nr_writes > 0) {
				/*
				 * Crossing a set, sort/merge all the IOs collected so
				 * far and issue the writes.
				 */
				flashcache_merge_writes(dmc, writes_list, set_dirty_list,
							&nr_writes, set);
				spin_unlock_irq(&cache_set->set_spin_lock);
				for (i = 0 ; i < nr_writes ; i++)
					flashcache_dirty_writeback_sync(dmc, writes_list[i].index);
				nr_writes = 0;
			} else
				spin_unlock_irq(&cache_set->set_spin_lock);
			set = index / dmc->assoc;
			cache_set = &dmc->cache_sets[set];
			spin_lock_irq(&cache_set->set_spin_lock);
		}
		cacheblk = &dmc->cache[index];
		if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) {
			cacheblk->cache_state |= DISKWRITEINPROG;
			flashcache_clear_fallow(dmc, index);
			writes_list[nr_writes].dbn = cacheblk->dbn;
			writes_list[nr_writes].index = index;
			nr_writes++;
		}
		index++;
	}
	atomic_set(&dmc->sync_index, index);
	if (nr_writes > 0) {
		VERIFY(set != -1);
		flashcache_merge_writes(dmc, writes_list, set_dirty_list, &nr_writes, set);
		spin_unlock_irq(&cache_set->set_spin_lock);
		for (i = 0 ; i < nr_writes ; i++)
			flashcache_dirty_writeback_sync(dmc, writes_list[i].index);
	} else
		spin_unlock_irq(&cache_set->set_spin_lock);
	flashcache_diskclean_free(dmc, writes_list, set_dirty_list);
}

void
flashcache_sync_all(struct cache_c *dmc)
{
	if (dmc->cache_mode != FLASHCACHE_WRITE_BACK)
		return;

	dmc->sysctl_stop_sync = 0;
	atomic_set(&dmc->sync_index, 0);
	flashcache_sync_blocks(dmc);
}

/*
 * We handle uncached IOs ourselves to deal with the problem of out-of-order
 * IOs corrupting the cache. Consider the case where we get 2 concurrent IOs
 * for the same block, Write-Read (or Write-Write). Consider the case where
 * the first Write is uncacheable and the second IO is cacheable. If the
 * 2 IOs are re-ordered below flashcache, then we will cache inconsistent
 * data in flashcache (persistently).
 *
 * We do invalidations before launching uncacheable IOs to disk. But in case
 * of re-ordering, invalidating before launching the IOs does not help.
 * We need to invalidate after the IO completes.
 *
 * Doing invalidations after the completion of an uncacheable IO will cause
 * any overlapping dirty blocks in the cache to be written out and the IO
 * relaunched. If the overlapping blocks are busy, the IO is relaunched to
 * disk also (post invalidation). In these 2 cases, we will end up sending
 * 2 disk IOs for the block. But this is a rare case.
 *
 * When 2 IOs for the same block are sent down (by non-cooperating processes),
 * the storage stack is allowed to re-order the IOs at will. So the applications
 * cannot expect any ordering at all.
 *
 * What we try to avoid here is inconsistencies between the disk and the ssd cache.
 */
void
flashcache_uncached_io_complete(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	int queued;
	int error = job->error;
	struct bio *bio = job->bio;

	if (unlikely(error)) {
		DMERR("flashcache uncached disk IO error: io error %d block %lu R/w %s",
		      error, job->job_io_regions.disk.sector,
		      (bio_data_dir(bio) == WRITE) ?
"WRITE" : "READ"); if (bio_data_dir(bio) == WRITE) dmc->flashcache_errors.disk_write_errors++; else dmc->flashcache_errors.disk_read_errors++; } flashcache_setlocks_multiget(dmc, bio); queued = flashcache_inval_blocks(dmc, bio); flashcache_setlocks_multidrop(dmc, bio); if (queued) { if (unlikely(queued < 0)) flashcache_bio_endio(bio, -EIO, dmc, NULL); /* * The IO will be re-executed. * The do_pending logic will re-launch the * disk IO post-invalidation calling start_uncached_io. * This should be a rare occurrence. */ dmc->flashcache_stats.uncached_io_requeue++; } else { flashcache_bio_endio(bio, error, dmc, &job->io_start_time); } flashcache_free_cache_job(job); if (atomic_dec_and_test(&dmc->nr_jobs)) wake_up(&dmc->destroyq); } static void flashcache_uncached_io_callback(unsigned long error, void *context) { struct kcached_job *job = (struct kcached_job *) context; VERIFY(job->index == -1); if (unlikely(error)) job->error = -EIO; else job->error = 0; push_uncached_io_complete(job); schedule_work(&_kcached_wq); } static void flashcache_start_uncached_io(struct cache_c *dmc, struct bio *bio) { int is_write = (bio_data_dir(bio) == WRITE); struct kcached_job *job; if (is_write) { dmc->flashcache_stats.uncached_writes++; dmc->flashcache_stats.disk_writes++; } else { dmc->flashcache_stats.uncached_reads++; dmc->flashcache_stats.disk_reads++; } job = new_kcached_job(dmc, bio, -1); if (unlikely(job == NULL)) { flashcache_bio_endio(bio, -EIO, dmc, NULL); return; } atomic_inc(&dmc->nr_jobs); dm_io_async_bvec(1, &job->job_io_regions.disk, ((is_write) ? WRITE : READ), bio, flashcache_uncached_io_callback, job); } EXPORT_SYMBOL(flashcache_io_callback); EXPORT_SYMBOL(flashcache_do_pending_error); EXPORT_SYMBOL(flashcache_do_pending_noerror); EXPORT_SYMBOL(flashcache_do_pending); EXPORT_SYMBOL(flashcache_do_io); EXPORT_SYMBOL(flashcache_map); EXPORT_SYMBOL(flashcache_write); EXPORT_SYMBOL(flashcache_inval_blocks); EXPORT_SYMBOL(flashcache_inval_block_set); EXPORT_SYMBOL(flashcache_read); EXPORT_SYMBOL(flashcache_read_miss); EXPORT_SYMBOL(flashcache_clean_set); EXPORT_SYMBOL(flashcache_dirty_writeback); EXPORT_SYMBOL(flashcache_kcopyd_callback); EXPORT_SYMBOL(flashcache_lookup); EXPORT_SYMBOL(flashcache_alloc_md_sector); EXPORT_SYMBOL(flashcache_free_md_sector); EXPORT_SYMBOL(flashcache_md_write_callback); EXPORT_SYMBOL(flashcache_md_write_kickoff); EXPORT_SYMBOL(flashcache_md_write_done); EXPORT_SYMBOL(flashcache_md_write); EXPORT_SYMBOL(hash_block); flashcache-3.1.3+git20150701/src/flashcache_procfs.c000066400000000000000000001055501254507146700216070ustar00rootroot00000000000000/**************************************************************************** * flashcache_conf.c * FlashCache: Device mapper target for block-level disk caching * * Copyright 2010 Facebook, Inc. * Author: Mohan Srinivasan (mohan@fb.com) * * Based on DM-Cache: * Copyright (C) International Business Machines Corp., 2006 * Author: Ming Zhao (mingzhao@ufl.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
****************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) #include "dm.h" #include "dm-io.h" #include "dm-bio-list.h" #include "kcopyd.h" #else #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) #include "dm.h" #endif #include #include #include #endif #include "flashcache.h" #include "flashcache_ioctl.h" static int fallow_clean_speed_min = FALLOW_SPEED_MIN; static int fallow_clean_speed_max = FALLOW_SPEED_MAX; extern u_int64_t size_hist[]; static char *flashcache_cons_procfs_cachename(struct cache_c *dmc, char *path_component); static char *flashcache_cons_sysctl_devname(struct cache_c *dmc); #define FLASHCACHE_PROC_ROOTDIR_NAME "flashcache" static int #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) flashcache_io_latency_init(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) #else flashcache_io_latency_init(ctl_table *table, int write, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) struct file *file, #endif void __user *buffer, size_t *length, loff_t *ppos) #endif { struct cache_c *dmc = (struct cache_c *)table->extra1; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) proc_dointvec(table, write, file, buffer, length, ppos); #else proc_dointvec(table, write, buffer, length, ppos); #endif if (write) { if (dmc->sysctl_io_latency_hist) { int i; for (i = 0 ; i < IO_LATENCY_BUCKETS ; i++) dmc->latency_hist[i] = 0; dmc->latency_hist_10ms = 0; } } return 0; } static int #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) flashcache_sync_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) #else flashcache_sync_sysctl(ctl_table *table, int write, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) struct file *file, #endif void __user *buffer, size_t *length, loff_t *ppos) #endif { struct cache_c *dmc = (struct cache_c *)table->extra1; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) proc_dointvec(table, write, file, buffer, length, ppos); #else proc_dointvec(table, write, buffer, length, ppos); #endif if (write) { if (dmc->sysctl_do_sync) { dmc->sysctl_stop_sync = 0; cancel_delayed_work(&dmc->delayed_clean); flush_scheduled_work(); flashcache_sync_all(dmc); } } return 0; } static int #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) flashcache_zerostats_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) #else flashcache_zerostats_sysctl(ctl_table *table, int write, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) struct file *file, #endif void __user *buffer, size_t *length, loff_t *ppos) #endif { struct cache_c *dmc = (struct cache_c *)table->extra1; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) proc_dointvec(table, write, file, buffer, length, ppos); #else proc_dointvec(table, write, buffer, length, ppos); #endif if (write) { if (dmc->sysctl_zerostats) { int i; memset(&dmc->flashcache_stats, 0, sizeof(struct flashcache_stats)); for (i = 0 ; i < IO_LATENCY_BUCKETS ; i++) dmc->latency_hist[i] = 0; dmc->latency_hist_10ms = 0; } } return 0; } static int #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) flashcache_fallow_clean_speed_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) #else flashcache_fallow_clean_speed_sysctl(ctl_table *table, int write, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) struct file 
*file, #endif void __user *buffer, size_t *length, loff_t *ppos) #endif { struct cache_c *dmc = (struct cache_c *)table->extra1; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) proc_dointvec(table, write, file, buffer, length, ppos); #else proc_dointvec(table, write, buffer, length, ppos); #endif if (write) { if (dmc->sysctl_fallow_clean_speed < fallow_clean_speed_min) dmc->sysctl_fallow_clean_speed = fallow_clean_speed_min; if (dmc->sysctl_fallow_clean_speed > fallow_clean_speed_max) dmc->sysctl_fallow_clean_speed = fallow_clean_speed_max; } return 0; } static int #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) flashcache_dirty_thresh_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) #else flashcache_dirty_thresh_sysctl(ctl_table *table, int write, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) struct file *file, #endif void __user *buffer, size_t *length, loff_t *ppos) #endif { struct cache_c *dmc = (struct cache_c *)table->extra1; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) proc_dointvec(table, write, file, buffer, length, ppos); #else proc_dointvec(table, write, buffer, length, ppos); #endif if (write) { if (dmc->sysctl_dirty_thresh > DIRTY_THRESH_MAX) dmc->sysctl_dirty_thresh = DIRTY_THRESH_MAX; if (dmc->sysctl_dirty_thresh < DIRTY_THRESH_MIN) dmc->sysctl_dirty_thresh = DIRTY_THRESH_MIN; dmc->dirty_thresh_set = (dmc->assoc * dmc->sysctl_dirty_thresh) / 100; } return 0; } static int #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) flashcache_lru_hot_pct_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) #else flashcache_lru_hot_pct_sysctl(ctl_table *table, int write, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) struct file *file, #endif void __user *buffer, size_t *length, loff_t *ppos) #endif { struct cache_c *dmc = (struct cache_c *)table->extra1; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) proc_dointvec(table, write, file, buffer, length, ppos); #else proc_dointvec(table, write, buffer, length, ppos); #endif if (write) flashcache_reclaim_rebalance_lru(dmc, dmc->sysctl_lru_hot_pct); return 0; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) #define CTL_UNNUMBERED -2 #endif /* * Each ctl_table array needs to be 1 more than the actual number of * entries - zero padded at the end ! Therefore the NUM_*_SYSCTLS * is 1 more than then number of sysctls. 
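 * (Worked out for the writeback table below: 21 live entries, from
 * "io_latency_hist" through "new_style_write_merge", + 1 zero pad = 22.)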
*/ #define FLASHCACHE_NUM_WRITEBACK_SYSCTLS 22 static struct flashcache_writeback_sysctl_table { struct ctl_table_header *sysctl_header; #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) struct ctl_table vars[FLASHCACHE_NUM_WRITEBACK_SYSCTLS]; struct ctl_table dev[2]; struct ctl_table dir[2]; struct ctl_table root[2]; #else ctl_table vars[FLASHCACHE_NUM_WRITEBACK_SYSCTLS]; ctl_table dev[2]; ctl_table dir[2]; ctl_table root[2]; #endif } flashcache_writeback_sysctl = { .vars = { { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "io_latency_hist", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &flashcache_io_latency_init, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .strategy = &sysctl_intvec, #endif }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "do_sync", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &flashcache_sync_sysctl, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .strategy = &sysctl_intvec, #endif }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "stop_sync", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "dirty_thresh_pct", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &flashcache_dirty_thresh_sysctl, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .strategy = &sysctl_intvec, #endif }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "max_clean_ios_total", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "max_clean_ios_set", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "do_pid_expiry", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "max_pids", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "pid_expiry_secs", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "reclaim_policy", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "zero_stats", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &flashcache_zerostats_sysctl, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .strategy = &sysctl_intvec, #endif }, #ifdef notdef /* * Disable this for all except devel builds * If you enable this, you must bump FLASHCACHE_NUM_WRITEBACK_SYSCTLS * by 1 ! 
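 * (i.e. 22 -> 23 while error_inject is compiled in.)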
*/ { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "error_inject", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, #endif { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "fast_remove", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "cache_all", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "fallow_clean_speed", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &flashcache_fallow_clean_speed_sysctl, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .strategy = &sysctl_intvec, #endif }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "fallow_delay", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "skip_seq_thresh_kb", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "clean_on_read_miss", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "clean_on_write_miss", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "lru_promote_thresh", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "lru_hot_pct", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &flashcache_lru_hot_pct_sysctl, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .strategy = &sysctl_intvec, #endif }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "new_style_write_merge", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, }, .dev = { { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "flashcache-dev", .maxlen = 0, .mode = S_IRUGO|S_IXUGO, .child = flashcache_writeback_sysctl.vars, }, }, .dir = { { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = FLASHCACHE_PROC_ROOTDIR_NAME, .maxlen = 0, .mode = S_IRUGO|S_IXUGO, .child = flashcache_writeback_sysctl.dev, }, }, .root = { { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_DEV, #endif .procname = "dev", .maxlen = 0, .mode = 0555, .child = flashcache_writeback_sysctl.dir, }, }, }; /* * Each ctl_table array needs to be 1 more than the actual number of * entries - zero padded at the end ! Therefore the NUM_*_SYSCTLS * is 1 more than then number of sysctls. 
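 * (For the writethrough table below: 10 live entries + 1 zero pad = 11.)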
*/ #define FLASHCACHE_NUM_WRITETHROUGH_SYSCTLS 11 static struct flashcache_writethrough_sysctl_table { struct ctl_table_header *sysctl_header; #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) struct ctl_table vars[FLASHCACHE_NUM_WRITETHROUGH_SYSCTLS]; struct ctl_table dev[2]; struct ctl_table dir[2]; struct ctl_table root[2]; #else ctl_table vars[FLASHCACHE_NUM_WRITETHROUGH_SYSCTLS]; ctl_table dev[2]; ctl_table dir[2]; ctl_table root[2]; #endif } flashcache_writethrough_sysctl = { .vars = { { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "io_latency_hist", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &flashcache_io_latency_init, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .strategy = &sysctl_intvec, #endif }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "do_pid_expiry", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "max_pids", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "pid_expiry_secs", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "reclaim_policy", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "zero_stats", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &flashcache_zerostats_sysctl, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .strategy = &sysctl_intvec, #endif }, #ifdef notdef /* * Disable this for all except devel builds * If you enable this, you must bump FLASHCACHE_NUM_WRITEBACK_SYSCTLS * by 1 ! 
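 * (For this table that means FLASHCACHE_NUM_WRITETHROUGH_SYSCTLS: 11 -> 12.)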
*/ { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "error_inject", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, #endif { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "cache_all", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "skip_seq_thresh_kb", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "lru_promote_thresh", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "lru_hot_pct", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &flashcache_lru_hot_pct_sysctl, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .strategy = &sysctl_intvec, #endif }, }, .dev = { { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = "flashcache-dev", .maxlen = 0, .mode = S_IRUGO|S_IXUGO, .child = flashcache_writethrough_sysctl.vars, }, }, .dir = { { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_UNNUMBERED, #endif .procname = FLASHCACHE_PROC_ROOTDIR_NAME, .maxlen = 0, .mode = S_IRUGO|S_IXUGO, .child = flashcache_writethrough_sysctl.dev, }, }, .root = { { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) .ctl_name = CTL_DEV, #endif .procname = "dev", .maxlen = 0, .mode = 0555, .child = flashcache_writethrough_sysctl.dir, }, }, }; int * #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) flashcache_find_sysctl_data(struct cache_c *dmc, struct ctl_table *vars) #else flashcache_find_sysctl_data(struct cache_c *dmc, ctl_table *vars) #endif { if (strcmp(vars->procname, "io_latency_hist") == 0) return &dmc->sysctl_io_latency_hist; else if (strcmp(vars->procname, "do_sync") == 0) return &dmc->sysctl_do_sync; else if (strcmp(vars->procname, "stop_sync") == 0) return &dmc->sysctl_stop_sync; else if (strcmp(vars->procname, "dirty_thresh_pct") == 0) return &dmc->sysctl_dirty_thresh; else if (strcmp(vars->procname, "max_clean_ios_total") == 0) return &dmc->max_clean_ios_total; else if (strcmp(vars->procname, "max_clean_ios_set") == 0) return &dmc->max_clean_ios_set; else if (strcmp(vars->procname, "do_pid_expiry") == 0) return &dmc->sysctl_pid_do_expiry; else if (strcmp(vars->procname, "max_pids") == 0) return &dmc->sysctl_max_pids; else if (strcmp(vars->procname, "pid_expiry_secs") == 0) return &dmc->sysctl_pid_expiry_secs; else if (strcmp(vars->procname, "reclaim_policy") == 0) return &dmc->sysctl_reclaim_policy; else if (strcmp(vars->procname, "zero_stats") == 0) return &dmc->sysctl_zerostats; else if (strcmp(vars->procname, "error_inject") == 0) return &dmc->sysctl_error_inject; else if (strcmp(vars->procname, "fast_remove") == 0) return &dmc->sysctl_fast_remove; else if (strcmp(vars->procname, "cache_all") == 0) return &dmc->sysctl_cache_all; else if (strcmp(vars->procname, "fallow_clean_speed") == 0) return &dmc->sysctl_fallow_clean_speed; else if (strcmp(vars->procname, "fallow_delay") == 0) return &dmc->sysctl_fallow_delay; else if (strcmp(vars->procname, "skip_seq_thresh_kb") == 0) return &dmc->sysctl_skip_seq_thresh_kb; else if (strcmp(vars->procname, "clean_on_read_miss") == 0) return &dmc->sysctl_clean_on_read_miss; else if (strcmp(vars->procname, "clean_on_write_miss") == 0) 
return &dmc->sysctl_clean_on_write_miss; else if (strcmp(vars->procname, "lru_promote_thresh") == 0) return &dmc->sysctl_lru_promote_thresh; else if (strcmp(vars->procname, "lru_hot_pct") == 0) return &dmc->sysctl_lru_hot_pct; else if (strcmp(vars->procname, "new_style_write_merge") == 0) return &dmc->sysctl_new_style_write_merge; printk(KERN_ERR "flashcache_find_sysctl_data: Unknown sysctl %s\n", vars->procname); panic("flashcache_find_sysctl_data: Unknown sysctl %s\n", vars->procname); return NULL; } static void flashcache_writeback_sysctl_register(struct cache_c *dmc) { int i; struct flashcache_writeback_sysctl_table *t; t = kmemdup(&flashcache_writeback_sysctl, sizeof(*t), GFP_KERNEL); if (t == NULL) return; for (i = 0 ; i < ARRAY_SIZE(t->vars) - 1 ; i++) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) t->vars[i].de = NULL; #endif t->vars[i].data = flashcache_find_sysctl_data(dmc, &t->vars[i]); t->vars[i].extra1 = dmc; } t->dev[0].procname = flashcache_cons_sysctl_devname(dmc); t->dev[0].child = t->vars; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) t->dev[0].de = NULL; #endif t->dir[0].child = t->dev; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) t->dir[0].de = NULL; #endif t->root[0].child = t->dir; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) t->root[0].de = NULL; #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) t->sysctl_header = register_sysctl_table(t->root, 0); #else t->sysctl_header = register_sysctl_table(t->root); #endif if (t->sysctl_header == NULL) goto out; dmc->sysctl_handle = t; return; out: kfree(t->dev[0].procname); kfree(t); } static void flashcache_writeback_sysctl_unregister(struct cache_c *dmc) { struct flashcache_writeback_sysctl_table *t; t = dmc->sysctl_handle; if (t != NULL) { dmc->sysctl_handle = NULL; unregister_sysctl_table(t->sysctl_header); kfree(t->dev[0].procname); kfree(t); } } static void flashcache_writethrough_sysctl_register(struct cache_c *dmc) { int i; struct flashcache_writethrough_sysctl_table *t; t = kmemdup(&flashcache_writethrough_sysctl, sizeof(*t), GFP_KERNEL); if (t == NULL) return; for (i = 0 ; i < ARRAY_SIZE(t->vars) - 1 ; i++) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) t->vars[i].de = NULL; #endif t->vars[i].data = flashcache_find_sysctl_data(dmc, &t->vars[i]); t->vars[i].extra1 = dmc; } t->dev[0].procname = flashcache_cons_sysctl_devname(dmc); t->dev[0].child = t->vars; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) t->dev[0].de = NULL; #endif t->dir[0].child = t->dev; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) t->dir[0].de = NULL; #endif t->root[0].child = t->dir; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) t->root[0].de = NULL; #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) t->sysctl_header = register_sysctl_table(t->root, 0); #else t->sysctl_header = register_sysctl_table(t->root); #endif if (t->sysctl_header == NULL) goto out; dmc->sysctl_handle = t; return; out: kfree(t->dev[0].procname); kfree(t); } static void flashcache_writethrough_sysctl_unregister(struct cache_c *dmc) { struct flashcache_writethrough_sysctl_table *t; t = dmc->sysctl_handle; if (t != NULL) { dmc->sysctl_handle = NULL; unregister_sysctl_table(t->sysctl_header); kfree(t->dev[0].procname); kfree(t); } } static int flashcache_stats_show(struct seq_file *seq, void *v) { struct cache_c *dmc = seq->private; struct flashcache_stats *stats; int read_hit_pct, write_hit_pct, dirty_write_hit_pct; stats = &dmc->flashcache_stats; if (stats->reads > 0) read_hit_pct = stats->read_hits * 100 / stats->reads; else read_hit_pct = 0; if 
(stats->writes > 0) { write_hit_pct = stats->write_hits * 100 / stats->writes; dirty_write_hit_pct = stats->dirty_write_hits * 100 / stats->writes; } else { write_hit_pct = 0; dirty_write_hit_pct = 0; } seq_printf(seq, "reads=%lu writes=%lu \n", stats->reads, stats->writes); seq_printf(seq, "read_hits=%lu read_hit_percent=%d ", stats->read_hits, read_hit_pct); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK || dmc->cache_mode == FLASHCACHE_WRITE_THROUGH) { seq_printf(seq, "write_hits=%lu write_hit_percent=%d ", stats->write_hits, write_hit_pct); } if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { seq_printf(seq, "dirty_write_hits=%lu dirty_write_hit_percent=%d ", stats->dirty_write_hits, dirty_write_hit_pct); } if (dmc->cache_mode == FLASHCACHE_WRITE_BACK || dmc->cache_mode == FLASHCACHE_WRITE_THROUGH) { seq_printf(seq, "replacement=%lu write_replacement=%lu ", stats->replace, stats->wr_replace); seq_printf(seq, "write_invalidates=%lu read_invalidates=%lu ", stats->wr_invalidates, stats->rd_invalidates); } else { /* WRITE_AROUND */ seq_printf(seq, "replacement=%lu ", stats->replace); seq_printf(seq, "read_invalidates=%lu ", stats->rd_invalidates); } #ifdef FLASHCACHE_DO_CHECKSUMS seq_printf(seq, "checksum_store=%ld checksum_valid=%ld checksum_invalid=%ld ", stats->checksum_store, stats->checksum_valid, stats->checksum_invalid); #endif seq_printf(seq, "pending_enqueues=%lu pending_inval=%lu ", stats->enqueues, stats->pending_inval); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { seq_printf(seq, "metadata_dirties=%lu metadata_cleans=%lu ", stats->md_write_dirty, stats->md_write_clean); seq_printf(seq, "metadata_batch=%lu metadata_ssd_writes=%lu ", stats->md_write_batch, stats->md_ssd_writes); seq_printf(seq, "cleanings=%lu fallow_cleanings=%lu ", stats->cleanings, stats->fallow_cleanings); } seq_printf(seq, "no_room=%lu ", stats->noroom); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) { seq_printf(seq, "front_merge=%lu back_merge=%lu ", stats->front_merge, stats->back_merge); } seq_printf(seq, "disk_reads=%lu disk_writes=%lu ssd_reads=%lu ssd_writes=%lu ", stats->disk_reads, stats->disk_writes, stats->ssd_reads, stats->ssd_writes); seq_printf(seq, "uncached_reads=%lu uncached_writes=%lu uncached_IO_requeue=%lu ", stats->uncached_reads, stats->uncached_writes, stats->uncached_io_requeue); seq_printf(seq, "uncached_sequential_reads=%lu uncached_sequential_writes=%lu ", stats->uncached_sequential_reads, stats->uncached_sequential_writes); seq_printf(seq, "pid_adds=%lu pid_dels=%lu pid_drops=%lu pid_expiry=%lu\n", stats->pid_adds, stats->pid_dels, stats->pid_drops, stats->expiry); return 0; } static int flashcache_stats_open(struct inode *inode, struct file *file) { #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) return single_open(file, &flashcache_stats_show, PDE(inode)->data); #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) return single_open(file, &flashcache_stats_show, PDE_DATA(inode)); #endif } static struct file_operations flashcache_stats_operations = { .open = flashcache_stats_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int flashcache_errors_show(struct seq_file *seq, void *v) { struct cache_c *dmc = seq->private; seq_printf(seq, "disk_read_errors=%d disk_write_errors=%d ", dmc->flashcache_errors.disk_read_errors, dmc->flashcache_errors.disk_write_errors); seq_printf(seq, "ssd_read_errors=%d ssd_write_errors=%d ", dmc->flashcache_errors.ssd_read_errors, dmc->flashcache_errors.ssd_write_errors); seq_printf(seq, "memory_alloc_errors=%d\n", 
dmc->flashcache_errors.memory_alloc_errors); return 0; } static int flashcache_errors_open(struct inode *inode, struct file *file) { #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) return single_open(file, &flashcache_errors_show, PDE(inode)->data); #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) return single_open(file, &flashcache_errors_show, PDE_DATA(inode)); #endif } static struct file_operations flashcache_errors_operations = { .open = flashcache_errors_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int flashcache_iosize_hist_show(struct seq_file *seq, void *v) { int i; for (i = 1 ; i <= 32 ; i++) { seq_printf(seq, "%d:%llu ", i*512, size_hist[i]); } seq_printf(seq, "\n"); return 0; } static int flashcache_iosize_hist_open(struct inode *inode, struct file *file) { #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) return single_open(file, &flashcache_iosize_hist_show, PDE(inode)->data); #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) return single_open(file, &flashcache_iosize_hist_show, PDE_DATA(inode)); #endif } static struct file_operations flashcache_iosize_hist_operations = { .open = flashcache_iosize_hist_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int flashcache_pidlists_show(struct seq_file *seq, void *v) { struct cache_c *dmc = seq->private; struct flashcache_cachectl_pid *pid_list; unsigned long flags; spin_lock_irqsave(&dmc->ioctl_lock, flags); seq_printf(seq, "Blacklist: "); pid_list = dmc->blacklist_head; while (pid_list != NULL) { seq_printf(seq, "%u ", pid_list->pid); pid_list = pid_list->next; } seq_printf(seq, "\n"); seq_printf(seq, "Whitelist: "); pid_list = dmc->whitelist_head; while (pid_list != NULL) { seq_printf(seq, "%u ", pid_list->pid); pid_list = pid_list->next; } seq_printf(seq, "\n"); spin_unlock_irqrestore(&dmc->ioctl_lock, flags); return 0; } static int flashcache_pidlists_open(struct inode *inode, struct file *file) { #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) return single_open(file, &flashcache_pidlists_show, PDE(inode)->data); #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) return single_open(file, &flashcache_pidlists_show, PDE_DATA(inode)); #endif } static struct file_operations flashcache_pidlists_operations = { .open = flashcache_pidlists_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; extern char *flashcache_sw_version; static int flashcache_version_show(struct seq_file *seq, void *v) { seq_printf(seq, "Flashcache Version : %s\n", flashcache_sw_version); #ifdef COMMIT_REV seq_printf(seq, "git commit: %s\n", COMMIT_REV); #endif return 0; } static int flashcache_version_open(struct inode *inode, struct file *file) { #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) return single_open(file, &flashcache_version_show, PDE(inode)->data); #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) return single_open(file, &flashcache_version_show, PDE_DATA(inode)); #endif } static struct file_operations flashcache_version_operations = { .open = flashcache_version_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; void flashcache_module_procfs_init(void) { #ifdef CONFIG_PROC_FS struct proc_dir_entry *entry; if (proc_mkdir("flashcache", NULL)) { #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) entry = create_proc_entry("flashcache/flashcache_version", 0, NULL); if (entry) entry->proc_fops = &flashcache_version_operations; #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) entry = 
proc_create("flashcache/flashcache_version", 0, NULL, &flashcache_version_operations); #endif } #endif /* CONFIG_PROC_FS */ } void flashcache_module_procfs_release(void) { #ifdef CONFIG_PROC_FS (void)remove_proc_entry("flashcache/flashcache_version", NULL); (void)remove_proc_entry("flashcache", NULL); #endif /* CONFIG_PROC_FS */ } static char * flashcache_cons_sysctl_devname(struct cache_c *dmc) { char *pathname; pathname = kzalloc(strlen(dmc->cache_devname) + strlen(dmc->disk_devname) + 2, GFP_KERNEL); strcpy(pathname, strrchr(dmc->cache_devname, '/') + 1); strcat(pathname, "+"); strcat(pathname, strrchr(dmc->disk_devname, '/') + 1); return pathname; } static char * flashcache_cons_procfs_cachename(struct cache_c *dmc, char *path_component) { char *pathname; char *s; pathname = kzalloc(strlen(dmc->cache_devname) + strlen(dmc->disk_devname) + 4 + strlen(FLASHCACHE_PROC_ROOTDIR_NAME) + strlen(path_component), GFP_KERNEL); strcpy(pathname, FLASHCACHE_PROC_ROOTDIR_NAME); strcat(pathname, "/"); s = strrchr(dmc->cache_devname, '/'); if (s) s++; else s = dmc->cache_devname; strcat(pathname, s); strcat(pathname, "+"); s = strrchr(dmc->disk_devname, '/'); if (s) s++; else s = dmc->disk_devname; strcat(pathname, s); if (strcmp(path_component, "") != 0) { strcat(pathname, "/"); strcat(pathname, path_component); } return pathname; } void flashcache_ctr_procfs(struct cache_c *dmc) { char *s; struct proc_dir_entry *entry; s = flashcache_cons_procfs_cachename(dmc, ""); entry = proc_mkdir(s, NULL); kfree(s); if (entry == NULL) return; s = flashcache_cons_procfs_cachename(dmc, "flashcache_stats"); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) entry = create_proc_entry(s, 0, NULL); if (entry) { entry->proc_fops = &flashcache_stats_operations; entry->data = dmc; } #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) entry = proc_create_data(s, 0, NULL, &flashcache_stats_operations, dmc); #endif kfree(s); s = flashcache_cons_procfs_cachename(dmc, "flashcache_errors"); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) entry = create_proc_entry(s, 0, NULL); if (entry) { entry->proc_fops = &flashcache_errors_operations; entry->data = dmc; } #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) entry = proc_create_data(s, 0, NULL, &flashcache_errors_operations, dmc); #endif kfree(s); s = flashcache_cons_procfs_cachename(dmc, "flashcache_iosize_hist"); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) entry = create_proc_entry(s, 0, NULL); if (entry) { entry->proc_fops = &flashcache_iosize_hist_operations; entry->data = dmc; } #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) entry = proc_create_data(s, 0, NULL, &flashcache_iosize_hist_operations, dmc); #endif kfree(s); s = flashcache_cons_procfs_cachename(dmc, "flashcache_pidlists"); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) entry = create_proc_entry(s, 0, NULL); if (entry) { entry->proc_fops = &flashcache_pidlists_operations; entry->data = dmc; } #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) entry = proc_create_data(s, 0, NULL, &flashcache_pidlists_operations, dmc); #endif kfree(s); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) flashcache_writeback_sysctl_register(dmc); else flashcache_writethrough_sysctl_register(dmc); } void flashcache_dtr_procfs(struct cache_c *dmc) { char *s; s = flashcache_cons_procfs_cachename(dmc, "flashcache_stats"); remove_proc_entry(s, NULL); kfree(s); s = flashcache_cons_procfs_cachename(dmc, "flashcache_errors"); remove_proc_entry(s, NULL); kfree(s); s = flashcache_cons_procfs_cachename(dmc, 
"flashcache_iosize_hist"); remove_proc_entry(s, NULL); kfree(s); s = flashcache_cons_procfs_cachename(dmc, "flashcache_pidlists"); remove_proc_entry(s, NULL); kfree(s); s = flashcache_cons_procfs_cachename(dmc, ""); remove_proc_entry(s, NULL); kfree(s); if (dmc->cache_mode == FLASHCACHE_WRITE_BACK) flashcache_writeback_sysctl_unregister(dmc); else flashcache_writethrough_sysctl_unregister(dmc); } flashcache-3.1.3+git20150701/src/flashcache_reclaim.c000066400000000000000000000436401254507146700217300ustar00rootroot00000000000000/**************************************************************************** * flashcache_reclaim.c * FlashCache: Device mapper target for block-level disk caching * * Copyright 2010 Facebook, Inc. * Author: Mohan Srinivasan (mohan@facebook.com) * * Based on DM-Cache: * Copyright (C) International Business Machines Corp., 2006 * Author: Ming Zhao (mingzhao@ufl.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; under version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . ****************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) #include "dm.h" #include "dm-io.h" #include "dm-bio-list.h" #include "kcopyd.h" #else #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) #include "dm.h" #endif #include #include #include #endif #include "flashcache.h" static void flashcache_reclaim_remove_block_from_list(struct cache_c *dmc, int index); static void flashcache_reclaim_add_block_to_list_mru(struct cache_c *dmc, int index); static void flashcache_reclaim_add_block_to_list_lru(struct cache_c *dmc, int index); static int flashcache_reclaim_demote_block(struct cache_c *dmc, int index); /* Get least recently used FIFO block */ void flashcache_reclaim_fifo_get_old_block(struct cache_c *dmc, int start_index, int *index) { int set = start_index / dmc->assoc; struct cache_set *cache_set = &dmc->cache_sets[set]; int end_index = start_index + dmc->assoc; int slots_searched = 0; int i; i = cache_set->set_fifo_next; while (slots_searched < dmc->assoc) { VERIFY(i >= start_index); VERIFY(i < end_index); if (dmc->cache[i].cache_state == VALID) { *index = i; VERIFY((dmc->cache[*index].cache_state & FALLOW_DOCLEAN) == 0); break; } slots_searched++; i++; if (i == end_index) i = start_index; } i++; if (i == end_index) i = start_index; cache_set->set_fifo_next = i; } /* Rebalance the hot/warm LRU block sizing in each set */ void flashcache_reclaim_rebalance_lru(struct cache_c *dmc, int new_lru_hot_pct) { int new_hot_blocks, old_hot_blocks; int set, index; struct cache_set *cache_set; struct cacheblock *cacheblk; int blocks_to_move, moved; int start_index; if (new_lru_hot_pct > 100 || new_lru_hot_pct < 0) return; new_hot_blocks = (dmc->assoc * new_lru_hot_pct) / 100; old_hot_blocks = (dmc->assoc * atomic_read(&dmc->hot_list_pct)) / 100; if (new_hot_blocks > old_hot_blocks) { /* Move the requisite blocks from warm list -> hot list 
		   for each set */
		blocks_to_move = new_hot_blocks - old_hot_blocks;
		for (set = 0 ; set < (dmc->size >> dmc->assoc_shift) ; set++) {
			start_index = set * dmc->assoc;
			cache_set = &dmc->cache_sets[set];
			spin_lock_irq(&cache_set->set_spin_lock);
			moved = 0;
			while ((cache_set->warmlist_lru_head != FLASHCACHE_NULL) &&
			       (moved < blocks_to_move)) {
				index = cache_set->warmlist_lru_head + start_index;
				flashcache_reclaim_remove_block_from_list(dmc, index);
				cacheblk = &dmc->cache[index];
				cacheblk->lru_state &= ~LRU_WARM;
				cacheblk->lru_state |= LRU_HOT;
				cacheblk->use_cnt = 0;
				flashcache_reclaim_add_block_to_list_lru(dmc, index);
				moved++;
			}
			spin_unlock_irq(&cache_set->set_spin_lock);
		}
	} else {
		/* Move the requisite blocks from hot list -> warm list */
		blocks_to_move = old_hot_blocks - new_hot_blocks;
		for (set = 0 ; set < (dmc->size >> dmc->assoc_shift) ; set++) {
			start_index = set * dmc->assoc;
			cache_set = &dmc->cache_sets[set];
			spin_lock_irq(&cache_set->set_spin_lock);
			moved = 0;
			while ((cache_set->hotlist_lru_head != FLASHCACHE_NULL) &&
			       (moved < blocks_to_move)) {
				index = cache_set->hotlist_lru_head + start_index;
				flashcache_reclaim_remove_block_from_list(dmc, index);
				cacheblk = &dmc->cache[index];
				cacheblk->lru_state &= ~LRU_HOT;
				cacheblk->lru_state |= LRU_WARM;
				cacheblk->use_cnt = 0;
				flashcache_reclaim_add_block_to_list_lru(dmc, index);
				moved++;
			}
			spin_unlock_irq(&cache_set->set_spin_lock);
		}
	}
	atomic_set(&dmc->hot_list_pct, new_lru_hot_pct);
}

/* For each set, split available blocks into the 2 LRU Queues */
void
flashcache_reclaim_init_lru_lists(struct cache_c *dmc)
{
	int hot_blocks_set;
	int set, j, block_index;
	struct cache_set *cache_set;
	int start_index;
	struct cacheblock *cacheblk;

	hot_blocks_set = (dmc->assoc * atomic_read(&dmc->hot_list_pct)) / 100;
	for (set = 0 ; set < (dmc->size >> dmc->assoc_shift) ; set++) {
		cache_set = &dmc->cache_sets[set];
		spin_lock_irq(&cache_set->set_spin_lock);
		start_index = set * dmc->assoc;
		for (j = 0 ; j < hot_blocks_set ; j++) {
			block_index = start_index + j;
			cacheblk = &dmc->cache[block_index];
			cacheblk->lru_prev = FLASHCACHE_NULL;
			cacheblk->lru_next = FLASHCACHE_NULL;
			cacheblk->lru_state = LRU_HOT;
			flashcache_reclaim_add_block_to_list_lru(dmc, block_index);
		}
		for ( ; j < dmc->assoc; j++) {
			block_index = start_index + j;
			cacheblk = &dmc->cache[block_index];
			cacheblk->lru_prev = cacheblk->lru_next = FLASHCACHE_NULL;
			cacheblk->lru_state = LRU_WARM;
			flashcache_reclaim_add_block_to_list_lru(dmc, block_index);
		}
		spin_unlock_irq(&cache_set->set_spin_lock);
	}
}

/* Removes a block from its list */
static void
flashcache_reclaim_remove_block_from_list(struct cache_c *dmc, int index)
{
	int set = index / dmc->assoc;
	int start_index = set * dmc->assoc;
	struct cacheblock *cacheblk = &dmc->cache[index];
	struct cache_set *cache_set = &dmc->cache_sets[set];

	/* At least one should be set */
	VERIFY((cacheblk->lru_state & (LRU_WARM | LRU_HOT)) != 0);
	/* Both should not be set */
	VERIFY((cacheblk->lru_state & (LRU_WARM | LRU_HOT)) != (LRU_HOT | LRU_WARM));
	if (unlikely((cacheblk->lru_prev == FLASHCACHE_NULL) &&
		     (cacheblk->lru_next == FLASHCACHE_NULL))) {
		/*
		 * Is this the only member on the list ? Or is this not on the list
		 * at all ?
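		 *
		 * (From the block itself the two cases are indistinguishable:
		 * lru_prev and lru_next are both FLASHCACHE_NULL either way.
		 * The list heads settle it below. If the relevant head and
		 * tail are both null, the block was never on the list; else
		 * it is the sole member, and unlinking it resets head and
		 * tail to null.)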
		 */
		if (cacheblk->lru_state & LRU_WARM) {
			if (cache_set->warmlist_lru_head == FLASHCACHE_NULL &&
			    cache_set->warmlist_lru_tail == FLASHCACHE_NULL)
				return;
		} else {
			if (cache_set->hotlist_lru_head == FLASHCACHE_NULL &&
			    cache_set->hotlist_lru_tail == FLASHCACHE_NULL)
				return;
		}
	}
	if (cacheblk->lru_prev != FLASHCACHE_NULL)
		dmc->cache[cacheblk->lru_prev + start_index].lru_next = cacheblk->lru_next;
	else {
		if (cacheblk->lru_state & LRU_WARM)
			cache_set->warmlist_lru_head = cacheblk->lru_next;
		else
			cache_set->hotlist_lru_head = cacheblk->lru_next;
	}
	if (cacheblk->lru_next != FLASHCACHE_NULL)
		dmc->cache[cacheblk->lru_next + start_index].lru_prev = cacheblk->lru_prev;
	else {
		if (cacheblk->lru_state & LRU_WARM)
			cache_set->warmlist_lru_tail = cacheblk->lru_prev;
		else
			cache_set->hotlist_lru_tail = cacheblk->lru_prev;
	}
	if (cacheblk->lru_state & LRU_WARM) {
		dmc->lru_warm_blocks--;
		cache_set->lru_warm_blocks--;
		if (cache_set->lru_warm_blocks == 0) {
			VERIFY(cache_set->warmlist_lru_head == FLASHCACHE_NULL);
			VERIFY(cache_set->warmlist_lru_tail == FLASHCACHE_NULL);
		}
		if (cache_set->warmlist_lru_head != FLASHCACHE_NULL)
			VERIFY(cache_set->lru_warm_blocks > 0);
		if (cache_set->warmlist_lru_tail != FLASHCACHE_NULL)
			VERIFY(cache_set->lru_warm_blocks > 0);
	} else {
		dmc->lru_hot_blocks--;
		cache_set->lru_hot_blocks--;
		if (cache_set->lru_hot_blocks == 0) {
			VERIFY(cache_set->hotlist_lru_head == FLASHCACHE_NULL);
			VERIFY(cache_set->hotlist_lru_tail == FLASHCACHE_NULL);
		}
		if (cache_set->hotlist_lru_head != FLASHCACHE_NULL)
			VERIFY(cache_set->lru_hot_blocks > 0);
		if (cache_set->hotlist_lru_tail != FLASHCACHE_NULL)
			VERIFY(cache_set->lru_hot_blocks > 0);
	}
}

/* Adds a block to the MRU position of its list */
static void
flashcache_reclaim_add_block_to_list_mru(struct cache_c *dmc, int index)
{
	int set = index / dmc->assoc;
	int start_index = set * dmc->assoc;
	int my_index = index - start_index;
	struct cacheblock *cacheblk = &dmc->cache[index];
	struct cache_set *cache_set = &dmc->cache_sets[set];

	/* At least one should be set */
	VERIFY((cacheblk->lru_state & (LRU_WARM | LRU_HOT)) != 0);
	/* Both should not be set */
	VERIFY((cacheblk->lru_state & (LRU_WARM | LRU_HOT)) != (LRU_HOT | LRU_WARM));
	cacheblk->lru_next = FLASHCACHE_NULL;
	if (cacheblk->lru_state & LRU_WARM) {
		cacheblk->lru_prev = cache_set->warmlist_lru_tail;
		if (cache_set->warmlist_lru_tail == FLASHCACHE_NULL)
			cache_set->warmlist_lru_head = my_index;
		else
			dmc->cache[cache_set->warmlist_lru_tail + start_index].lru_next = my_index;
		cache_set->warmlist_lru_tail = my_index;
	} else {
		cacheblk->lru_prev = cache_set->hotlist_lru_tail;
		if (cache_set->hotlist_lru_tail == FLASHCACHE_NULL)
			cache_set->hotlist_lru_head = my_index;
		else
			dmc->cache[cache_set->hotlist_lru_tail + start_index].lru_next = my_index;
		cache_set->hotlist_lru_tail = my_index;
	}
	if (cacheblk->lru_state & LRU_WARM) {
		dmc->lru_warm_blocks++;
		cache_set->lru_warm_blocks++;
	} else {
		dmc->lru_hot_blocks++;
		cache_set->lru_hot_blocks++;
	}
}

/* Adds a block to the LRU position of its list */
static void
flashcache_reclaim_add_block_to_list_lru(struct cache_c *dmc, int index)
{
	int set = index / dmc->assoc;
	int start_index = set * dmc->assoc;
	int my_index = index - start_index;
	struct cacheblock *cacheblk = &dmc->cache[index];
	struct cache_set *cache_set = &dmc->cache_sets[set];

	/* At least one should be set */
	VERIFY((cacheblk->lru_state & (LRU_WARM | LRU_HOT)) != 0);
	/* Both should not be set */
	VERIFY((cacheblk->lru_state & (LRU_WARM | LRU_HOT)) != (LRU_HOT | LRU_WARM));
	cacheblk->lru_prev = FLASHCACHE_NULL;
	if (cacheblk->lru_state & LRU_WARM) {
		cacheblk->lru_next = cache_set->warmlist_lru_head;
		if (cache_set->warmlist_lru_head == FLASHCACHE_NULL)
			cache_set->warmlist_lru_tail = my_index;
		else
			dmc->cache[cache_set->warmlist_lru_head + start_index].lru_prev = my_index;
		cache_set->warmlist_lru_head = my_index;
	} else {
		cacheblk->lru_next = cache_set->hotlist_lru_head;
		if (cache_set->hotlist_lru_head == FLASHCACHE_NULL)
			cache_set->hotlist_lru_tail = my_index;
		else
			dmc->cache[cache_set->hotlist_lru_head + start_index].lru_prev = my_index;
		cache_set->hotlist_lru_head = my_index;
	}
	if (cacheblk->lru_state & LRU_WARM) {
		cache_set->lru_warm_blocks++;
		dmc->lru_warm_blocks++;
	} else {
		cache_set->lru_hot_blocks++;
		dmc->lru_hot_blocks++;
	}
}

/* Move block to MRU position in the same list */
void
flashcache_reclaim_move_to_mru(struct cache_c *dmc, int index)
{
	struct cacheblock *cacheblk = &dmc->cache[index];

	/* At least one should be set */
	VERIFY((cacheblk->lru_state & (LRU_WARM | LRU_HOT)) != 0);
	/* Both should not be set */
	VERIFY((cacheblk->lru_state & (LRU_WARM | LRU_HOT)) != (LRU_HOT | LRU_WARM));
	/* Remove from its list */
	flashcache_reclaim_remove_block_from_list(dmc, index);
	/* And add it to LRU Tail (MRU side) of its list */
	flashcache_reclaim_add_block_to_list_mru(dmc, index);
}

/* Promote this warm block with the LRU block in the hot queue */
static int
flashcache_reclaim_promote_block(struct cache_c *dmc, int index)
{
	struct cacheblock *cacheblk = &dmc->cache[index];
	int hot_block;
	int set = index / dmc->assoc;
	int start_index = set * dmc->assoc;
	struct cache_set *cache_set = &dmc->cache_sets[set];

	VERIFY(cacheblk->lru_state & LRU_WARM);
	hot_block = cache_set->hotlist_lru_head;
	if (hot_block == FLASHCACHE_NULL)
		/* We cannot swap this block into the hot list */
		return 0;
	hot_block += start_index;
	/* Remove warm block from its list first */
	flashcache_reclaim_remove_block_from_list(dmc, index);
	/* Remove hot block identified above from its list */
	flashcache_reclaim_remove_block_from_list(dmc, hot_block);
	/* Swap the 2 blocks */
	cacheblk->lru_state &= ~LRU_WARM;
	cacheblk->lru_state |= LRU_HOT;
	cacheblk->use_cnt = 0;
	flashcache_reclaim_add_block_to_list_lru(dmc, index);
	cacheblk = &dmc->cache[hot_block];
	VERIFY(cacheblk->lru_state & LRU_HOT);
	cacheblk->lru_state &= ~LRU_HOT;
	cacheblk->lru_state |= LRU_WARM;
	cacheblk->use_cnt = 0;
	flashcache_reclaim_add_block_to_list_mru(dmc, hot_block);
	dmc->flashcache_stats.lru_promotions++;
	return 1;
}

/* Swap this hot block with the MRU block in the warm queue */
static int
flashcache_reclaim_demote_block(struct cache_c *dmc, int index)
{
	struct cacheblock *cacheblk = &dmc->cache[index];
	int warm_block;
	int set = index / dmc->assoc;
	struct cache_set *cache_set = &dmc->cache_sets[set];
	int start_index = set * dmc->assoc;

	VERIFY(cacheblk->lru_state & LRU_HOT);
	warm_block = cache_set->warmlist_lru_tail;
	if (warm_block == FLASHCACHE_NULL)
		/* We cannot swap this block into the warm list */
		return 0;
	warm_block += start_index;
	/* Remove hot block from its list first */
	flashcache_reclaim_remove_block_from_list(dmc, index);
	/* Remove warm block identified above from its list */
	flashcache_reclaim_remove_block_from_list(dmc, warm_block);
	/* Swap the 2 blocks */
	cacheblk->lru_state &= ~LRU_HOT;
	cacheblk->lru_state |= LRU_WARM;
	cacheblk->use_cnt = 0;
	flashcache_reclaim_add_block_to_list_mru(dmc, index);
	cacheblk = &dmc->cache[warm_block];
	VERIFY(cacheblk->lru_state & LRU_WARM);
	cacheblk->lru_state &= ~LRU_WARM;
	cacheblk->lru_state |= LRU_HOT;
	cacheblk->use_cnt = 0;
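	/*
	 * The two blocks have now traded labels: the hot block was parked
	 * at the warm MRU end above, and the former warm-tail block, just
	 * relabelled hot, is inserted at the hot list's LRU end below, so
	 * the hot/warm list sizes stay in equilibrium.
	 */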
	flashcache_reclaim_add_block_to_list_lru(dmc, warm_block);
	dmc->flashcache_stats.lru_demotions++;
	return 1;
}

/*
 * Get least recently used LRU block
 *
 * Algorithm :
 * Always pick block from the LRU end of the warm list.
 * And move it to the MRU end of the warm list.
 * If we don't find a suitable block in the "warm" list,
 * pick the block from the hot list, demote it to the warm
 * list and move a block from the warm list to the hot list.
 */
void
flashcache_reclaim_lru_get_old_block(struct cache_c *dmc, int start_index, int *index)
{
	int lru_rel_index;
	struct cacheblock *cacheblk;
	int set = start_index / dmc->assoc;
	struct cache_set *cache_set = &dmc->cache_sets[set];

	*index = -1;
	lru_rel_index = cache_set->warmlist_lru_head;
	while (lru_rel_index != FLASHCACHE_NULL) {
		cacheblk = &dmc->cache[lru_rel_index + start_index];
		if (cacheblk->cache_state == VALID) {
			*index = cacheblk - &dmc->cache[0];
			VERIFY((cacheblk->cache_state & FALLOW_DOCLEAN) == 0);
			VERIFY(cacheblk->lru_state & LRU_WARM);
			VERIFY((cacheblk->lru_state & LRU_HOT) == 0);
			cacheblk->use_cnt = 0;
			flashcache_reclaim_move_to_mru(dmc, *index);
			break;
		}
		lru_rel_index = cacheblk->lru_next;
	}
	if (likely(*index != -1))
		return;
	/*
	 * We did not find a block on the "warm" LRU list that we could take, pick
	 * a block from the "hot" LRU list.
	 */
	lru_rel_index = cache_set->hotlist_lru_head;
	while (lru_rel_index != FLASHCACHE_NULL) {
		cacheblk = &dmc->cache[lru_rel_index + start_index];
		if (cacheblk->cache_state == VALID) {
			*index = cacheblk - &dmc->cache[0];
			VERIFY((cacheblk->cache_state & FALLOW_DOCLEAN) == 0);
			VERIFY(cacheblk->lru_state & LRU_HOT);
			VERIFY((cacheblk->lru_state & LRU_WARM) == 0);
			VERIFY(cacheblk->use_cnt == 0);
			/*
			 * Swap this block with the MRU block in the warm list.
			 * To maintain equilibrium between the lists
			 * 1) We put this block in the MRU position on the warm list
			 * 2) Remove the block in the LRU position on the warm list and
			 * 3) Move that block to the LRU position on the hot list.
			 */
			if (!flashcache_reclaim_demote_block(dmc, *index))
				/*
				 * We cannot demote this block to the warm list
				 * just move it to the MRU position.
				 */
				flashcache_reclaim_move_to_mru(dmc, *index);
			break;
		}
		lru_rel_index = cacheblk->lru_next;
	}
}

/* Block moved from warm to hot list on second access */
/*
 * Block is accessed.
 *
 * Algorithm :
 *	if (block is in the warm list) {
 *		block_lru_refcnt++;
 *		if (block_lru_refcnt >= THRESHOLD) {
 *			clear refcnt
 *			Swap this block for the block at LRU end of hot list
 *		} else
 *			move it to MRU end of the warm list
 *	}
 *	if (block is in the hot list)
 *		move it to MRU end of the hot list
 */
void
flashcache_lru_accessed(struct cache_c *dmc, int index)
{
	struct cacheblock *cacheblk = &dmc->cache[index];

	if (cacheblk->lru_state & LRU_HOT)
		flashcache_reclaim_move_to_mru(dmc, index);
	else {
		/*
		 * If INVALID and on the warm list, just move it to the MRU
		 * position and leave it there. If haven't hit the use count
		 * threshold, move it to the MRU position and leave it there.
		 */
		VERIFY(cacheblk->lru_state & LRU_WARM);
		if (cacheblk->cache_state == INVALID ||
		    ++cacheblk->use_cnt < dmc->sysctl_lru_promote_thresh) {
			flashcache_reclaim_move_to_mru(dmc, index);
			return;
		}
		/*
		 * Promote block to hot list. Swapping it with a block there.
		 *
		 * Swap this block with the LRU block in the hot list.
		 * To maintain equilibrium between the lists
		 * 1) We put this block in the LRU position on the hot list
		 * 2) Remove the block in the LRU position on the hot list and
		 * 3) Move that block to the MRU position on the warm list.
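		 *
		 * (Illustrative walk-through, not taken from the source: in a
		 * set whose hot list is [H1 <- H2] and warm list is [W1 <- W2],
		 * promoting W2 once it crosses sysctl_lru_promote_thresh swaps
		 * it with the hot-LRU block H1, giving hot [W2 <- H2] and warm
		 * [W1 <- H1], so both lists keep their configured sizes.)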
		 */
		if (!flashcache_reclaim_promote_block(dmc, index))
			/* Could not promote block, move it to mru on warm list */
			flashcache_reclaim_move_to_mru(dmc, index);
	}
}
flashcache-3.1.3+git20150701/src/flashcache_subr.c000066400000000000000000000670061254507146700212670ustar00rootroot00000000000000/****************************************************************************
 *  flashcache_subr.c
 *  FlashCache: Device mapper target for block-level disk caching
 *
 *  Copyright 2010 Facebook, Inc.
 *  Author: Mohan Srinivasan (mohan@fb.com)
 *
 *  Based on DM-Cache:
 *   Copyright (C) International Business Machines Corp., 2006
 *   Author: Ming Zhao (mingzhao@ufl.edu)
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; under version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program. If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

#include <asm/atomic.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/pagemap.h>
#include <linux/random.h>
#include <linux/hardirq.h>
#include <linux/sysctl.h>
#include <linux/version.h>
#include <linux/pid.h>
#include <linux/jhash.h>
#include <linux/sort.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
#include "dm.h"
#include "dm-io.h"
#include "dm-bio-list.h"
#include "kcopyd.h"
#else
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27)
#include "dm.h"
#endif
#include <linux/device-mapper.h>
#include <linux/bio.h>
#include <linux/dm-kcopyd.h>
#endif
#include "flashcache.h"

static DEFINE_SPINLOCK(_job_lock);

extern mempool_t *_job_pool;
extern mempool_t *_pending_job_pool;

extern atomic_t nr_cache_jobs;
extern atomic_t nr_pending_jobs;

LIST_HEAD(_pending_jobs);
LIST_HEAD(_io_jobs);
LIST_HEAD(_md_io_jobs);
LIST_HEAD(_md_complete_jobs);
LIST_HEAD(_uncached_io_complete_jobs);
LIST_HEAD(_cleaning_read_complete_jobs);
LIST_HEAD(_cleaning_write_complete_jobs);

int
flashcache_cleaning_read_empty(void)
{
	return list_empty(&_cleaning_read_complete_jobs);
}

int
flashcache_cleaning_write_empty(void)
{
	return list_empty(&_cleaning_write_complete_jobs);
}

int
flashcache_pending_empty(void)
{
	return list_empty(&_pending_jobs);
}

int
flashcache_io_empty(void)
{
	return list_empty(&_io_jobs);
}

int
flashcache_md_io_empty(void)
{
	return list_empty(&_md_io_jobs);
}

int
flashcache_md_complete_empty(void)
{
	return list_empty(&_md_complete_jobs);
}

int
flashcache_uncached_io_complete_empty(void)
{
	return list_empty(&_uncached_io_complete_jobs);
}

struct kcached_job *
flashcache_alloc_cache_job(void)
{
	struct kcached_job *job;

	job = mempool_alloc(_job_pool, GFP_NOIO);
	if (likely(job))
		atomic_inc(&nr_cache_jobs);
	return job;
}

void
flashcache_free_cache_job(struct kcached_job *job)
{
	mempool_free(job, _job_pool);
	atomic_dec(&nr_cache_jobs);
}

struct pending_job *
flashcache_alloc_pending_job(struct cache_c *dmc)
{
	struct pending_job *job;

	job = mempool_alloc(_pending_job_pool, GFP_ATOMIC);
	if (likely(job))
		atomic_inc(&nr_pending_jobs);
	else
		dmc->flashcache_errors.memory_alloc_errors++;
	return job;
}

void
flashcache_free_pending_job(struct pending_job *job)
{
	mempool_free(job, _pending_job_pool);
	atomic_dec(&nr_pending_jobs);
}

int
flashcache_invalid_get(struct cache_c *dmc, int set)
{
	struct cache_set *cache_set;
	int index;
	struct cacheblock *cacheblk;

	cache_set =
&dmc->cache_sets[set]; index = cache_set->invalid_head; if (index == FLASHCACHE_NULL) return -1; index += (set * dmc->assoc); cacheblk = &dmc->cache[index]; VERIFY(cacheblk->cache_state == INVALID); flashcache_invalid_remove(dmc, index); return index; } void flashcache_invalid_insert(struct cache_c *dmc, int index) { struct cache_set *cache_set; struct cacheblock *cacheblk; int set = index / dmc->assoc; int start_index = set * dmc->assoc; int set_ix = index % dmc->assoc; /* index validity checks */ VERIFY(index >= 0); VERIFY(index < dmc->size); cacheblk = &dmc->cache[index]; /* It has to be an INVALID block */ VERIFY(cacheblk->cache_state == INVALID); /* It cannot be on the per-set hash */ VERIFY(cacheblk->hash_prev == FLASHCACHE_NULL); VERIFY(cacheblk->hash_next == FLASHCACHE_NULL); /* Insert this block at the head of the invalid list */ cache_set = &dmc->cache_sets[set]; cacheblk->hash_next = cache_set->invalid_head; if (cache_set->invalid_head != FLASHCACHE_NULL) dmc->cache[start_index + cache_set->invalid_head].hash_prev = set_ix; cache_set->invalid_head = set_ix; } void flashcache_invalid_remove(struct cache_c *dmc, int index) { struct cache_set *cache_set; struct cacheblock *cacheblk; int start_index, set; /* index validity checks */ VERIFY(index >= 0); VERIFY(index < dmc->size); cacheblk = &dmc->cache[index]; /* It has to be an INVALID block */ VERIFY(cacheblk->cache_state == INVALID); set = index / dmc->assoc; start_index = set * dmc-> assoc; cache_set = &dmc->cache_sets[set]; if (cacheblk->hash_prev != FLASHCACHE_NULL) { dmc->cache[start_index + cacheblk->hash_prev].hash_next = cacheblk->hash_next; } else cache_set->invalid_head = cacheblk->hash_next; if (cacheblk->hash_next != FLASHCACHE_NULL) { dmc->cache[start_index + cacheblk->hash_next].hash_prev = cacheblk->hash_prev; } cacheblk->hash_prev = FLASHCACHE_NULL; cacheblk->hash_next = FLASHCACHE_NULL; } /* Cache set block hash management */ void flashcache_hash_init(struct cache_c *dmc) { struct cache_set *cache_set; int i, j; for (i = 0 ; i < (dmc->size >> dmc->assoc_shift) ; i++) { cache_set = &dmc->cache_sets[i]; cache_set->invalid_head = FLASHCACHE_NULL; for (j = 0 ; j < NUM_BLOCK_HASH_BUCKETS ; j++) cache_set->hash_buckets[j] = FLASHCACHE_NULL; } } void flashcache_hash_destroy(struct cache_c *dmc) { } static inline u_int16_t * flashcache_get_hash_bucket(struct cache_c *dmc, struct cache_set *cache_set, sector_t dbn) { unsigned int hash = jhash_1word(dbn, 0xfeed); return &cache_set->hash_buckets[hash % NUM_BLOCK_HASH_BUCKETS]; } void flashcache_hash_remove(struct cache_c *dmc, int index) { struct cache_set *cache_set; struct cacheblock *cacheblk; u_int16_t *hash_bucket; int start_index, set; if (index == -1) return; set = index / dmc->assoc; cache_set = &dmc->cache_sets[set]; cacheblk = &dmc->cache[index]; VERIFY(cacheblk->cache_state & VALID); start_index = set * dmc-> assoc; hash_bucket = flashcache_get_hash_bucket(dmc, cache_set, cacheblk->dbn); if (cacheblk->hash_prev != FLASHCACHE_NULL) { dmc->cache[start_index + cacheblk->hash_prev].hash_next = cacheblk->hash_next; } else *hash_bucket = cacheblk->hash_next; if (cacheblk->hash_next != FLASHCACHE_NULL) { dmc->cache[start_index + cacheblk->hash_next].hash_prev = cacheblk->hash_prev; } cacheblk->hash_prev = FLASHCACHE_NULL; cacheblk->hash_next = FLASHCACHE_NULL; } /* Must return -1 if not found ! 
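 *
 * (Sketch of the walk below, assuming the per-set chain encoding used
 * throughout this file: hash dbn into one of NUM_BLOCK_HASH_BUCKETS
 * buckets for the set, then follow hash_next through the chain of
 * VALID blocks; on a dbn match return set * assoc + set_ix, otherwise
 * fall off the end of the chain and return -1.)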
*/ int flashcache_hash_lookup(struct cache_c *dmc, int set, sector_t dbn) { struct cache_set *cache_set = &dmc->cache_sets[set]; int index; struct cacheblock *cacheblk; u_int16_t set_ix; #if 0 int start_index, end_index, i; #endif set_ix = *flashcache_get_hash_bucket(dmc, cache_set, dbn); while (set_ix != FLASHCACHE_NULL) { index = set * dmc->assoc + set_ix; cacheblk = &dmc->cache[index]; /* Only VALID blocks on the hash queue */ VERIFY(cacheblk->cache_state & VALID); VERIFY((cacheblk->cache_state & INVALID) == 0); if (dbn == cacheblk->dbn) return index; set_ix = cacheblk->hash_next; } #if 0 /* * Debugging. We didn't find the block on the hash. * Make sure it is NOT in this set and VALID ! */ start_index = set * dmc->assoc; end_index = start_index + dmc->assoc; for (i = start_index ; i < end_index ; i++) { cacheblk = &dmc->cache[i]; if (dbn == cacheblk->dbn && (cacheblk->cache_state & VALID)) { printk(KERN_ERR "Did not find block in hash but found block in set !\n"); printk(KERN_ERR "cacheblk->cache_state = %x\n", cacheblk->cache_state); VERIFY(0); panic("Did not find block in hash but found block in set !\n"); } } #endif return -1; } /* * Cacheblock should be VALID and should NOT be on a hash bucket already. */ void flashcache_hash_insert(struct cache_c *dmc, int index) { struct cache_set *cache_set = &dmc->cache_sets[index / dmc->assoc]; struct cacheblock *cacheblk; u_int16_t *hash_bucket; u_int16_t set_ix = index % dmc->assoc; int start_index = (index / dmc->assoc) * dmc->assoc; cacheblk = &dmc->cache[index]; VERIFY(cacheblk->cache_state & VALID); hash_bucket = flashcache_get_hash_bucket(dmc, cache_set, cacheblk->dbn); VERIFY(cacheblk->hash_prev == FLASHCACHE_NULL); VERIFY(cacheblk->hash_next == FLASHCACHE_NULL); cacheblk->hash_prev = FLASHCACHE_NULL; cacheblk->hash_next = *hash_bucket; if (*hash_bucket != FLASHCACHE_NULL) dmc->cache[start_index + *hash_bucket].hash_prev = set_ix; *hash_bucket = set_ix; } #define FLASHCACHE_PENDING_JOB_HASH(INDEX) ((INDEX) % PENDING_JOB_HASH_SIZE) /* * Locking Note : enq/deq pending paths can be called from softirq as well as base * context. Necessary to do the irqsave/restore variants of the lock here. 
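 *
 * (Concretely: a completion path running in softirq context can spin
 * on cache_pending_q_spinlock while a base-context holder of the same
 * lock is interrupted by that softirq on the same CPU; the
 * spin_lock_irqsave()/spin_unlock_irqrestore() variants used below
 * close that self-deadlock window.)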
*/ void flashcache_enq_pending(struct cache_c *dmc, struct bio* bio, int index, int action, struct pending_job *job) { struct pending_job **head; unsigned long flags; VERIFY(!in_interrupt()); VERIFY(spin_is_locked(&dmc->cache_sets[index / dmc->assoc].set_spin_lock)); spin_lock_irqsave(&dmc->cache_pending_q_spinlock, flags); head = &dmc->pending_job_hashbuckets[FLASHCACHE_PENDING_JOB_HASH(index)]; DPRINTK("flashcache_enq_pending: Queue to pending Q Index %d %llu", index, bio->bi_sector); VERIFY(job != NULL); job->action = action; job->index = index; job->bio = bio; job->prev = NULL; job->next = *head; if (*head) (*head)->prev = job; *head = job; atomic_inc(&dmc->pending_jobs_count); spin_unlock_irqrestore(&dmc->cache_pending_q_spinlock, flags); dmc->cache[index].nr_queued++; dmc->flashcache_stats.enqueues++; } /* * Deq and move all pending jobs that match the index for this slot to list returned */ struct pending_job * flashcache_deq_pending(struct cache_c *dmc, int index) { struct pending_job *node, *next, *movelist = NULL; int moved = 0; struct pending_job **head; unsigned long flags; VERIFY(!in_interrupt()); spin_lock_irqsave(&dmc->cache_pending_q_spinlock, flags); head = &dmc->pending_job_hashbuckets[FLASHCACHE_PENDING_JOB_HASH(index)]; for (node = *head ; node != NULL ; node = next) { next = node->next; if (node->index == index) { /* * Remove pending job from the global list of * jobs and move it to the private list for freeing */ if (node->prev == NULL) { *head = node->next; if (node->next) node->next->prev = NULL; } else node->prev->next = node->next; if (node->next == NULL) { if (node->prev) node->prev->next = NULL; } else node->next->prev = node->prev; node->prev = NULL; node->next = movelist; movelist = node; moved++; } } VERIFY(atomic_read(&dmc->pending_jobs_count) >= moved); atomic_sub(moved, &dmc->pending_jobs_count); spin_unlock_irqrestore(&dmc->cache_pending_q_spinlock, flags); return movelist; } #ifdef FLASHCACHE_DO_CHECKSUMS int flashcache_read_compute_checksum(struct cache_c *dmc, int index, void *block) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) struct io_region where; #else struct dm_io_region where; #endif int error; u_int64_t sum = 0, *idx; int cnt; where.bdev = dmc->cache_dev->bdev; where.sector = INDEX_TO_CACHE_ADDR(dmc, index); where.count = dmc->block_size; error = flashcache_dm_io_sync_vm(dmc, &where, READ, block); if (error) return error; cnt = dmc->block_size * 512; idx = (u_int64_t *)block; while (cnt > 0) { sum += *idx++; cnt -= sizeof(u_int64_t); } dmc->cache[index].checksum = sum; return 0; } u_int64_t flashcache_compute_checksum(struct bio *bio) { int i; u_int64_t sum = 0, *idx; int cnt; int kmap_type; void *kvaddr; if (in_interrupt()) kmap_type = KM_SOFTIRQ0; else kmap_type = KM_USER0; for (i = bio->bi_idx ; i < bio->bi_vcnt ; i++) { kvaddr = kmap_atomic(bio->bi_io_vec[i].bv_page, kmap_type); idx = (u_int64_t *) ((char *)kvaddr + bio->bi_io_vec[i].bv_offset); cnt = bio->bi_io_vec[i].bv_len; while (cnt > 0) { sum += *idx++; cnt -= sizeof(u_int64_t); } kunmap_atomic(kvaddr, kmap_type); } return sum; } void flashcache_store_checksum(struct kcached_job *job) { u_int64_t sum; unsigned long flags; int set = index / dmc->assoc; sum = flashcache_compute_checksum(job->bio); spin_lock_irqsave(&dmc->cache_sets[set].set_spin_lock, flags); job->dmc->cache[job->index].checksum = sum; spin_unlock_irqrestore(&dmc->cache_sets[set].set_spin_lock, flags); } int flashcache_validate_checksum(struct kcached_job *job) { u_int64_t sum; int retval; unsigned long flags; int 
set = index / dmc->assoc; sum = flashcache_compute_checksum(job->bio); spin_lock_irqsave(&dmc->cache_sets[set].set_spin_lock, flags); if (likely(job->dmc->cache[job->index].checksum == sum)) { job->dmc->flashcache_stats.checksum_valid++; retval = 0; } else { job->dmc->flashcache_stats.checksum_invalid++; retval = 1; } spin_unlock_irqrestore(&dmc->cache_sets[set].set_spin_lock, flags); return retval; } #endif /* * Functions to push and pop a job onto the head of a given job list. */ struct kcached_job * pop(struct list_head *jobs) { struct kcached_job *job = NULL; spin_lock_irq(&_job_lock); if (!list_empty(jobs)) { job = list_entry(jobs->next, struct kcached_job, list); list_del(&job->list); } spin_unlock_irq(&_job_lock); return job; } void push(struct list_head *jobs, struct kcached_job *job) { unsigned long flags; spin_lock_irqsave(&_job_lock, flags); list_add_tail(&job->list, jobs); spin_unlock_irqrestore(&_job_lock, flags); } void push_pending(struct kcached_job *job) { push(&_pending_jobs, job); } void push_io(struct kcached_job *job) { push(&_io_jobs, job); } void push_uncached_io_complete(struct kcached_job *job) { push(&_uncached_io_complete_jobs, job); } void push_md_io(struct kcached_job *job) { push(&_md_io_jobs, job); } void push_md_complete(struct kcached_job *job) { push(&_md_complete_jobs, job); } void push_cleaning(struct list_head *jobs, struct flashcache_copy_job *job) { unsigned long flags; spin_lock_irqsave(&_job_lock, flags); list_add_tail(&job->list, jobs); spin_unlock_irqrestore(&_job_lock, flags); } struct flashcache_copy_job * pop_cleaning(struct list_head *jobs) { struct flashcache_copy_job *job = NULL; spin_lock_irq(&_job_lock); if (!list_empty(jobs)) { job = list_entry(jobs->next, struct flashcache_copy_job, list); list_del(&job->list); } spin_unlock_irq(&_job_lock); return job; } void push_cleaning_read_complete(struct flashcache_copy_job *job) { push_cleaning(&_cleaning_read_complete_jobs, job); } void push_cleaning_write_complete(struct flashcache_copy_job *job) { push_cleaning(&_cleaning_write_complete_jobs, job); } #define FLASHCACHE_YIELD 32 static void process_jobs(struct list_head *jobs, void (*fn) (struct kcached_job *)) { struct kcached_job *job; int done = 0; while ((job = pop(jobs))) { if (done++ >= FLASHCACHE_YIELD) { yield(); done = 0; } (void)fn(job); } } static void process_clean_jobs(struct list_head *jobs, void (*fn) (struct flashcache_copy_job *)) { struct flashcache_copy_job *job; int done = 0; while ((job = pop_cleaning(jobs))) { if (done++ >= FLASHCACHE_YIELD) { yield(); done = 0; } (void)fn(job); } } void #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) do_work(void *unused) #else do_work(struct work_struct *unused) #endif { process_jobs(&_md_complete_jobs, flashcache_md_write_done); process_jobs(&_pending_jobs, flashcache_do_pending); process_jobs(&_md_io_jobs, flashcache_md_write_kickoff); process_jobs(&_io_jobs, flashcache_do_io); process_jobs(&_uncached_io_complete_jobs, flashcache_uncached_io_complete); process_clean_jobs(&_cleaning_read_complete_jobs, flashcache_clean_write_kickoff); process_clean_jobs(&_cleaning_write_complete_jobs, flashcache_clean_md_write_kickoff); } struct kcached_job * new_kcached_job(struct cache_c *dmc, struct bio* bio, int index) { struct kcached_job *job; job = flashcache_alloc_cache_job(); if (unlikely(job == NULL)) { dmc->flashcache_errors.memory_alloc_errors++; return NULL; } job->dmc = dmc; job->index = index; job->job_io_regions.cache.bdev = dmc->cache_dev->bdev; if (index != -1) { 
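		/*
		 * index != -1 means the job targets a mapped cache slot:
		 * point the cache-side IO region at that slot's sectors on
		 * the SSD, translated from the slot index.
		 */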
job->job_io_regions.cache.sector = INDEX_TO_CACHE_ADDR(dmc, index); job->job_io_regions.cache.count = dmc->block_size; } job->error = 0; job->bio = bio; job->job_io_regions.disk.bdev = dmc->disk_dev->bdev; if (index != -1) { job->job_io_regions.disk.sector = dmc->cache[index].dbn; job->job_io_regions.disk.count = dmc->block_size; } else { job->job_io_regions.disk.sector = bio->bi_sector; job->job_io_regions.disk.count = to_sector(bio->bi_size); } job->next = NULL; job->md_block = NULL; if (dmc->sysctl_io_latency_hist) do_gettimeofday(&job->io_start_time); else { job->io_start_time.tv_sec = 0; job->io_start_time.tv_usec = 0; } return job; } static void flashcache_record_latency(struct cache_c *dmc, struct timeval *start_tv) { struct timeval latency; int64_t us; do_gettimeofday(&latency); latency.tv_sec -= start_tv->tv_sec; latency.tv_usec -= start_tv->tv_usec; us = latency.tv_sec * USEC_PER_SEC + latency.tv_usec; us /= IO_LATENCY_GRAN_USECS; /* histogram 250us gran, scale 10ms total */ if (us < IO_LATENCY_BUCKETS) /* < 10ms latency, track it */ dmc->latency_hist[us]++; else /* else count it in 10ms+ bucket */ dmc->latency_hist_10ms++; } void flashcache_bio_endio(struct bio *bio, int error, struct cache_c *dmc, struct timeval *start_time) { if (unlikely(dmc->sysctl_io_latency_hist && start_time != NULL && start_time->tv_sec != 0)) flashcache_record_latency(dmc, start_time); #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) bio_endio(bio, bio->bi_size, error); #else bio_endio(bio, error); #endif } static int cmp_dbn(const void *a, const void *b) { if (((struct dbn_index_pair *)a)->dbn < ((struct dbn_index_pair *)b)->dbn) return -1; else return 1; } static void swap_dbn_index_pair(void *a, void *b, int size) { struct dbn_index_pair temp; temp = *(struct dbn_index_pair *)a; *(struct dbn_index_pair *)a = *(struct dbn_index_pair *)b; *(struct dbn_index_pair *)b = temp; } /* * We have a list of blocks to write out to disk. * 1) Sort the blocks by dbn. * 2) (sysctl'able) See if there are any other blocks in the same set * that are contig to any of the blocks in step 1. If so, include them * in our "to write" set, maintaining sorted order. * Has to be called under the cache spinlock ! */ void flashcache_merge_writes(struct cache_c *dmc, struct dbn_index_pair *writes_list, struct dbn_index_pair *set_dirty_list, int *nr_writes, int set) { int dirty_blocks_in = *nr_writes; struct cacheblock *cacheblk; int i; int neighbor; VERIFY(spin_is_locked(&dmc->cache_sets[set].set_spin_lock)); if (unlikely(*nr_writes == 0)) return; /* * Loop over the blocks, searching for neighbors backwards and forwards. * When we find a neighbor, tack it onto writes_list. */ for (i = 0 ; i < dirty_blocks_in ; i++) { /* Look behind and keep merging as long as we can */ neighbor = flashcache_hash_lookup(dmc, set, writes_list[i].dbn - dmc->block_size); while (neighbor != -1) { cacheblk = &dmc->cache[neighbor]; VERIFY(cacheblk->cache_state & VALID); if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) { /* Found a dirty neighbor. 
Add it to the writes_list */ cacheblk->cache_state |= DISKWRITEINPROG; flashcache_clear_fallow(dmc, neighbor); VERIFY(*nr_writes < dmc->assoc); writes_list[*nr_writes].index = neighbor; writes_list[*nr_writes].dbn = cacheblk->dbn; (*nr_writes)++; dmc->flashcache_stats.back_merge++; neighbor = flashcache_hash_lookup(dmc, set, cacheblk->dbn - dmc->block_size); } else neighbor = -1; } /* Look forward and keep merging as long as we can */ neighbor = flashcache_hash_lookup(dmc, set, writes_list[i].dbn + dmc->block_size); while (neighbor != -1) { cacheblk = &dmc->cache[neighbor]; VERIFY(cacheblk->cache_state & VALID); if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) { /* Found a dirty neighbor. Add it to the writes_list */ cacheblk->cache_state |= DISKWRITEINPROG; flashcache_clear_fallow(dmc, neighbor); VERIFY(*nr_writes < dmc->assoc); writes_list[*nr_writes].index = neighbor; writes_list[*nr_writes].dbn = cacheblk->dbn; (*nr_writes)++; dmc->flashcache_stats.front_merge++; neighbor = flashcache_hash_lookup(dmc, set, cacheblk->dbn + dmc->block_size); } else neighbor = -1; } } /* This may be unnecessary. But return the list of blocks to write out sorted */ sort(writes_list, *nr_writes, sizeof(struct dbn_index_pair), cmp_dbn, swap_dbn_index_pair); } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) extern struct dm_io_client *flashcache_io_client; /* Client memory pool*/ #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) int flashcache_dm_io_async_vm(struct cache_c *dmc, unsigned int num_regions, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) struct io_region *where, #else struct dm_io_region *where, #endif int rw, void *data, io_notify_fn fn, void *context) { unsigned long error_bits = 0; int error; struct dm_io_request io_req = { .bi_rw = rw, .mem.type = DM_IO_VMA, .mem.ptr.vma = data, .mem.offset = 0, .notify.fn = fn, .notify.context = context, .client = flashcache_io_client, }; error = dm_io(&io_req, 1, where, &error_bits); if (error) return error; if (error_bits) return error_bits; return 0; } #endif #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,29) /* * Wrappers for doing DM sync IO, using DM async IO. * It is a shame we need do this, but DM sync IO is interruptible :( * And we want uninterruptible disk IO :) * * This is fixed in 2.6.30, where sync DM IO is uninterruptible. 
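 *
 * (The wrapper below uses the standard workaround for this: fire the
 * async IO with a callback that records the error, clears an INPROG
 * flag in a shared state struct and wakes a waitqueue, while the
 * caller sleeps TASK_UNINTERRUPTIBLE until the flag clears.)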
*/ #define FLASHCACHE_DM_IO_SYNC_INPROG 0x01 static DECLARE_WAIT_QUEUE_HEAD(flashcache_dm_io_sync_waitqueue); static DEFINE_SPINLOCK(flashcache_dm_io_sync_spinlock); struct flashcache_dm_io_sync_state { int error; int flags; }; static void flashcache_dm_io_sync_vm_callback(unsigned long error, void *context) { struct flashcache_dm_io_sync_state *state = (struct flashcache_dm_io_sync_state *)context; unsigned long flags; spin_lock_irqsave(&flashcache_dm_io_sync_spinlock, flags); state->flags &= ~FLASHCACHE_DM_IO_SYNC_INPROG; state->error = error; wake_up(&flashcache_dm_io_sync_waitqueue); spin_unlock_irqrestore(&flashcache_dm_io_sync_spinlock, flags); } int flashcache_dm_io_sync_vm(struct cache_c *dmc, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) struct io_region *where, #else struct dm_io_region *where, #endif int rw, void *data) { DEFINE_WAIT(wait); struct flashcache_dm_io_sync_state state; state.error = -EINTR; state.flags = FLASHCACHE_DM_IO_SYNC_INPROG; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) dm_io_async_vm(1, where, rw, data, flashcache_dm_io_sync_vm_callback, &state); #else flashcache_dm_io_async_vm(dmc, 1, where, rw, data, flashcache_dm_io_sync_vm_callback, &state); #endif spin_lock_irq(&flashcache_dm_io_sync_spinlock); while (state.flags & FLASHCACHE_DM_IO_SYNC_INPROG) { prepare_to_wait(&flashcache_dm_io_sync_waitqueue, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&flashcache_dm_io_sync_spinlock); schedule(); spin_lock_irq(&flashcache_dm_io_sync_spinlock); } finish_wait(&flashcache_dm_io_sync_waitqueue, &wait); spin_unlock_irq(&flashcache_dm_io_sync_spinlock); return state.error; } #else int flashcache_dm_io_sync_vm(struct cache_c *dmc, struct dm_io_region *where, int rw, void *data) { unsigned long error_bits = 0; int error; struct dm_io_request io_req = { .bi_rw = rw, .mem.type = DM_IO_VMA, .mem.ptr.vma = data, .mem.offset = 0, .notify.fn = NULL, .client = flashcache_io_client, }; error = dm_io(&io_req, 1, where, &error_bits); if (error) return error; if (error_bits) return error_bits; return 0; } #endif void flashcache_update_sync_progress(struct cache_c *dmc) { u_int64_t dirty_pct; if (dmc->flashcache_stats.cleanings % 1000) return; if (!atomic_read(&dmc->nr_dirty) || !dmc->size || !printk_ratelimit()) return; dirty_pct = ((u_int64_t)atomic_read(&dmc->nr_dirty) * 100) / dmc->size; printk(KERN_INFO "Flashcache: Cleaning %d Dirty blocks, Dirty Blocks pct %llu%%", atomic_read(&dmc->nr_dirty), dirty_pct); printk(KERN_INFO "\r"); } #define NUM_DISKCLEAN_BLOCKS 32 int flashcache_diskclean_init(struct cache_c *dmc) { int i; void *buf; struct diskclean_buf_ *diskclean_buf; dmc->diskclean_buf_head = NULL; spin_lock_init(&dmc->diskclean_list_lock); /* Allocate the buffers and push them onto the list */ for (i = 0 ; i < NUM_DISKCLEAN_BLOCKS ; i++) { buf = vmalloc(dmc->assoc * sizeof(struct dbn_index_pair)); if (!buf) { /* Free everything allocated up to now and return error */ flashcache_diskclean_destroy(dmc); return 1; } diskclean_buf = (struct diskclean_buf_ *)buf; diskclean_buf->next = dmc->diskclean_buf_head; dmc->diskclean_buf_head = diskclean_buf; } return 0; } void flashcache_diskclean_destroy(struct cache_c *dmc) { struct diskclean_buf_ *diskclean_buf, *next; diskclean_buf = dmc->diskclean_buf_head; while (diskclean_buf != NULL) { next = diskclean_buf->next; vfree(diskclean_buf); diskclean_buf = next; } } int flashcache_diskclean_alloc(struct cache_c *dmc, struct dbn_index_pair **buf1, struct dbn_index_pair **buf2) { unsigned long flags; int retval; *buf1 = NULL; *buf2 
= NULL; spin_lock_irqsave(&dmc->diskclean_list_lock, flags); if (dmc->diskclean_buf_head == NULL || dmc->diskclean_buf_head->next == NULL) { retval = ENOMEM; goto out; } *buf1 = (struct dbn_index_pair *)dmc->diskclean_buf_head; *buf2 = (struct dbn_index_pair *)dmc->diskclean_buf_head->next; dmc->diskclean_buf_head = dmc->diskclean_buf_head->next->next; retval = 0; out: spin_unlock_irqrestore(&dmc->diskclean_list_lock, flags); return retval; } void flashcache_diskclean_free(struct cache_c *dmc, struct dbn_index_pair *buf1, struct dbn_index_pair *buf2) { unsigned long flags; struct diskclean_buf_ *diskclean_buf; VERIFY(buf1 != NULL); VERIFY(buf2 != NULL); spin_lock_irqsave(&dmc->diskclean_list_lock, flags); diskclean_buf = (struct diskclean_buf_ *)buf1; diskclean_buf->next = dmc->diskclean_buf_head; dmc->diskclean_buf_head = diskclean_buf; diskclean_buf = (struct diskclean_buf_ *)buf2; diskclean_buf->next = dmc->diskclean_buf_head; dmc->diskclean_buf_head = diskclean_buf; spin_unlock_irqrestore(&dmc->diskclean_list_lock, flags); } EXPORT_SYMBOL(flashcache_alloc_cache_job); EXPORT_SYMBOL(flashcache_free_cache_job); EXPORT_SYMBOL(flashcache_alloc_pending_job); EXPORT_SYMBOL(flashcache_free_pending_job); EXPORT_SYMBOL(pop); EXPORT_SYMBOL(push); EXPORT_SYMBOL(push_pending); EXPORT_SYMBOL(push_io); EXPORT_SYMBOL(push_md_io); EXPORT_SYMBOL(push_md_complete); EXPORT_SYMBOL(process_jobs); EXPORT_SYMBOL(do_work); EXPORT_SYMBOL(new_kcached_job); #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) EXPORT_SYMBOL(flashcache_dm_io_sync_vm_callback); #endif EXPORT_SYMBOL(flashcache_dm_io_sync_vm); EXPORT_SYMBOL(flashcache_merge_writes); EXPORT_SYMBOL(flashcache_enq_pending); flashcache-3.1.3+git20150701/src/ocf/000077500000000000000000000000001254507146700165475ustar00rootroot00000000000000flashcache-3.1.3+git20150701/src/ocf/Makefile000066400000000000000000000004641254507146700202130ustar00rootroot00000000000000COMMIT_REV ?= $(shell git describe --always --abbrev=12) CFLAGS += PROGRAMS += flashcache INSTALL_DIR = /usr/lib/ocf/resource.d/flashcache .PHONY:all all: $(PROGRAMS) .PHONY: install install: $(PROGRAMS) install -d -m 755 $(INSTALL_DIR) install -m 755 $(PROGRAMS) $(INSTALL_DIR) .PHONY: clean clean: flashcache-3.1.3+git20150701/src/ocf/flashcache000077500000000000000000000206011254507146700205550ustar00rootroot00000000000000#!/bin/sh # # License: GNU General Public License (GPL) # # Resource Agent for highly available flashcache devices. # Requires installed flashcache kernel module and utilities. # # (c) 2011 Florian Haas # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/resource.d/heartbeat} . ${OCF_FUNCTIONS_DIR}/.ocf-shellfuncs # Defaults OCF_RESKEY_name_default="flashcache" : ${OCF_RESKEY_name=${OCF_RESKEY_name_default}} flashcache_usage() { echo "usage: $0 {start|stop|status|monitor|meta-data|validate-all}" } flashcache_meta_data() { cat < 0.1 This resource agent manages a flashcache device, loading any existing cache from the flash device on startup and flushing the cache to the disk on graceful shutdown. Manages a flashcache device map The name of the flashcache device. This is the device map name that the agent instructs device-mapper to create, and must hence follow device-mapper naming restrictions. Flashcache device name The backing device to be used by flashcache. This is typically a comparatively high-latency but high-capacity block device, such as a rotational disk. Backing device (typically a rotational disk) The cache device to be used by flashcache. 
This is typically a low-latency but limited-capacity block device, such as a solid-state disk. Cache device (typically a solid state disk) EOF } flashcache_start() { # exit immediately if configuration is not valid flashcache_validate_all || exit $? # if resource is already running, bail out early if flashcache_monitor; then ocf_log info "Resource is already running" return $OCF_SUCCESS fi # If the file exists here, but flashcache_monitor has determined # the resource isn't already running, then the file probably is # owned by something else. Bail out to avoid breaking things. if [ -e /dev/mapper/${OCF_RESKEY_name} ]; then ocf_log err "Existing file /dev/mapper/${OCF_RESKEY_name} would be overwritten by ${OCF_RESOURCE_INSTANCE}. Bailing out." exit $OCF_ERR_INSTALLED fi if [ ! -e /proc/flashcache/flashcache_version ]; then ocf_log debug "Flashcache support not loaded, loading module" ocf_run modprobe -v flashcache || exit $OCF_ERR_INSTALLED fi ocf_log debug "Flashcache module information obtained from kernel: `cat /proc/flashcache/flashcache_version`" # actually start up the resource here (make sure to immediately # exit with an $OCF_ERR_ error code if anything goes seriously # wrong) ocf_run flashcache_load ${OCF_RESKEY_cache_device} \ || exit $OCF_ERR_GENERIC # After the resource has been started, check whether it started up # correctly. If the resource starts asynchronously, the agent may # spin on the monitor function here -- if the resource does not # start up within the defined timeout, the cluster manager will # consider the start action failed while ! flashcache_monitor; do ocf_log debug "Resource has not started yet, waiting" sleep 1 done # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } flashcache_stop() { local rc # exit immediately if configuration is not valid flashcache_validate_all || exit $? flashcache_monitor rc=$? case "$rc" in "$OCF_SUCCESS") # Currently running. Normal, expected behavior. ocf_log debug "Resource is currently running" ;; "$OCF_NOT_RUNNING") # Currently not running. Nothing to do. ocf_log info "Resource is already stopped" return $OCF_SUCCESS ;; esac # actually shut down the resource here (make sure to immediately # exit with an $OCF_ERR_ error code if anything goes seriously # wrong) ocf_run dmsetup remove ${OCF_RESKEY_name} || exit $OCF_ERR_GENERIC # After the resource has been stopped, check whether it shut down # correctly. If the resource stops asynchronously, the agent may # spin on the monitor function here -- if the resource does not # shut down within the defined timeout, the cluster manager will # consider the stop action failed while flashcache_monitor; do ocf_log debug "Resource has not stopped yet, waiting" sleep 1 done # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } flashcache_monitor() { local rc local blockdev local device_present local map_present # exit immediately if configuration is not valid flashcache_validate_all || exit $? # First, see if a block device exists in /dev/mapper blockdev=/dev/mapper/${OCF_RESKEY_name} if [ -e ${blockdev} ]; then if [ -b ${blockdev} ]; then case "`stat -L -c "%t" ${blockdev}`" in "fc"|"fd") ocf_log debug "Block device ${blockdev} exists and is a device-mapper device" ;; *) ocf_log warn "Existing block device ${blockdev} is not a device-mapper device!" return $OCF_NOT_RUNNING esac else ocf_log warn "File ${blockdev} exists, but is not a block device!" fi fi # OK, we have a block device and it has the correct major # number. 
Now, check if there is an entry in the DM table for the # device. dmsetup ls | grep -Eq "^${OCF_RESKEY_name}[[:space:]]+" if [ $? -eq 0 ]; then ocf_log debug "Device map \"${OCF_RESKEY_name}\" is present" # So we have a block device, and we have a device mapper table # entry. Good enough for now. # # TODO: For an added paranoia check, test whether the minor # number in the table matches the one stat() returns on the # device. return $OCF_SUCCESS fi return $OCF_NOT_RUNNING } flashcache_validate_all() { # Check required parameters if [ -z "${OCF_RESKEY_device}" ]; then ocf_log err "Required parameter \"device\" not configured!" exit $OCF_ERR_CONFIGURED fi if [ -z "${OCF_RESKEY_cache_device}" ]; then ocf_log err "Required parameter \"cache_device\" not configured!" exit $OCF_ERR_CONFIGURED fi # Test for required binaries check_binary flashcache_load check_binary dmsetup check_binary stat check_binary grep if ! ocf_is_probe; then for dev in ${OCF_RESKEY_device} ${OCF_RESKEY_cache_device}; do if [ ! -b ${dev} ]; then ocf_log err "${dev} does not exist or is not a block device!" exit $OCF_ERR_INSTALLED fi done fi return $OCF_SUCCESS } # Make sure meta-data and usage always succeed case $__OCF_ACTION in meta-data) flashcache_meta_data exit $OCF_SUCCESS ;; usage|help) flashcache_usage exit $OCF_SUCCESS ;; esac # Anything other than meta-data and usage must pass validation flashcache_validate_all || exit $? # Translate each action into the appropriate function call case $__OCF_ACTION in start) flashcache_start;; stop) flashcache_stop;; status|monitor) flashcache_monitor;; reload) ocf_log info "Reloading..." flashcache_start ;; validate-all) ;; *) flashcache_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc" exit $rc flashcache-3.1.3+git20150701/src/utils/000077500000000000000000000000001254507146700171405ustar00rootroot00000000000000flashcache-3.1.3+git20150701/src/utils/Makefile000066400000000000000000000016451254507146700206060ustar00rootroot00000000000000COMMIT_REV ?= $(shell git describe --always --abbrev=12) CFLAGS += -I.. -I. -DCOMMIT_REV="\"$(COMMIT_REV)\"" -g PROGRAMS += flashcache_create flashcache_destroy flashcache_load flashcache_setioctl get_agsize INSTALL_DIR = $(DESTDIR)/sbin/ .PHONY:all all: $(PROGRAMS) get_agsize: get_agsize.o $(CC) $^ -o $@ -include get_agsize.d flashcache_create: flashcache_create.o $(CC) $^ -o $@ -include flashcache_create.d flashcache_destroy: flashcache_destroy.o $(CC) $^ -o $@ -include flashcache_destroy.d flashcache_load: flashcache_load.o $(CC) $^ -o $@ -include flashcache_load.d flashcache_setioctl: flashcache_setioctl.o $(CC) $^ -o $@ -include flashcache_setioctl.d %.o: %.c $(CC) -c $(CFLAGS) $*.c -o $*.o @$(CC) -MM $(CFLAGS) -MF $*.d -MT $*.o $*.c .PHONY: install install: $(PROGRAMS) install -d -m 755 $(INSTALL_DIR) install -m 755 $(PROGRAMS) $(INSTALL_DIR) .PHONY: clean clean: rm -f *.[od] $(PROGRAMS) flashcache-3.1.3+git20150701/src/utils/flashcache_create.c000066400000000000000000000246441254507146700227220ustar00rootroot00000000000000/* * Copyright (c) 2010, Facebook, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * Neither the name Facebook nor the names of its contributors may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef COMMIT_REV void usage(char *pname) { fprintf(stderr, "Usage: %s [-v] [-p back|thru|around] [-w] [-b block size] [-m md block size] [-s cache size] [-a associativity] cachedev ssd_devname disk_devname\n", pname); fprintf(stderr, "Usage : %s Cache Mode back|thru|around is required argument\n", pname); fprintf(stderr, "Usage : %s Default units for -b, -m, -s are sectors, or specify in k/M/G. Default associativity is 512.\n", pname); #ifdef COMMIT_REV fprintf(stderr, "git commit: %s\n", COMMIT_REV); #endif exit(1); } char *pname; char buf[512]; char dmsetup_cmd[8192]; int verbose = 0; int force = 0; int write_cache_only = 0; static sector_t get_block_size(char *s) { sector_t size; char *c; size = strtoll(s, NULL, 0); for (c = s; isdigit(*c); c++) ; switch (*c) { case '\0': break; case 'k': size = (size * 1024) / 512; break; default: fprintf (stderr, "%s: Unknown block size type %c\n", pname, *c); exit (1); } if (size & (size - 1)) { fprintf(stderr, "%s: Block size must be a power of 2\n", pname); exit(1); } return size; } static sector_t get_cache_size(char *s) { sector_t size; char *c; size = strtoll(s, NULL, 0); for (c = s; isdigit (*c); c++) ; switch (*c) { case '\0': break; case 'k': size = (size * 1024) / 512; break; case 'm': case 'M': size = (size * 1024 * 1024) / 512; break; case 'g': case 'G': size = (size * 1024 * 1024 * 1024) / 512; break; case 't': case 'T': /* Cache size in terabytes? You lucky people! 
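	 * (For example, "-s 2T" works out to 2 * 2^40 / 512 = 4294967296
	 * sectors, matching the arithmetic below.)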
*/ size = (size * 1024 * 1024 * 1024 * 1024) / 512; break; default: fprintf (stderr, "%s: Unknown cache size type %c\n", pname, *c); exit (1); } return size; } static int module_loaded(void) { FILE *fp; char line[8192]; int found = 0; fp = fopen("/proc/modules", "ro"); while (fgets(line, 8190, fp)) { char *s; s = strtok(line, " "); if (!strcmp(s, "flashcache")) { found = 1; break; } } fclose(fp); return found; } static void load_module(void) { FILE *fp; char line[8192]; if (!module_loaded()) { if (verbose) fprintf(stderr, "Loading Flashcache Module\n"); system("modprobe flashcache"); if (!module_loaded()) { fprintf(stderr, "Could not load Flashcache Module\n"); exit(1); } } else if (verbose) fprintf(stderr, "Flashcache Module already loaded\n"); fp = fopen("/proc/flashcache/flashcache_version", "ro"); fgets(line, 8190, fp); if (fgets(line, 8190, fp)) { if (verbose) fprintf(stderr, "version string \"%s\"\n", line); #ifdef COMMIT_REV if (!strstr(line, COMMIT_REV)) { fprintf(stderr, "Flashcache revision doesn't match tool revision.\n"); exit(1); } #endif } fclose(fp); } static void check_sure(void) { char input; fprintf(stderr, "Are you sure you want to proceed ? (y/n): "); scanf("%c", &input); printf("\n"); if (input != 'y') { fprintf(stderr, "Exiting FlashCache creation\n"); exit(1); } } int main(int argc, char **argv) { int cache_fd, disk_fd, c; char *disk_devname, *ssd_devname, *cachedev; struct flash_superblock *sb = (struct flash_superblock *)buf; sector_t cache_devsize, disk_devsize; sector_t block_size = 0, md_block_size = 0, cache_size = 0; sector_t ram_needed; struct sysinfo i; int cache_sectorsize; int associativity = 512; int disk_associativity = 0; int ret; int cache_mode = -1; char *cache_mode_str; pname = argv[0]; while ((c = getopt(argc, argv, "fs:b:d:m:va:p:w")) != -1) { switch (c) { case 's': cache_size = get_cache_size(optarg); break; case 'a': associativity = atoi(optarg); break; case 'b': block_size = get_block_size(optarg); /* Block size should be a power of 2 */ break; case 'd': disk_associativity = get_block_size(optarg); break; case 'm': md_block_size = get_block_size(optarg); /* MD block size should be a power of 2 */ break; case 'v': verbose = 1; break; case 'f': force = 1; break; case 'p': if (strcmp(optarg, "back") == 0) { cache_mode = FLASHCACHE_WRITE_BACK; cache_mode_str = "WRITE_BACK"; } else if ((strcmp(optarg, "thru") == 0) || (strcmp(optarg, "through") == 0)) { cache_mode = FLASHCACHE_WRITE_THROUGH; cache_mode_str = "WRITE_THROUGH"; } else if (strcmp(optarg, "around") == 0) { cache_mode = FLASHCACHE_WRITE_AROUND; cache_mode_str = "WRITE_AROUND"; } else usage(pname); break; case 'w': write_cache_only = 1; break; case '?': usage(pname); } } if (cache_mode == -1) usage(pname); if (optind == argc) usage(pname); if (block_size == 0) block_size = 8; /* 4KB default blocksize */ if (md_block_size == 0) md_block_size = 8; /* 4KB default blocksize */ cachedev = argv[optind++]; if (optind == argc) usage(pname); ssd_devname = argv[optind++]; if (optind == argc) usage(pname); disk_devname = argv[optind]; printf("cachedev %s, ssd_devname %s, disk_devname %s cache mode %s\n", cachedev, ssd_devname, disk_devname, cache_mode_str); if (cache_mode == FLASHCACHE_WRITE_BACK) printf("block_size %lu, md_block_size %lu, cache_size %lu\n", block_size, md_block_size, cache_size); else printf("block_size %lu, cache_size %lu\n", block_size, cache_size); cache_fd = open(ssd_devname, O_RDONLY); if (cache_fd < 0) { fprintf(stderr, "Failed to open %s\n", ssd_devname); exit(1); } 
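	/*
	 * Read the SSD's first sector and look for an existing flashcache
	 * superblock: a device that still carries a valid cache must be
	 * torn down with flashcache_destroy before it can be created over.
	 */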
lseek(cache_fd, 0, SEEK_SET); if (read(cache_fd, buf, 512) < 0) { fprintf(stderr, "Cannot read Flashcache superblock %s\n", ssd_devname); exit(1); } if (sb->cache_sb_state == CACHE_MD_STATE_DIRTY || sb->cache_sb_state == CACHE_MD_STATE_CLEAN || sb->cache_sb_state == CACHE_MD_STATE_FASTCLEAN || sb->cache_sb_state == CACHE_MD_STATE_UNSTABLE) { fprintf(stderr, "%s: Valid Flashcache already exists on %s\n", pname, ssd_devname); fprintf(stderr, "%s: Use flashcache_destroy first and then create again %s\n", pname, ssd_devname); exit(1); } disk_fd = open(disk_devname, O_RDONLY); if (disk_fd < 0) { fprintf(stderr, "%s: Failed to open %s\n", pname, disk_devname); exit(1); } if (ioctl(cache_fd, BLKGETSIZE, &cache_devsize) < 0) { fprintf(stderr, "%s: Cannot get cache size %s\n", pname, ssd_devname); exit(1); } if (ioctl(disk_fd, BLKGETSIZE, &disk_devsize) < 0) { fprintf(stderr, "%s: Cannot get disk size %s\n", pname, disk_devname); exit(1); } if (ioctl(cache_fd, BLKSSZGET, &cache_sectorsize) < 0) { fprintf(stderr, "%s: Cannot get cache size %s\n", pname, ssd_devname); exit(1); } if (md_block_size > 0 && md_block_size * 512 < cache_sectorsize) { fprintf(stderr, "%s: SSD device (%s) sector size (%d) cannot be larger than metadata block size (%d) !\n", pname, ssd_devname, cache_sectorsize, md_block_size * 512); exit(1); } if (cache_size && cache_size > cache_devsize) { fprintf(stderr, "%s: Cache size is larger than ssd size %lu/%lu\n", pname, cache_size, cache_devsize); exit(1); } /* Remind users how much core memory it will take - not always insignificant. * If it's > 25% of RAM, warn. */ if (cache_size == 0) ram_needed = (cache_devsize / block_size) * sizeof(struct cacheblock); /* Whole device */ else ram_needed = (cache_size / block_size) * sizeof(struct cacheblock); sysinfo(&i); printf("Flashcache metadata will use %luMB of your %luMB main memory\n", ram_needed >> 20, i.totalram >> 20); if (!force && ram_needed > (i.totalram * 25 / 100)) { fprintf(stderr, "Proportion of main memory needed for flashcache metadata is high.\n"); fprintf(stderr, "You can reduce this with a smaller cache or a larger blocksize.\n"); check_sure(); } if (disk_associativity > associativity) { fprintf(stderr, "%s: Invalid Disk Associativity %ld\n", pname, disk_associativity); exit(1); } if (!force && cache_size > disk_devsize) { fprintf(stderr, "Size of cache volume (%s) is larger than disk volume (%s)\n", ssd_devname, disk_devname); check_sure(); } sprintf(dmsetup_cmd, "echo 0 %lu flashcache %s %s %s %d 2 %lu %lu %d %lu %d %lu" " | dmsetup create %s", disk_devsize, disk_devname, ssd_devname, cachedev, cache_mode, block_size, cache_size, associativity, disk_associativity, write_cache_only, md_block_size, cachedev); /* Go ahead and create the cache. * XXX - Should use the device mapper library for this. */ load_module(); if (verbose) fprintf(stderr, "Creating FlashCache Volume : \"%s\"\n", dmsetup_cmd); ret = system(dmsetup_cmd); if (ret) { fprintf(stderr, "%s failed\n", dmsetup_cmd); exit(1); } return 0; } flashcache-3.1.3+git20150701/src/utils/flashcache_destroy.c000066400000000000000000000116701254507146700231430ustar00rootroot00000000000000/* * Copyright (c) 2010, Facebook, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * Neither the name Facebook nor the names of its contributors may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <fcntl.h> #include <errno.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/param.h> #include <linux/fs.h> #include <flashcache.h> int force = 0; void usage(char *pname) { fprintf(stderr, "Usage: %s ssd_devname\n", pname); #ifdef COMMIT_REV fprintf(stderr, "git commit: %s\n", COMMIT_REV); #endif exit(1); } char *pname; char *sb_buf; char *buf; int main(int argc, char **argv) { int cache_fd, c; char *ssd_devname; struct flash_superblock *sb; u_int64_t md_block_bytes = 0; u_int64_t md_slots_per_block = 0; u_int64_t cache_size = 0; int dirty_blocks = 0; pname = argv[0]; while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = 1; break; case '?': usage(pname); } } if (optind == argc) usage(pname); ssd_devname = argv[optind++]; cache_fd = open(ssd_devname, O_RDWR); if (cache_fd < 0) { fprintf(stderr, "Failed to open %s\n", ssd_devname); exit(1); } lseek(cache_fd, 0, SEEK_SET); sb_buf = (char *)malloc(512); if (!sb_buf) { fprintf(stderr, "Failed to allocate sector buffer\n"); exit(1); } if (read(cache_fd, sb_buf, 512) < 0) { fprintf(stderr, "Cannot read Flashcache superblock %s\n", ssd_devname); exit(1); } sb = (struct flash_superblock *)sb_buf; if (!(sb->cache_sb_state == CACHE_MD_STATE_DIRTY || sb->cache_sb_state == CACHE_MD_STATE_CLEAN || sb->cache_sb_state == CACHE_MD_STATE_FASTCLEAN || sb->cache_sb_state == CACHE_MD_STATE_UNSTABLE)) { fprintf(stderr, "%s: No valid Flashcache found on %s\n", pname, ssd_devname); exit(1); } /* Backwards compat, versions < 2 use a 1 sector metadata blocksize */ if (sb->cache_version == 1) sb->md_block_size = 1; cache_size = sb->size; md_block_bytes = sb->md_block_size * 512; lseek(cache_fd, md_block_bytes, SEEK_SET); /* lseek past the superblock to first MD slot */ md_slots_per_block = (md_block_bytes / (sizeof(struct flash_cacheblock))); buf = (char *)malloc(md_block_bytes); if (!buf) { fprintf(stderr, "Failed to allocate metadata block buffer\n"); exit(1); } while (cache_size > 0 && dirty_blocks == 0) { struct flash_cacheblock *next_ptr; int j, slots_read; if (cache_size < md_slots_per_block) slots_read = cache_size; else slots_read = md_slots_per_block; if (read(cache_fd, buf, md_block_bytes) < 0) { fprintf(stderr, "Cannot read Flashcache metadata %s\n", ssd_devname); exit(1); } next_ptr = (struct flash_cacheblock *)buf; for (j = 0 ; j < slots_read ; j++) { if (next_ptr->cache_state & DIRTY) { dirty_blocks++; break; }
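			/* a single DIRTY block is enough to veto the destroy
			 * (unless -f is given), so the scan stops at the
			 * first one found */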
next_ptr++; } cache_size -= slots_read; } if (dirty_blocks && !force) { fprintf(stderr, "%s: DIRTY BLOCKS EXIST ON %s, ABORTING CACHE DESTROY\n", pname, ssd_devname); fprintf(stderr, "%s: Use -f (force) to destroy cache with DIRTY blocks, BUT YOU WILL LOSE DATA GUARANTEED\n", pname); fprintf(stderr, "%s: To clean the DIRTY blocks, flashcache_load, then do_sync until all dirty blocks are cleaned\n", pname); exit(1); } fprintf(stderr, "%s: Destroying Flashcache found on %s. Any data will be lost !!\n", pname, ssd_devname); sb->cache_sb_state = 0; lseek(cache_fd, 0, SEEK_SET); if (write(cache_fd, sb_buf, 512) < 0) { fprintf(stderr, "Cannot write Flashcache superblock %s\n", ssd_devname); exit(1); } return 0; } flashcache-3.1.3+git20150701/src/utils/flashcache_load.c000066400000000000000000000126431254507146700223720ustar00rootroot00000000000000/* * Copyright (c) 2010, Facebook, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * Neither the name Facebook nor the names of its contributors may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <fcntl.h> #include <errno.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/param.h> #include <linux/fs.h> #include <flashcache.h> char buf[512]; char dmsetup_cmd[8192]; int verbose = 0; void usage(char *pname) { fprintf(stderr, "Usage: %s ssd_devname [cachedev]\n", pname); #ifdef COMMIT_REV fprintf(stderr, "git commit: %s\n", COMMIT_REV); #endif exit(1); } static int module_loaded(void) { FILE *fp; char line[8192]; int found = 0; fp = fopen("/proc/modules", "r"); if (fp == NULL) return 0; while (fgets(line, 8192, fp)) { char *s; s = strtok(line, " "); if (s && !strcmp(s, "flashcache")) { found = 1; break; } } fclose(fp); return found; } static void load_module(void) { if (module_loaded()) { if (verbose) fprintf(stderr, "Flashcache Module already loaded\n"); return; } if (verbose) fprintf(stderr, "Loading Flashcache Module\n"); system("modprobe flashcache"); if (!module_loaded()) { fprintf(stderr, "Could not load Flashcache Module\n"); exit(1); } } int main(int argc, char **argv) { int c, cache_fd, disk_fd; char *pname; char *disk_devname, *ssd_devname, *cachedev; struct flash_superblock *sb = (struct flash_superblock *)buf; sector_t disk_devsize, cache_devsize; int ret; int cache_mode; pname = argv[0]; while ((c = getopt(argc, argv, "v")) != -1) { switch (c) { case 'v': verbose = 1; break; case '?': usage(pname); } } if ((argc < 2) || (argc > 4)) { usage(pname); } ssd_devname = argv[optind++]; cache_fd = open(ssd_devname, O_RDONLY); if (cache_fd < 0) { fprintf(stderr, "Failed to open %s\n", ssd_devname); exit(1); } lseek(cache_fd, 0, SEEK_SET); if (read(cache_fd, buf, 512) < 0) { fprintf(stderr, "Cannot read Flashcache superblock %s\n", ssd_devname); exit(1); } if (!(sb->cache_sb_state == CACHE_MD_STATE_DIRTY || sb->cache_sb_state == CACHE_MD_STATE_CLEAN || sb->cache_sb_state == CACHE_MD_STATE_FASTCLEAN || sb->cache_sb_state == CACHE_MD_STATE_UNSTABLE)) { fprintf(stderr, "%s: Invalid Flashcache superblock %s\n", pname, ssd_devname); exit(1); } if ((strncmp(sb->cache_devname, ssd_devname, DEV_PATHLEN) == 0) && (argc == 2)) { fprintf(stderr, "%s: Upgrading older v2 superblock format, please supply cachedev virtual device name\n", pname); usage(pname); } // switch to new vdev name if requested by load command if (optind == argc) { cachedev = sb->cache_devname; } else { cachedev = argv[optind]; } disk_devname = sb->disk_devname; disk_fd = open(disk_devname, O_RDONLY); if (disk_fd < 0) { fprintf(stderr, "%s: Failed to open %s\n", pname, disk_devname); exit(1); } if (ioctl(cache_fd, BLKGETSIZE, &cache_devsize) < 0) { fprintf(stderr, "%s: Cannot get cache size %s\n", pname, ssd_devname); exit(1); } if (ioctl(disk_fd, BLKGETSIZE, &disk_devsize) < 0) { fprintf(stderr, "%s: Cannot get disk size %s\n", pname, disk_devname); exit(1); } if (cache_devsize != sb->cache_devsize) { fprintf(stderr, "%s: Cache size mismatch, expect %lu, given %lu\n", pname, sb->cache_devsize, cache_devsize); exit(1); } if (disk_devsize != sb->disk_devsize) { fprintf(stderr, "%s: Disk size mismatch, expect %lu, given %lu\n", pname, sb->disk_devsize, disk_devsize); exit(1); } /* * Device Names and sizes match the ones stored in the cache superblock, * Go ahead and load the cache. * XXX - Should use the device mapper library for this.
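 *
 * A libdevmapper-based replacement would look roughly like the sketch
 * below (an untested illustration, not what this tool does today; "table"
 * stands for the same parameter string that is echoed into dmsetup):
 *
 *	struct dm_task *dmt = dm_task_create(DM_DEVICE_CREATE);
 *	dm_task_set_name(dmt, cachedev);
 *	dm_task_add_target(dmt, 0, disk_devsize, "flashcache", table);
 *	dm_task_run(dmt);
 *	dm_task_destroy(dmt);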
*/ cache_mode = FLASHCACHE_WRITE_BACK; sprintf(dmsetup_cmd, "echo 0 %lu flashcache %s %s %s %d 1 | dmsetup create %s", disk_devsize, disk_devname, ssd_devname, cachedev, cache_mode, cachedev); load_module(); if (verbose) fprintf(stderr, "Loading FlashCache Volume : %s\n", dmsetup_cmd); ret = system(dmsetup_cmd); if (ret) { fprintf(stderr, "%s failed\n", dmsetup_cmd); exit(1); } return 0; } flashcache-3.1.3+git20150701/src/utils/flashcache_scan000077500000000000000000000007741254507146700221630ustar00rootroot00000000000000#!/bin/sh PREREQ="mdadm udev" prereqs() { echo "$PREREQ" } case $1 in prereqs) prereqs exit 0 ;; esac . /scripts/functions log_begin_msg "Scanning for flashcache devices" echo "Waiting for udev to settle..." /sbin/udevadm settle --timeout=30 PARTITIONS=`cat /proc/partitions | awk '{ print $NF; }' | grep -v name` for P in $PARTITIONS; do if /sbin/flashcache_load "/dev/$P" 2> /dev/null; then echo "Loaded flashcache device from /dev/$P" fi done log_end_msg "Flashcache scanning done." exit 0 flashcache-3.1.3+git20150701/src/utils/flashcache_setioctl.c000066400000000000000000000072311254507146700232760ustar00rootroot00000000000000/* * Copyright (c) 2012, Dmitry Golubev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * Neither the name Facebook nor the names of its contributors may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
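 *
 * flashcache_setioctl below manages the flashcache module's per-process
 * cacheability lists through ioctls: -a adds a pid, -r removes one and
 * -c clears the whole list, with -w selecting the whitelist and -b the
 * blacklist. An illustrative invocation (device path assumed) would be:
 *
 *	flashcache_setioctl -a -w 1234 /dev/mapper/cachedev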
*/ #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <inttypes.h> #include <fcntl.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <flashcache_ioctl.h> void usage(char *pname) { fprintf(stderr, "Usage: %s (-c | -a | -r) (-b pid | -w pid) cachedev\n", pname); exit(1); } int main(int argc, char **argv) { int cache_fd, c, result; char action = ' ', list = ' ', *cachedev, *pname = argv[0]; intmax_t pidmax; char *tmp; pid_t pid; while ((c = getopt(argc, argv, "carb:w:")) != -1) { switch (c) { case 'c': action = 'c'; break; case 'a': action = 'a'; break; case 'r': action = 'r'; break; case 'b': list = 'b'; pidmax = strtoimax(optarg, &tmp, 10); if (tmp == optarg || *tmp != '\0' || pidmax != (pid_t)pidmax) { fprintf(stderr, "Bad PID!\n"); exit(1); } else { pid = (pid_t)pidmax; } break; case 'w': list = 'w'; pidmax = strtoimax(optarg, &tmp, 10); if (tmp == optarg || *tmp != '\0' || pidmax != (pid_t)pidmax) { fprintf(stderr, "Bad PID!\n"); exit(1); } else { pid = (pid_t)pidmax; } break; case '?': usage(pname); } } if (action == ' ') usage(pname); if (list == ' ') usage(pname); if (optind == argc) usage(pname); cachedev = argv[optind++]; cache_fd = open(cachedev, O_RDONLY); if (cache_fd < 0) { fprintf(stderr, "Failed to open %s\n", cachedev); exit(1); } if (list == 'w') { switch (action) { case 'a': result = ioctl(cache_fd, FLASHCACHEADDWHITELIST, &pid); break; case 'r': result = ioctl(cache_fd, FLASHCACHEDELWHITELIST, &pid); break; case 'c': result = ioctl(cache_fd, FLASHCACHEDELALLWHITELIST, &pid); break; } } else { switch (action) { case 'a': result = ioctl(cache_fd, FLASHCACHEADDBLACKLIST, &pid); break; case 'r': result = ioctl(cache_fd, FLASHCACHEDELBLACKLIST, &pid); break; case 'c': result = ioctl(cache_fd, FLASHCACHEDELALLBLACKLIST, &pid); break; } } close(cache_fd); if (result < 0) { fprintf(stderr, "ioctl failed on %s\n", cachedev); exit(1); } return 0; } flashcache-3.1.3+git20150701/src/utils/get_agsize.c000066400000000000000000000060671254507146700214360ustar00rootroot00000000000000/* * Copyright (c) 2012, Facebook, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * Neither the name Facebook nor the names of its contributors may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
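 *
 * get_agsize below picks an XFS allocation group count for a volume that
 * sits on a flashcache device: it searches agcount values whose resulting
 * allocation group size staggers the AG headers across the cache instead
 * of mapping them all onto the same cache blocks, scoring each candidate
 * by the distance between agsize % cache_size and cache_size / agcount
 * and keeping the smallest. (This summary is an interpretation of the
 * heuristic in the code, not taken from separate documentation.)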
*/ #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <limits.h> #define _GNU_SOURCE #include <getopt.h> void usage(char *pname) { fprintf(stderr, "%s: [-v] cache_size(in GB) vol_size(in GB)\n", pname); exit(1); } int main(int argc, char *argv[]) { size_t csize, vsize, agsize, t1, t2, t3, diff, best_agcount = 1; int agcount; int c, verbose = 0; char *pname; pname = argv[0]; while ((c = getopt(argc, argv, "v")) != -1) { switch (c) { case 'v': verbose = 1; break; case '?': usage(pname); } } if (optind == argc) usage(pname); csize = strtoul(argv[optind++], NULL, 0); if (optind == argc) usage(pname); if (!csize || csize == ULONG_MAX) usage(pname); vsize = strtoul(argv[optind], NULL, 0); if (!vsize || vsize < csize || vsize == ULONG_MAX) usage(pname); csize *= 1024; vsize *= 1024; /* convert to MiB */ diff = ULONG_MAX; for (agcount = 1; agcount < 30; agcount++) { t2 = csize / agcount; agsize = vsize / agcount; /* Max agsize is 1TB, find another agcount */ if (agsize >= 1024 * 1024) continue; /* agsize < 16GB, terminate search */ if (agsize < 16 * 1024) break; if (agsize < csize) t2 = (((double)(agcount - 1)) / agcount) * csize; t1 = agsize % csize; t3 = (t1 > t2) ? (t1 - t2) : (t2 - t1); if (t3 < diff) { diff = t3; best_agcount = agcount; } if (verbose) printf("agsize = %zu agcount = %d, t1=%zu t2=%zu\n", agsize/1024, agcount, t1/1024, t2/1024); } printf("best agsize = %zu agcount=%zu\n", vsize / (best_agcount * 1024), best_agcount); return 0; } flashcache-3.1.3+git20150701/utils/000077500000000000000000000000001254507146700163515ustar00rootroot00000000000000flashcache-3.1.3+git20150701/utils/dracut-flashcache-0.3-1.el6.noarch.rpm000066400000000000000000000374501254507146700250350ustar00rootroot00000000000000dracut-flashcache 0.3-1.el6 (noarch): Dracut modules to build a dracut initramfs with flashcache support. This package enables creating and starting flashcache in initrd so you can use flashcache on your root filesystem or physical volumes.
It also enables using flashcache on other disks which are set up during rc.sysinit (via udev).
License GPL, packager John Newbigin (jnewbigin@chrysocome.net), group System Environment/Base, built from dracut-flashcache-0.3-1.el6.src.rpm, requires /bin/sh, dracut and udev.
Changelog: 0.3 (fix up a few fc_scan bugs, add mode none), 0.2 (add scripts for the real udev), 0.1 (first cut).
Packaged files: /lib/udev/rules.d/10-flashcache.rules, /sbin/fc_scan, /usr/share/doc/dracut-flashcache-0.3/COPYING, /usr/share/doc/dracut-flashcache-0.3/README, /usr/share/dracut/modules.d/90flashcache/63-flashcache.rules, fc_scan, install, installkernel and parse-flashcache.sh.
[binary xz-compressed cpio payload of the rpm, not representable as text]
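The 90flashcache dracut module scripts themselves live inside that binary payload and are not recoverable here. As a rough idea only, a minimal EL6-era dracut module of the same shape, assuming the stock dracut helper functions dracut_install, inst_rules and instmods and the file names from the rpm listing, could look like this:

	# modules.d/90flashcache/install (hypothetical sketch)
	dracut_install /sbin/fc_scan
	inst_rules 63-flashcache.rules

	# modules.d/90flashcache/installkernel (hypothetical sketch)
	instmods flashcache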
flashcache-3.1.3+git20150701/utils/flashcache000077500000000000000000000041341254507146700203620ustar00rootroot00000000000000#!/bin/bash # # flashcache Init Script to manage cachedev loads # # chkconfig: 345 9 98 # description: Flashcache Management # Flashcache options # modify this before using this init script SSD_DISK= BACKEND_DISK= CACHEDEV_NAME= MOUNTPOINT= FLASHCACHE_NAME= # Just a check, to validate the above params are set [ -z "$SSD_DISK" ] && exit 10 [ -z "$BACKEND_DISK" ] && exit 11 [ -z "$CACHEDEV_NAME" ] && exit 12 [ -z "$MOUNTPOINT" ] && exit 13 [ -z "$FLASHCACHE_NAME" ] && exit 14 # Source function library. . /etc/rc.d/init.d/functions #globals DMSETUP=`/usr/bin/which dmsetup` SERVICE=flashcache FLASHCACHE_LOAD=/sbin/flashcache_load SUBSYS_LOCK=/var/lock/subsys/$SERVICE RETVAL=0 start() { echo "Starting Flashcache..." #Load the module /sbin/modprobe flashcache RETVAL=$? if [ $RETVAL -ne 0 ]; then echo "Module Load Error: flashcache. Exited with status - $RETVAL" exit $RETVAL fi #flashcache_load the cachedev $FLASHCACHE_LOAD $SSD_DISK $CACHEDEV_NAME RETVAL=$? if [ $RETVAL -ne 0 ]; then echo "Failed: flashcache_load $SSD_DISK $CACHEDEV_NAME" exit $RETVAL; fi #mount if [ -L /dev/mapper/$CACHEDEV_NAME ]; then /bin/mount /dev/mapper/$CACHEDEV_NAME $MOUNTPOINT RETVAL=$?
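# a failed mount aborts start() below; the cachedev mapping is left in place on purpose so it can be inspected before retrying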
if [ $RETVAL -ne 0 ]; then echo "Mount Failed: /dev/mapper/$CACHEDEV_NAME to $MOUNTPOINT" exit $RETVAL fi else echo "Not Found: /dev/mapper/$CACHEDEV_NAME" exit 1 fi #lock subsys touch $SUBSYS_LOCK } stop() { #unmount /bin/umount $MOUNTPOINT #check for force flag FLAG=0 [ "$1" == '--force' ] && FLAG=1 /sbin/sysctl -w dev.flashcache.$FLASHCACHE_NAME.fast_remove=$FLAG echo "Flushing flashcache: Flushes to $BACKEND_DISK" $DMSETUP remove $CACHEDEV_NAME #unlock subsys rm -f $SUBSYS_LOCK } status() { [ -f $SUBSYS_LOCK ] && echo "Flashcache status: loaded" || echo "Flashcache status: NOT loaded"; $DMSETUP status $CACHEDEV_NAME exit $? } case $1 in start) start ;; stop) stop ;; status) status ;; forcestop) stop --force ;; *) echo "Usage: $0 {start|stop|status}" exit 1 esac exit 0 flashcache-3.1.3+git20150701/utils/flashstat000077500000000000000000000355651254507146700203050ustar00rootroot00000000000000#!/usr/bin/perl -w #################################################################################### # creator: NinGoo # Description: a tool for flashcache status per second # created: 2012-01-04 # version: 0.3 # modified: # 2012-01-10 NinGoo version 0.2 add --nocolor option # 2012-01-11 NinGoo version 0.3 parse /proc/../flashcache_stats instead of dmsetup status # ##################################################################################### use POSIX qw(strftime); use strict; use Getopt::Long; use Term::ANSIColor; use File::Basename; Getopt::Long::Configure qw(no_ignore_case); $SIG{TERM} = $SIG{INT} = \&reset_color; sub reset_color { print YELLOW(),"\nExit Now...\n\n", RESET(); exit; } my %opt; # option parameters my %result; # result for status per second my %hit; # hit percent my %sysctl; # sysctl parameters my %dmsetup_table; # dmsetup table info my %status; # flashcache status info my $dev = "/dev/mapper/cachedev"; my $flashcache_stats_old_version = "/proc/flashcache_stats"; # for old version, this path is right my $flashcache_stats_new_version; # for new version, get path in get_sysctl() my $format_title = "%14s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %6s %6s %6s\n"; my $interval = 1; $interval = 5 if(-f $flashcache_stats_old_version); # for old versions, using dmsetup status, default interval is set to 5s my $count = 0; get_options(); get_dmsetup_table(); get_sysctl(); get_status(); print_header(); my $n = 0; while(1){ if($n % 20 == 0){ # print title every 20 lines print YELLOW(), BOLD(); printf $format_title, "time", "read/s", "write/s", "diskr/s", "diskw/s", "ssdr/s", "ssdw/s", "uread/s", "uwrit/s", "metaw/s", "clean/s","repl/s","wrepl/s", "hit%", "whit%", "dwhit%"; print RESET(); } my %status_old = %status; sleep($interval); get_status(); # calculate status per second foreach (keys(%status)){ $result{$_} = ($status{$_} - $status_old{$_}) / ($interval + 0.00001); } # calculate hit percent $result{read_hit_percent} = sprintf "%d", ($result{read_hits} * 100) / ($result{reads} + 0.0001); $result{write_hit_percent} = sprintf "%d", ($result{write_hits} * 100) / ($result{writes} + 0.0001); $result{dirty_write_hit_percent} = sprintf "%d", ($result{dirty_write_hits} * 100) / ($result{writes} + 0.0001); # print value print YELLOW(); printf "%14s ", get_current_time(); print RESET(); $result{reads} > 10000 ? print RED() : print WHITE(); printf "%7d ", $result{reads} and print RESET(); $result{writes} > 10000 ? print RED() : print WHITE(); printf "%7d ", $result{writes} and print RESET(); $result{disk_reads} > 1000 ?
print RED() : print GREEN(); printf "%7d ", $result{disk_reads} and print RESET(); $result{disk_writes} > 1000 ? print RED() : print GREEN(); printf "%7d ", $result{disk_writes} and print RESET(); $result{ssd_reads} > 10000 ? print RED() : print WHITE(); printf "%7d ", $result{ssd_reads} and print RESET(); $result{ssd_writes} > 10000 ? print RED() : print WHITE(); printf "%7d ", $result{ssd_writes} and print RESET(); $result{uncached_reads} > 100 ? print RED() : print GREEN(); printf "%7d ", $result{uncached_reads} and print RESET(); $result{uncached_writes} > 100 ? print RED() : print GREEN(); printf "%7d ", $result{uncached_writes} and print RESET(); $result{metadata_ssd_writes} > 100 ? print RED() : print WHITE(); printf "%7d ", $result{metadata_ssd_writes} and print RESET(); $result{cleanings} > 100 ? print RED() : print WHITE(); printf "%7d ", $result{cleanings} and print RESET(); $result{replacement} > 100 ? print RED() : print WHITE(); printf "%7d ", $result{replacement} and print RESET(); $result{write_replacement} > 100 ? print RED() : print WHITE(); printf "%7d ", $result{write_replacement} and print RESET(); $result{read_hit_percent} < 90 ? print RED() : print GREEN(); printf "%6s ", $result{read_hit_percent}."|".$hit{read_hit_percent} and print RESET(); $result{write_hit_percent} < 90 ? print RED() : print GREEN(); printf "%6s ", $result{write_hit_percent}."|".$hit{write_hit_percent} and print RESET(); $result{dirty_write_hit_percent} < 90 ? print RED() : print GREEN(); printf "%6s ", $result{dirty_write_hit_percent}."|".$hit{dirty_write_hit_percent} and print RESET(); print "\n"; print RESET(); $n++; exit if($count > 0 && $n >= $count); } ############################################################## # get sysctl parameter of flashcache ############################################################## sub get_sysctl{ chomp(my $tmp = `sudo /sbin/sysctl -a | grep flashcache`); my @lines = split(/\n/, $tmp); foreach my $line (@lines){ if($line =~ /\+/){ # for new version of flashcache sysctl has per ssd+disk dev parameter my $dev_device = basename($dmsetup_table{ssd_dev})."+".basename($dmsetup_table{disk_dev}); $dev_device =~ s/\/dev\///g; $flashcache_stats_new_version = "/proc/flashcache/".$dev_device."/flashcache_stats"; next if($line !~ /\Q$dev_device\E/); } if($line =~ /cache_all/){ $sysctl{cache_all} = (split(/=/, $line))[1]; $sysctl{cache_all} =~ s/^\s+//; } elsif($line =~ /reclaim_policy/){ my $policy = (split(/=/, $line))[1]; $policy =~ s/\s+//; $sysctl{reclaim_policy} = $policy eq '0'? 
'FIFO' : 'LRU'; } elsif($line =~ /dirty_thresh_pct/){ $sysctl{dirty_thresh_pct} = (split(/=/, $line))[1]; $sysctl{dirty_thresh_pct} =~ s/^\s+//; } elsif($line =~ /max_clean_ios_set/){ $sysctl{max_clean_ios_set} = (split(/=/, $line))[1]; $sysctl{max_clean_ios_set} =~ s/^\s+//; } elsif($line =~ /max_clean_ios_total/){ $sysctl{max_clean_ios_total} = (split(/=/, $line))[1]; $sysctl{max_clean_ios_total} =~ s/^\s+//; } } } ############################################################## # get status for flashcache device, using /proc/../flashcache instead of dmsetup status ############################################################### sub get_status{ if(-f $flashcache_stats_old_version){ get_dmsetup_status(); return; } # new version using /proc/flashcache/dev+dev/flashcache_stats if(defined($dmsetup_table{ssd_dev}) && defined($dmsetup_table{disk_dev})){ my @stats = split('\s+', `cat $flashcache_stats_new_version`); foreach (@stats){ my @kv = split('=', $_); $status{$kv[0]} = $kv[1]; } $hit{read_hit_percent} = $status{read_hit_percent} if(defined($status{read_hit_percent})); $hit{write_hit_percent} = $status{write_hit_percent} if(defined($status{write_hit_percent})); $hit{dirty_write_hit_percent} = $status{dirty_write_hit_percent} if(defined($status{dirty_write_hit_percent})); } } ############################################################## # get dmsetup status for flashcache device, for old version use only ############################################################### sub get_dmsetup_status{ my $flag = 0; chomp(my $tmp = `sudo dmsetup status $dev`); my @lines = split(/\n/,$tmp); foreach my $line(@lines){ $line =~ s/^\s+//g; if($line =~ m/^reads\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+)/g; $status{reads} = $1; $status{writes} = $2; } elsif($line =~ m/^disk reads\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+).*\((\d+).*\((\d+)/g; $status{disk_reads} = $1; $status{disk_writes} = $2; $status{ssd_reads} = $3; $status{ssd_writes} = $4; } elsif($line =~ m/^uncached reads\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+).*\((\d+)/g; $status{uncached_reads} = $1; $status{uncached_writes} = $2; $status{uncached_requeue} = $3; } elsif($line =~ m/^metadata batch\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+)/g; $status{metadata_ssd_writes} = $2; } elsif($line =~ m/^cleanings\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+)/g; $status{cleanings} = $1; } elsif($line =~ m/^replacement\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+)/g; $status{replacement} = $1; $status{write_replacement} = $2; } elsif($line =~ m/^read hits\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+)/g; $status{read_hits} = $1; $hit{read_hit_percent} = $2; } elsif($line =~ m/^write hits\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+)/g; $status{write_hits} = $1; $hit{write_hit_percent} = $2; } elsif($line =~ m/^dirty write hits\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+)/g; $status{dirty_write_hits} = $1; $hit{dirty_write_hit_percent} = $2; } } exit if($flag == 0); } ############################################################## ## get dmsetup table for flashcache device ############################################################### sub get_dmsetup_table{ my $flag = 0; chomp(my $tmp = `sudo dmsetup table $dev`); my @lines = split(/\n/, $tmp); foreach my $line (@lines){ $line =~ s/^\s+//g; if($line =~ m/^ssd dev \(.*\)/){ $flag = 1; if($line =~ /cache mode/){ # for new version of flashcache, get cache mode $line =~ m/cache mode\((\w+)/; $dmsetup_table{cache_mode} = $1; } #$line =~ m/(\/\w+\/\w+).*\((\/\w+\/\w+)/; # bugfix for Issue #1 of 
https://github.com/NinGoo/flashstat $line =~ m/(\/[^\s]{1,})\).*\((\/[^\s]{1,})\)/; $dmsetup_table{ssd_dev} = $1; $dmsetup_table{disk_dev} = $2; } elsif($line =~ m/^capacity\(.*\)/){ $flag = 1; if($line =~ /metadata block size\(.*\)/){ $line =~ m/(\d+\w).*\((\d+).*\((\d+\w).*\((\d+\w)/; $dmsetup_table{capacity} = $1; $dmsetup_table{associativity} = $2; $dmsetup_table{block_size} = $3; $dmsetup_table{metadata_block_size} = $4; } else{ $line =~ m/(\d+\w).*\((\d+).*\((\d+\w)/; $dmsetup_table{capacity} = $1; $dmsetup_table{associativity} = $2; $dmsetup_table{block_size} = $3; } } elsif($line =~ m/^total blocks\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+).*\((\d+)/; $dmsetup_table{total_blocks} = $1; $dmsetup_table{cached_blocks} = $2; $dmsetup_table{cached_percent} = $3; } elsif($line =~ m/^dirty blocks\(.*\)/){ $flag = 1; $line =~ m/(\d+).*\((\d+)/; $dmsetup_table{dirty_blocks} = $1; $dmsetup_table{dirty_percent} = $2; } elsif($line =~ /skip sequential thresh\(.*\)/){ $flag = 1; $line =~ /(\d+\w)/; $dmsetup_table{skip_sequential_thresh} = $1; } } exit if($flag == 0); } ############################################################## # get current time ############################################################### sub get_current_time{ return strftime("%m-%d %H:%M:%S",localtime); } ############################################################## ## get options ############################################################### sub get_options{ GetOptions(\%opt, 'h|help', 'i|interval=i', 'c|count=i', 'd|device=s', 'n|nocolor', ); $opt{'h'} and print_usage(); $opt{'i'} and $interval = $opt{'i'}; $opt{'c'} and $count = $opt{'c'}; $opt{'d'} and $dev = $opt{'d'}; if(!defined($opt{'n'})){ import Term::ANSIColor ':constants'; } else{ *RESET = sub { }; *YELLOW = sub { }; *RED = sub { }; *GREEN = sub { }; *WHITE = sub { }; *BOLD = sub { }; } } ############################################################## # print help information ############################################################### sub print_usage{ print << "EOF";
Usage: flashstat [-h] [-n] [-i interval] [-c count] [-d device]
    -h, --help       print this help text
    -i, --interval   refresh interval in seconds (default 1; 5 when the old /proc/flashcache_stats interface is used)
    -c, --count      number of samples to print before exiting, 0 means run forever (default 0)
    -d, --device     flashcache device to watch (default /dev/mapper/cachedev)
    -n, --nocolor    disable colored output
EOF
exit; }