==== dlm-4.3.0/.gitignore ====

==== dlm-4.3.0/Makefile ====

CURDIR = $(shell pwd)
ARCH = $(shell uname -m)

all install clean: %:
	set -e; for d in libdlm dlm_controld dlm_tool fence; do $(MAKE) -C $$d $@; done
	rm -rf dlm.spec dlm-*.tar.gz *rpm* noarch $(ARCH)
# clean .version only if we are within a git repo otherwise
# we cannot regenerate the .version information
	if [ -d ".git" ]; then rm -f .version; fi

check:
	@echo $@ not supported

distcheck:
	@echo $@ not supported

# version file can only be generated within a .git checkout
.PHONY: .version
.version: clean include/version.cf
	if [ -d ".git" ]; then \
		rm -f $@-t $@; \
		relver="`git tag |grep ^dlm| sort -ur | head -n 1 | sed -e 's#dlm-##g'`"; \
		numcomm="`git log dlm-$$relver..HEAD | grep ^commit | wc -l`"; \
		alphatag="`git rev-parse --short HEAD`"; \
		rpmdate="`date "+%a %b %d %Y"`"; \
		echo relver=\"$$relver\" >> $@-t; \
		echo numcomm=\"$$numcomm\" >> $@-t; \
		echo alphatag=\"$$alphatag\" >> $@-t; \
		echo rpmdate=\"$$rpmdate\" >> $@-t; \
		chmod a-w $@-t; \
		mv $@-t $@; \
		rm -f $@-t*; \
	fi

dlm.spec: .version dlm.spec.in
	rm -f $@-t $@
	. ./.version; \
	cat $@.in | sed \
		-e "s#@relver@#$$relver#g" \
		-e "s#@rpmdate@#$$rpmdate#g" \
		-e "s#@numcomm@#$$numcomm#g" \
		-e "s#@alphatag@#$$alphatag#g" \
		> $@-t;
# if there is no .git directory, this is a release tarball
# drop information about number of commits and alphatag
# from spec file.
	if [ ! -d ".git" ]; then \
		sed -i -e "s#%glo.*numcomm.*##g" $@-t; \
		sed -i -e "s#%glo.*alphatag.*##g" $@-t; \
	fi
	chmod a-w $@-t
	mv $@-t $@
	rm -f $@-t*

tarball: dlm.spec .version
	. ./.version; \
	if [ ! -d ".git" ] || [ "$(RELEASE)" = 1 ]; then \
		tarver=$$relver; \
	else \
		tarver="`echo $$relver.$$numcomm.$$alphatag`"; \
	fi; \
	touch dlm-$$tarver.tar.gz; \
	tar -zcp \
		--transform "s/^./dlm-$$relver/" \
		-f dlm-$$tarver.tar.gz \
		--exclude=dlm-$$tarver.tar.gz \
		--exclude=.git \
		.
RPMBUILDOPTS = --define "_sourcedir $(CURDIR)" \
	       --define "_specdir $(CURDIR)" \
	       --define "_builddir $(CURDIR)" \
	       --define "_srcrpmdir $(CURDIR)" \
	       --define "_rpmdir $(CURDIR)"

srpm: tarball dlm.spec
	rpmbuild $(RPMBUILDOPTS) --nodeps -bs dlm.spec

rpm: tarball dlm.spec
	rpmbuild $(RPMBUILDOPTS) -ba dlm.spec

==== dlm-4.3.0/README.license ====

LGPLv2+
libdlm/libdlm.c
libdlm/libdlm.h
libdlm/libdlm_internal.h

GPLv2
dlm_controld/list.h (copied from linux kernel)
dlm_controld/linux_helpers.h (various GPLv2 functions copied from linux kernel)

GPLv2+
dlm_controld/rbtree.c (copied from linux kernel)
dlm_controld/rbtree.h (copied from linux kernel)
dlm_controld/rbtree_augmented.h (copied from linux kernel)
dlm_controld/rbtree_types.h (copied from linux kernel)

all other original files

==== dlm-4.3.0/README.rst ====

| ``See https://pagure.io/dlm``
| ``See dlm_controld(8) at dlm.git/dlm_controld/dlm_controld.8``
| ``See dlm.conf(5) at dlm.git/dlm_controld/dlm.conf.5``

==== dlm-4.3.0/dlm.spec.in ====

%global alphatag @alphatag@
%global numcomm @numcomm@

Name: dlm
Version: @relver@
Release: 1%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dist}
License: GPLv2 and GPLv2+ and LGPLv2+
# For a breakdown of the licensing, see README.license
Summary: dlm control daemon and tool
URL: https://pagure.io/dlm

BuildRequires: gcc
%if 0%{?suse_version}
BuildRequires: linux-kernel-headers
%else
BuildRequires: glibc-kernheaders
%endif
BuildRequires: corosynclib-devel >= 3.1.0
%if 0%{?suse_version}
%if 0%{?suse_version} > 1500
BuildRequires: libpacemaker3-devel >= 1.1.7
%else
BuildRequires: libpacemaker-devel >= 1.1.7
%endif
%else
BuildRequires: pacemaker-libs-devel >= 1.1.7
%endif
BuildRequires: systemd-devel
BuildRequires: make

Source0: https://releases.pagure.org/dlm/%{name}-%{version}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}.tar.gz

Requires: %{name}-lib = %{version}-%{release}
Requires: corosync >= 3.1.0
%{?fedora:Requires: kernel-modules-extra}
%{?suse_version:Requires: dlm-kmp}

%if %{defined systemd_requires}
%systemd_requires
%endif

%description
The kernel dlm requires a user daemon to control membership.

%prep
%setup -q

%build
# upstream does not require configure
# upstream does not support parallel builds
%set_build_flags
%make_build -j1

%install
%make_install LIBDIR=%{_libdir}
install -Dm 0644 init/dlm.service %{buildroot}%{_unitdir}/dlm.service
install -Dm 0644 init/dlm.sysconfig %{buildroot}/etc/sysconfig/dlm

%post
%systemd_post dlm.service

%preun
%systemd_preun dlm.service

%postun
%systemd_postun_with_restart dlm.service

%files
%doc README.license
%{_unitdir}/dlm.service
%{_sbindir}/dlm_controld
%{_sbindir}/dlm_tool
%{_sbindir}/dlm_stonith
%{_mandir}/man8/dlm*
%{_mandir}/man5/dlm*
%{_mandir}/man3/*dlm*
%config(noreplace) %{_sysconfdir}/sysconfig/dlm

%package lib
Summary: Library for %{name}

%description lib
The %{name}-lib package contains the libraries needed to use the dlm
from userland applications.

%ldconfig_scriptlets lib

%files lib
/usr/lib/udev/rules.d/*-dlm.rules
%{_libdir}/libdlm*.so.*

%package devel
Summary: Development files for %{name}
Requires: %{name}-lib = %{version}-%{release}

%description devel
The %{name}-devel package contains libraries and header files for
developing applications that use %{name}.
%files devel
%{_libdir}/libdlm*.so
%{_includedir}/libdlm*.h
%{_libdir}/pkgconfig/*.pc

%changelog
* @rpmdate@ Autogenerated version - @relver@-1%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}
- Autogenerated spec file

==== dlm-4.3.0/dlm_controld/Makefile ====

DESTDIR=
PREFIX=/usr
LIBNUM=/lib64
BINDIR=$(PREFIX)/sbin
LIBDIR=$(PREFIX)/$(LIBNUM)
HDRDIR=$(PREFIX)/include
MANDIR=$(PREFIX)/share/man
PKGDIR=$(LIBDIR)/pkgconfig
USE_SD_NOTIFY=yes

BIN_TARGET = dlm_controld

LIB_NAME = libdlmcontrol
LIB_MAJOR = 3
LIB_MINOR = 2
LIB_SO = $(LIB_NAME).so
LIB_SMAJOR = $(LIB_SO).$(LIB_MAJOR)
LIB_TARGET = $(LIB_SO).$(LIB_MAJOR).$(LIB_MINOR)
LIB_PC = $(LIB_NAME).pc
LIB_PCIN = $(LIB_NAME).pc.in

BIN_SOURCE = action.c \
	     cpg.c \
	     daemon_cpg.c \
	     helper.c \
	     crc.c \
	     fence_config.c \
	     fence.c \
	     main.c \
	     plock.c \
	     config.c \
	     member.c \
	     logging.c \
	     rbtree.c \
	     node_config.c
LIB_SOURCE = lib.c

CFLAGS += -D_GNU_SOURCE -O2 -ggdb \
	-Wall -Wformat -Wformat-security -Wmissing-prototypes -Wnested-externs \
	-Wextra -Wshadow -Wcast-align -Wwrite-strings \
	-Waggregate-return -Wstrict-prototypes -Winline -Wredundant-decls \
	-Wno-sign-compare -Wno-unused-parameter -Wp,-D_FORTIFY_SOURCE=2 \
	-fexceptions -fasynchronous-unwind-tables -fdiagnostics-show-option \
	-Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong \
	-fstack-clash-protection

BIN_CFLAGS += $(CFLAGS) -fPIE -DPIE
BIN_CFLAGS += -I../include -I../libdlm

LIB_CFLAGS += $(CFLAGS) -fPIC
# Temporary disable annobin plugin
#LIB_CFLAGS += -fplugin=annobin

BIN_LDFLAGS += $(LDFLAGS) -Wl,-z,relro -Wl,-z,now -pie
BIN_LDFLAGS += -lpthread -lrt -luuid
LIB_LDFLAGS += $(LDFLAGS) -Wl,-z,relro -Wl,-z,now -pie

PKG_CONFIG ?= pkg-config
PKG_CONFIG_FLAGS := --errors-to-stdout

ifeq ($(USE_SD_NOTIFY),yes)
SYSTEMD_CFLAGS := $(shell $(PKG_CONFIG) $(PKG_CONFIG_FLAGS) --cflags libsystemd) \
		  -DUSE_SD_NOTIFY
SYSTEMD_CFLAGS_STATUS := $(.SHELLSTATUS)
SYSTEMD_LDFLAGS := $(shell $(PKG_CONFIG) $(PKG_CONFIG_FLAGS) --libs libsystemd)
SYSTEMD_LDFLAGS_STATUS := $(.SHELLSTATUS)
else
SYSTEMD_CFLAGS_STATUS := 0
SYSTEMD_LDFLAGS_STATUS := 0
endif

CPG_CFLAGS := $(shell $(PKG_CONFIG) $(PKG_CONFIG_FLAGS) --cflags libcpg libcmap libcfg "libquorum >= 3.1.0")
CPG_CFLAGS_STATUS := $(.SHELLSTATUS)
CPG_LDFLAGS += $(shell $(PKG_CONFIG) $(PKG_CONFIG_FLAGS) --libs libcpg libcmap libcfg "libquorum >= 3.1.0")
CPG_LDFLAGS_STATUS := $(.SHELLSTATUS)

all: $(LIB_TARGET) $(BIN_TARGET) $(LIB_PC)

$(BIN_TARGET): $(BIN_SOURCE)
ifneq ($(CPG_CFLAGS_STATUS),0)
	$(error "Failed to call pkg-config for corosync cflags: $(CPG_CFLAGS)")
endif
ifneq ($(CPG_LDFLAGS_STATUS),0)
	$(error "Failed to call pkg-config for corosync ldflags: $(CPG_LDFLAGS)")
endif
ifneq ($(SYSTEMD_CFLAGS_STATUS),0)
	$(error "Failed to call pkg-config for systemd cflags: $(SYSTEMD_CFLAGS)")
endif
ifneq ($(SYSTEMD_LDFLAGS_STATUS),0)
	$(error "Failed to call pkg-config for systemd ldflags: $(SYSTEMD_LDFLAGS)")
endif
	$(CC) $(BIN_SOURCE) $(BIN_CFLAGS) $(CPG_CFLAGS) $(SYSTEMD_CFLAGS) $(BIN_LDFLAGS) $(CPG_LDFLAGS) $(SYSTEMD_LDFLAGS) -o $@ -L.
$(LIB_TARGET): $(LIB_SOURCE)
	$(CC) $^ $(LIB_CFLAGS) $(LIB_LDFLAGS) -shared -o $@ -Wl,-soname=$(LIB_SMAJOR)
	ln -sf $(LIB_TARGET) $(LIB_SO)
	ln -sf $(LIB_TARGET) $(LIB_SMAJOR)

$(LIB_PC): $(LIB_PCIN)
	cat $(LIB_PCIN) | sed -e 's#@PREFIX@#$(PREFIX)#g;s#@LIBDIR@#$(LIBDIR)#g' > $@

clean:
	rm -f *.o *.so *.so.* $(BIN_TARGET) $(LIB_TARGET) *.pc

INSTALL=$(shell which install)

.PHONY: install
install: all
	$(INSTALL) -d $(DESTDIR)/$(BINDIR)
	$(INSTALL) -d $(DESTDIR)/$(LIBDIR)
	$(INSTALL) -d $(DESTDIR)/$(HDRDIR)
	$(INSTALL) -d $(DESTDIR)/$(MANDIR)/man8
	$(INSTALL) -d $(DESTDIR)/$(MANDIR)/man5
	$(INSTALL) -d $(DESTDIR)/$(PKGDIR)
	$(INSTALL) -m 755 $(BIN_TARGET) $(DESTDIR)/$(BINDIR)
	$(INSTALL) -m 755 $(LIB_TARGET) $(DESTDIR)/$(LIBDIR)
	cp -a $(LIB_SO) $(DESTDIR)/$(LIBDIR)
	cp -a $(LIB_SMAJOR) $(DESTDIR)/$(LIBDIR)
	$(INSTALL) -m 644 $(LIB_PC) $(DESTDIR)/$(PKGDIR)
	$(INSTALL) -m 644 libdlmcontrol.h $(DESTDIR)/$(HDRDIR)
	$(INSTALL) -m 644 dlm_controld.8 $(DESTDIR)/$(MANDIR)/man8/
	$(INSTALL) -m 644 dlm.conf.5 $(DESTDIR)/$(MANDIR)/man5/

==== dlm-4.3.0/dlm_controld/action.c ====

/*
 * Copyright 2004-2012 Red Hat, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v2 or (at your option) any later version.
 */

#include "dlm_daemon.h"
#include <corosync/corotypes.h>
#include <corosync/cmap.h>

static int dir_members[MAX_NODES];
static int dir_members_count;
static int comms_nodes[MAX_NODES];
static int comms_nodes_count;

#define DLM_SYSFS_DIR "/sys/kernel/dlm"
#define CLUSTER_DIR   "/sys/kernel/config/dlm/cluster"
#define SPACES_DIR    "/sys/kernel/config/dlm/cluster/spaces"
#define COMMS_DIR     "/sys/kernel/config/dlm/cluster/comms"

static int detect_protocol(void)
{
	cmap_handle_t handle;
	char *str = NULL;
	int rv, proto = -1;

	rv = cmap_initialize(&handle);
	if (rv != CS_OK) {
		log_error("cmap_initialize error %d", rv);
		return -1;
	}

	rv = cmap_get_string(handle, "totem.rrp_mode", &str);
	if (rv != CS_OK)
		goto out;

	log_debug("cmap totem.rrp_mode = '%s'", str);

	if (!strcmp(str, "none"))
		proto = PROTO_TCP;
	else
		proto = PROTO_SCTP;
 out:
	if (str)
		free(str);
	cmap_finalize(handle);
	return proto;
}

static int detect_cluster_name(void)
{
	cmap_handle_t handle;
	char *str = NULL;
	int rv, err = -1;

	rv = cmap_initialize(&handle);
	if (rv != CS_OK) {
		log_error("cmap_initialize error %d", rv);
		return -1;
	}

	rv = cmap_get_string(handle, "totem.cluster_name", &str);
	if (rv != CS_OK) {
		log_error("cmap_get_string totem.cluster_name error %d", rv);
		goto out;
	} else
		err = 0;

	log_debug("cmap totem.cluster_name = '%s'", str);

	strncpy(cluster_name, str, DLM_LOCKSPACE_LEN);
 out:
	if (str)
		free(str);
	cmap_finalize(handle);
	return err;
}

/* This is for the case where dlm_controld exits/fails, abandoning dlm
   lockspaces in the kernel, and then dlm_controld is restarted.  When
   dlm_controld exits and abandons lockspaces, that node needs to be
   rebooted to clear the uncontrolled lockspaces from the kernel. */
int check_uncontrolled_lockspaces(void)
{
	DIR *d;
	struct dirent *de;
	int count = 0;

	d = opendir(DLM_SYSFS_DIR);
	if (!d)
		return 0;

	while ((de = readdir(d))) {
		if (de->d_name[0] == '.')
			continue;

		log_error("found uncontrolled lockspace %s", de->d_name);
		count++;
	}
	closedir(d);

	if (count) {
		kick_node_from_cluster(our_nodeid);
		return -1;
	}
	return 0;
}

static int do_sysfs(const char *name, const char *file, char *val)
{
	char fname[512];
	int rv, fd;

	sprintf(fname, "%s/%s/%s", DLM_SYSFS_DIR, name, file);

	fd = open(fname, O_WRONLY);
	if (fd < 0) {
		log_error("open \"%s\" error %d %d", fname, fd, errno);
		return -1;
	}

	log_debug("write \"%s\" to \"%s\"", val, fname);

	rv = do_write(fd, val, strlen(val) + 1);
	close(fd);
	return rv;
}

int set_sysfs_control(char *name, int val)
{
	char buf[32];

	memset(buf, 0, sizeof(buf));
	snprintf(buf, 32, "%d", val);

	return do_sysfs(name, "control", buf);
}

int set_sysfs_event_done(char *name, int val)
{
	char buf[32];

	memset(buf, 0, sizeof(buf));
	snprintf(buf, 32, "%d", val);

	return do_sysfs(name, "event_done", buf);
}

int set_sysfs_id(char *name, uint32_t id)
{
	char buf[32];

	memset(buf, 0, sizeof(buf));
	snprintf(buf, 32, "%u", id);

	return do_sysfs(name, "id", buf);
}

int set_sysfs_nodir(char *name, int val)
{
	char buf[32];

	memset(buf, 0, sizeof(buf));
	snprintf(buf, 32, "%d", val);

	return do_sysfs(name, "nodir", buf);
}

static int update_dir_members(char *name)
{
	char path[PATH_MAX];
	DIR *d;
	struct dirent *de;
	int i = 0;

	memset(path, 0, PATH_MAX);
	snprintf(path, PATH_MAX, "%s/%s/nodes", SPACES_DIR, name);

	d = opendir(path);
	if (!d) {
		log_debug("%s: opendir failed: %d", path, errno);
		return -1;
	}

	memset(dir_members, 0, sizeof(dir_members));
	dir_members_count = 0;

	/* FIXME: we should probably read the nodeid in each dir instead */

	while ((de = readdir(d))) {
		if (de->d_name[0] == '.')
			continue;
		dir_members[i++] = atoi(de->d_name);
		log_debug("dir_member %d", dir_members[i-1]);
	}
	closedir(d);

	dir_members_count = i;
	return 0;
}

static int id_exists(int id, int count, int *array)
{
	int i;
	for (i = 0; i < count; i++) {
		if (array[i] == id)
			return 1;
	}
	return 0;
}

static int create_path(const char *path)
{
	mode_t old_umask;
	int rv;

	old_umask = umask(0022);
	rv = mkdir(path, 0777);
	if (rv < 0 && errno == EEXIST)
		rv = 0;
	if (rv < 0)
		log_error("%s: mkdir failed: %d", path, errno);
	umask(old_umask);
	return rv;
}

int path_exists(const char *path)
{
	struct stat buf;

	if (stat(path, &buf) < 0) {
		if (errno != ENOENT)
			log_error("%s: stat failed: %d", path, errno);
		return 0;
	}
	return 1;
}
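/* Illustrative sketch only (not part of the daemon): the order in which
   the sysfs helpers above are typically driven for one membership change,
   per the stop/start barrier described in cpg.c.  Error handling omitted. */
#if 0
static void example_change_cycle(struct lockspace *ls)
{
	set_sysfs_control(ls->name, 0);		/* stop kernel locking */
	/* ... exchange start messages, update configfs members ... */
	set_sysfs_control(ls->name, 1);		/* restart, begin recovery */
	if (ls->joining)
		set_sysfs_event_done(ls->name, 0); /* unblock dlm_new_lockspace() */
}
#endif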
/* The "renew" nodes are those that have left and rejoined since the last
   call to set_members().  We rmdir/mkdir for these nodes so dlm-kernel
   can notice they've left and rejoined. */

int set_configfs_members(struct lockspace *ls, char *name,
			 int new_count, int *new_members,
			 int renew_count, int *renew_members)
{
	char path[PATH_MAX];
	char buf[32];
	int i, w, fd, rv, id, old_count, *old_members;
	int do_renew;

	/*
	 * create lockspace dir if it doesn't exist yet
	 */

	memset(path, 0, PATH_MAX);
	snprintf(path, PATH_MAX, "%s/%s", SPACES_DIR, name);

	if (!path_exists(path)) {
		if (create_path(path))
			return -1;
	}

	/*
	 * remove/add lockspace members
	 */

	rv = update_dir_members(name);
	if (rv)
		return rv;

	old_members = dir_members;
	old_count = dir_members_count;

	for (i = 0; i < old_count; i++) {
		id = old_members[i];
		if (id_exists(id, new_count, new_members))
			continue;

		memset(path, 0, PATH_MAX);
		snprintf(path, PATH_MAX, "%s/%s/nodes/%d", SPACES_DIR, name, id);

		log_debug("set_members rmdir \"%s\"", path);

		rv = rmdir(path);
		if (rv) {
			log_error("%s: rmdir failed: %d", path, errno);
			goto out;
		}
	}

	/*
	 * remove lockspace dir after we've removed all the nodes
	 * (when we're shutting down and adding no new nodes)
	 */

	if (!new_count) {
		memset(path, 0, PATH_MAX);
		snprintf(path, PATH_MAX, "%s/%s", SPACES_DIR, name);

		log_debug("set_members lockspace rmdir \"%s\"", path);

		rv = rmdir(path);
		if (rv)
			log_error("%s: rmdir failed: %d", path, errno);
	}

	for (i = 0; i < new_count; i++) {
		id = new_members[i];

		do_renew = 0;

		if (id_exists(id, renew_count, renew_members))
			do_renew = 1;
		else if (id_exists(id, old_count, old_members))
			continue;

		if (!is_cluster_member(id))
			update_cluster();

		/*
		 * create node's dir
		 */

		memset(path, 0, PATH_MAX);
		snprintf(path, PATH_MAX, "%s/%s/nodes/%d", SPACES_DIR, name, id);

		if (do_renew) {
			log_debug("set_members renew rmdir \"%s\"", path);
			rv = rmdir(path);
			if (rv) {
				log_error("%s: renew rmdir failed: %d",
					  path, errno);
				/* don't quit here, there's a case where
				 * this can happen, where a node identified
				 * for renewal was not really added
				 * previously */
			}
		}

		log_debug("set_members mkdir \"%s\"", path);

		rv = create_path(path);
		if (rv)
			goto out;

		/*
		 * set node's nodeid
		 */

		memset(path, 0, PATH_MAX);
		snprintf(path, PATH_MAX, "%s/%s/nodes/%d/nodeid",
			 SPACES_DIR, name, id);

		rv = fd = open(path, O_WRONLY);
		if (rv < 0) {
			log_error("%s: open failed: %d", path, errno);
			goto out;
		}

		memset(buf, 0, 32);
		snprintf(buf, 32, "%d", id);

		rv = do_write(fd, buf, strlen(buf));
		if (rv < 0) {
			log_error("%s: write failed: %d, %s", path, errno, buf);
			close(fd);
			goto out;
		}
		close(fd);

		/*
		 * set node's weight
		 */

		w = get_weight(ls, id);

		memset(path, 0, PATH_MAX);
		snprintf(path, PATH_MAX, "%s/%s/nodes/%d/weight",
			 SPACES_DIR, name, id);

		rv = fd = open(path, O_WRONLY);
		if (rv < 0) {
			log_error("%s: open failed: %d", path, errno);
			goto out;
		}

		memset(buf, 0, 32);
		snprintf(buf, 32, "%d", w);

		rv = do_write(fd, buf, strlen(buf));
		if (rv < 0) {
			log_error("%s: write failed: %d, %s", path, errno, buf);
			close(fd);
			goto out;
		}
		close(fd);
	}

	rv = 0;
 out:
	return rv;
}

#if 0
char *str_ip(char *addr)
{
	static char ip[256];
	struct sockaddr_in *sin = (struct sockaddr_in *) addr;
	memset(ip, 0, sizeof(ip));
	inet_ntop(AF_INET, &sin->sin_addr, ip, 256);
	return ip;
}
#endif

static char *str_ip(char *addr)
{
	static char str_ip_buf[INET6_ADDRSTRLEN];
	struct sockaddr_storage *ss = (struct sockaddr_storage *)addr;
	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
	void *saddr;

	if (ss->ss_family == AF_INET6)
		saddr = &sin6->sin6_addr;
	else
		saddr = &sin->sin_addr;

	inet_ntop(ss->ss_family, saddr, str_ip_buf, sizeof(str_ip_buf));
	return str_ip_buf;
}
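/* Illustrative only: for a change from members {1,2,3} to {1,3,4} where
   node 3 left and rejoined (a "renew"), set_configfs_members() performs
   roughly this sequence under spaces/<name>/nodes/:

	rmdir nodes/2	(removed member)
	rmdir nodes/3	(renew: force dlm-kernel to see the leave)
	mkdir nodes/3	(renew: rejoin)
	mkdir nodes/4	(new member)

   plus writing each node's nodeid and weight files. */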
/* record the nodeids that are currently listed under
   config/dlm/cluster/comms/ so that we can remove all of them */

static int update_comms_nodes(void)
{
	char path[PATH_MAX];
	DIR *d;
	struct dirent *de;
	int i = 0;

	memset(path, 0, PATH_MAX);
	snprintf(path, PATH_MAX, "%s", COMMS_DIR);

	d = opendir(path);
	if (!d) {
		log_debug("%s: opendir failed: %d", path, errno);
		return -1;
	}

	memset(comms_nodes, 0, sizeof(comms_nodes));
	comms_nodes_count = 0;

	while ((de = readdir(d))) {
		if (de->d_name[0] == '.')
			continue;
		comms_nodes[i++] = atoi(de->d_name);
	}
	closedir(d);

	comms_nodes_count = i;
	return 0;
}

/* clear out everything under config/dlm/cluster/comms/ */

static void clear_configfs_comms(void)
{
	char path[PATH_MAX];
	int i, rv;

	rv = update_comms_nodes();
	if (rv < 0)
		return;

	for (i = 0; i < comms_nodes_count; i++) {
		memset(path, 0, PATH_MAX);
		snprintf(path, PATH_MAX, "%s/%d", COMMS_DIR, comms_nodes[i]);

		log_debug("clear_configfs_nodes rmdir \"%s\"", path);

		rv = rmdir(path);
		if (rv)
			log_error("%s: rmdir failed: %d", path, errno);
	}
}

static void clear_configfs_space_nodes(char *name)
{
	char path[PATH_MAX];
	int i, rv;

	rv = update_dir_members(name);
	if (rv < 0)
		return;

	for (i = 0; i < dir_members_count; i++) {
		memset(path, 0, PATH_MAX);
		snprintf(path, PATH_MAX, "%s/%s/nodes/%d",
			 SPACES_DIR, name, dir_members[i]);

		log_debug("clear_configfs_space_nodes rmdir \"%s\"", path);

		rv = rmdir(path);
		if (rv)
			log_error("%s: rmdir failed: %d", path, errno);
	}
}

/* clear out everything under config/dlm/cluster/spaces/ */

static void clear_configfs_spaces(void)
{
	char path[PATH_MAX];
	DIR *d;
	struct dirent *de;
	int rv;

	memset(path, 0, PATH_MAX);
	snprintf(path, PATH_MAX, "%s", SPACES_DIR);

	d = opendir(path);
	if (!d) {
		log_debug("%s: opendir failed: %d", path, errno);
		return;
	}

	while ((de = readdir(d))) {
		if (de->d_name[0] == '.')
			continue;

		clear_configfs_space_nodes(de->d_name);

		memset(path, 0, PATH_MAX);
		snprintf(path, PATH_MAX, "%s/%s", SPACES_DIR, de->d_name);

		log_debug("clear_configfs_spaces rmdir \"%s\"", path);

		rv = rmdir(path);
		if (rv)
			log_error("%s: rmdir failed: %d", path, errno);
	}
	closedir(d);
}

static int add_configfs_base(void)
{
	int rv = 0;

	if (!path_exists("/sys/kernel/config")) {
		log_error("No /sys/kernel/config, is configfs loaded?");
		return -1;
	}

	if (!path_exists("/sys/kernel/config/dlm")) {
		log_error("No /sys/kernel/config/dlm, is the dlm loaded?");
		return -1;
	}

	if (!path_exists("/sys/kernel/config/dlm/cluster"))
		rv = create_path("/sys/kernel/config/dlm/cluster");

	return rv;
}
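/* For reference, the configfs layout managed above and below
   (illustrative summary of the paths used in this file):

	/sys/kernel/config/dlm/cluster/
		comms/<nodeid>/{nodeid,addr,mark,local}
		spaces/<lockspace>/nodes/<nodeid>/{nodeid,weight}
 */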
log_error("%s: open failed: %d", path, errno); return -1; } rv = do_write(fd, padded_addr, sizeof(struct sockaddr_storage)); if (rv < 0) { log_error("%s: write failed: %d %d", path, errno, rv); close(fd); return -1; } close(fd); /* * set skb mark for nodeid * * If open() fails we skip it because kernel doesn't support it. * It's not a required confiuration. It will show up in the log. */ memset(path, 0, PATH_MAX); snprintf(path, PATH_MAX, "%s/%d/mark", COMMS_DIR, nodeid); fd = open(path, O_WRONLY); if (fd < 0) { log_error("%s: open failed: %d", path, errno); goto skip_non_required; } memset(buf, 0, sizeof(buf)); snprintf(buf, 32, "%" PRIu32, mark); rv = do_write(fd, buf, strlen(buf)); if (rv < 0) { log_error("%s: write failed: %d, %s", path, errno, buf); close(fd); return -1; } close(fd); skip_non_required: /* * set local */ if (!local) goto out; memset(path, 0, PATH_MAX); snprintf(path, PATH_MAX, "%s/%d/local", COMMS_DIR, nodeid); fd = open(path, O_WRONLY); if (fd < 0) { log_error("%s: open failed: %d", path, errno); return -1; } rv = do_write(fd, (void *)"1", strlen("1")); if (rv < 0) { log_error("%s: write failed: %d", path, errno); close(fd); return -1; } close(fd); out: return 0; } void del_configfs_node(int nodeid) { char path[PATH_MAX]; int rv; memset(path, 0, PATH_MAX); snprintf(path, PATH_MAX, "%s/%d", COMMS_DIR, nodeid); log_debug("del_configfs_node rmdir \"%s\"", path); rv = rmdir(path); if (rv) log_error("%s: rmdir failed: %d", path, errno); } /* num may be 0, str won't be NULL */ static int set_configfs_cluster(const char *name, char *str, int num) { char path[PATH_MAX]; char buf[32]; char *wbuf; int fd, rv; memset(path, 0, PATH_MAX); snprintf(path, PATH_MAX, "%s/%s", CLUSTER_DIR, name); fd = open(path, O_WRONLY); if (fd < 0) { log_error("%s: open failed: %d", path, errno); return fd; } if (str) { wbuf = str; } else { memset(buf, 0, sizeof(buf)); snprintf(buf, 32, "%d", num); wbuf = buf; } rv = do_write(fd, wbuf, strlen(wbuf)); if (rv < 0) { log_error("%s: write failed: %d", path, errno); return rv; } close(fd); log_debug("set %s %s", name, wbuf); return 0; } int set_configfs_opt(const char *name, char *str, int num) { return set_configfs_cluster(name, str, num); } #define NET_RMEM_DEFAULT 4194304 #define NET_RMEM_MAX 4194304 static int set_proc_rmem(void) { char path[PATH_MAX]; char buf[32]; int fd, rv; memset(path, 0, PATH_MAX); snprintf(path, PATH_MAX, "/proc/sys/net/core/rmem_default"); fd = open(path, O_RDWR); if (fd < 0) { log_error("%s: open failed: %d", path, errno); return fd; } memset(buf, 0, sizeof(buf)); rv = read(fd, buf, sizeof(buf)); if (rv < 0) { log_error("%s: read failed: %d", path, errno); close(fd); return rv; } if (atoi(buf) >= NET_RMEM_DEFAULT) { close(fd); goto next; } memset(buf, 0, sizeof(buf)); snprintf(buf, 32, "%d", NET_RMEM_DEFAULT); rv = do_write(fd, buf, strlen(buf)); if (rv < 0) { log_error("%s: write failed: %d", path, errno); close(fd); return rv; } close(fd); log_debug("set %s %s", path, buf); next: memset(path, 0, PATH_MAX); snprintf(path, PATH_MAX, "/proc/sys/net/core/rmem_max"); fd = open(path, O_RDWR); if (fd < 0) { log_error("%s: open failed: %d", path, errno); return fd; } memset(buf, 0, sizeof(buf)); rv = read(fd, buf, sizeof(buf)); if (rv < 0) { log_error("%s: read failed: %d", path, errno); close(fd); return rv; } if (atoi(buf) >= NET_RMEM_MAX) { close(fd); goto out; } memset(buf, 0, sizeof(buf)); snprintf(buf, 32, "%d", NET_RMEM_MAX); rv = do_write(fd, buf, strlen(buf)); if (rv < 0) { log_error("%s: write failed: %d", path, errno); 
void clear_configfs(void)
{
	clear_configfs_comms();
	clear_configfs_spaces();
	rmdir("/sys/kernel/config/dlm/cluster");
}

int setup_configfs_options(void)
{
	char *proto_name;
	int rv, proto_num;

	clear_configfs();

	rv = add_configfs_base();
	if (rv < 0)
		return rv;

	/* the kernel has its own defaults for these values which we don't
	   want to change unless these have been set explicitly on cli or
	   config file */

	if (dlm_options[log_debug_ind].cli_set ||
	    dlm_options[log_debug_ind].file_set)
		set_configfs_cluster("log_debug", NULL, opt(log_debug_ind));

	if (dlm_options[port_ind].cli_set ||
	    dlm_options[port_ind].file_set)
		set_configfs_cluster("tcp_port", NULL, optu(port_ind));

	set_configfs_cluster("mark", NULL, optu(mark_ind));

	proto_name = opts(protocol_ind);
	proto_num = -1;

	if (!strcasecmp(proto_name, "detect") || !strcmp(proto_name, "2"))
		proto_num = detect_protocol(); /* may be -1 */
	else if (!strcasecmp(proto_name, "tcp") || !strcmp(proto_name, "0"))
		proto_num = PROTO_TCP;
	else if (!strcasecmp(proto_name, "sctp") || !strcmp(proto_name, "1"))
		proto_num = PROTO_SCTP;

	if (proto_num == PROTO_TCP || proto_num == PROTO_SCTP)
		set_configfs_cluster("protocol", NULL, proto_num);

	if (proto_num == PROTO_SCTP)
		set_proc_rmem();

	/*
	 * set clustername, recover_callbacks
	 *
	 * we can't set our nodeid here, though, it must be set *after*
	 * setup_monitor, because the kernel assumes if the nodeid
	 * is set, but monitor is not opened, that it's an old,
	 * pre-monitor version of dlm_controld and allows it to
	 * go ahead without the monitor being open
	 */

	if (opt(enable_fscontrol_ind)) {
		/* deprecated */
		set_configfs_cluster("recover_callbacks", NULL, 0);
	} else {
		set_configfs_cluster("recover_callbacks", NULL, 1);

		detect_cluster_name();

		if (cluster_name[0]) {
			set_configfs_cluster("cluster_name", cluster_name, 0);
		} else {
			log_error("no cluster name");
			return -1;
		}
	}
	return 0;
}

/* see comment above re why setup_monitor needs to come between
   setup_configfs_options and setup_configfs_members */

int setup_configfs_members(void)
{
	/* add configfs entries for existing nodes */
	update_cluster();
	return 0;
}
static void find_minors(void)
{
	FILE *fl;
	char name[256];
	uint32_t number;
	int found = 0;
	int c;

	control_minor = 0;
	monitor_minor = 0;
	plock_minor = 0;

	if (!(fl = fopen("/proc/misc", "r"))) {
		log_error("/proc/misc fopen failed: %s", strerror(errno));
		return;
	}

	while (!feof(fl)) {
		if (fscanf(fl, "%u %255s\n", &number, &name[0]) == 2) {
			if (!strcmp(name, "dlm-control")) {
				control_minor = number;
				found++;
			} else if (!strcmp(name, "dlm-monitor")) {
				monitor_minor = number;
				found++;
			} else if (!strcmp(name, "dlm_plock")) {
				plock_minor = number;
				found++;
			}
		} else do {
			c = fgetc(fl);
		} while (c != EOF && c != '\n');

		if (found == 3)
			break;
	}
	fclose(fl);

	if (!found)
		log_error("Is dlm missing from kernel? No misc devices found.");
}

static int find_udev_device(const char *path, uint32_t minor)
{
	struct stat st;
	int i;

	for (i = 0; i < 10; i++) {
		if (stat(path, &st) == 0 && minor(st.st_rdev) == minor)
			return 0;
		sleep(1);
	}

	log_error("cannot find device %s with minor %d", path, minor);
	return -1;
}

int setup_misc_devices(void)
{
	int rv;

	find_minors();

	if (control_minor) {
		rv = find_udev_device("/dev/misc/dlm-control", control_minor);
		if (rv < 0)
			return rv;
		log_debug("found /dev/misc/dlm-control minor %u",
			  control_minor);
	}

	if (monitor_minor) {
		rv = find_udev_device("/dev/misc/dlm-monitor", monitor_minor);
		if (rv < 0)
			return rv;
		log_debug("found /dev/misc/dlm-monitor minor %u",
			  monitor_minor);
	}

	if (plock_minor) {
		rv = find_udev_device("/dev/misc/dlm_plock", plock_minor);
		if (rv < 0)
			return rv;
		log_debug("found /dev/misc/dlm_plock minor %u",
			  plock_minor);
	}

	return 0;
}

==== dlm-4.3.0/dlm_controld/config.c ====

/*
 * Copyright 2004-2012 Red Hat, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v2 or (at your option) any later version.
 */

#include "dlm_daemon.h"

#if 0
lockspace ls_name [ls_args]
master    ls_name node=nodeid [node_args]
master    ls_name node=nodeid [node_args]
master    ls_name node=nodeid [node_args]

lockspace foo nodir=1
master node=1 weight=2
master node=2 weight=1
#endif

/* The max line length in dlm.conf */
#define MAX_LINE 256

int get_weight(struct lockspace *ls, int nodeid)
{
	int i;

	/* if no masters are defined, everyone defaults to weight 1 */

	if (!ls->master_count)
		return 1;

	for (i = 0; i < ls->master_count; i++) {
		if (ls->master_nodeid[i] == nodeid)
			return ls->master_weight[i];
	}

	/* if masters are defined, non-masters default to weight 0 */
	return 0;
}

static void read_master_config(struct lockspace *ls, FILE *file)
{
	char line[MAX_LINE];
	char name[MAX_LINE];
	char args[MAX_LINE];
	char *k;
	int nodeid, weight, i;

	while (fgets(line, MAX_LINE, file)) {
		if (line[0] == '\n')
			break;
		if (line[0] == ' ')
			break;
		if (line[0] == '#')
			continue;

		if (strncmp(line, "master", strlen("master")))
			break;

		memset(name, 0, sizeof(name));
		memset(args, 0, sizeof(args));
		nodeid = 0;
		weight = 1;

		sscanf(line, "master %s %[^\n]s", name, args);

		if (strcmp(name, ls->name))
			break;

		k = strstr(args, "node=");
		if (!k)
			break;
		sscanf(k, "node=%d", &nodeid);
		if (!nodeid)
			break;

		k = strstr(args, "weight=");
		if (k)
			sscanf(k, "weight=%d", &weight);

		log_debug("config lockspace %s nodeid %d weight %d",
			  ls->name, nodeid, weight);

		i = ls->master_count++;
		ls->master_nodeid[i] = nodeid;
		ls->master_weight[i] = weight;

		if (ls->master_count >= MAX_NODES)
			break;
	}
}

void setup_lockspace_config(struct lockspace *ls)
{
	FILE *file;
	char line[MAX_LINE];
	char name[MAX_LINE];
	char args[MAX_LINE];
	char *k;
	int val;

	if (!path_exists(CONF_FILE_PATH))
		return;

	file = fopen(CONF_FILE_PATH, "r");
	if (!file)
		return;

	while (fgets(line, MAX_LINE, file)) {
		if (line[0] == '#')
			continue;
		if (line[0] == '\n')
			continue;

		if (strncmp(line, "lockspace", strlen("lockspace")))
			continue;

		memset(name, 0, sizeof(name));
		memset(args, 0, sizeof(args));
		val = 0;

		sscanf(line, "lockspace %s %[^\n]s", name, args);

		if (strcmp(name, ls->name))
			continue;

		k = strstr(args, "nodir=");
		if (k) {
			sscanf(k, "nodir=%d", &val);
			ls->nodir = val;
		}

		read_master_config(ls, file);
	}

	fclose(file);
}
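/* Worked example (illustrative): with a dlm.conf containing

	lockspace foo nodir=1
	master foo node=1 weight=2
	master foo node=2 weight=1

   setup_lockspace_config() sets ls->nodir = 1 and records two masters, so
   get_weight(ls, 1) returns 2, get_weight(ls, 2) returns 1, and any other
   nodeid returns 0 because masters are defined.  With no master lines at
   all, every nodeid gets the default weight 1. */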
static void get_val_int(char *line, int *val_out)
{
	char key[MAX_LINE];
	char val[MAX_LINE];
	int rv;

	rv = sscanf(line, "%[^=]=%s", key, val);
	if (rv != 2) {
		log_error("Failed to parse config line %s", line);
		return;
	}

	*val_out = atoi(val);
}

static void get_val_uint(char *line, unsigned int *val_out)
{
	char key[MAX_LINE];
	char val[MAX_LINE];
	int rv;

	rv = sscanf(line, "%[^=]=%s", key, val);
	if (rv != 2) {
		log_error("Failed to parse config line %s", line);
		return;
	}

	*val_out = strtoul(val, NULL, 0);
}

static void get_val_str(char *line, char *val_out)
{
	char key[MAX_LINE];
	char val[MAX_LINE];
	int rv;

	rv = sscanf(line, "%[^=]=%s", key, val);
	if (rv != 2) {
		log_error("Failed to parse config line %s", line);
		return;
	}

	strcpy(val_out, val);
}

inline static void reload_setting(int index)
{
	switch (index) {
	case log_debug_ind:
		set_configfs_opt("log_debug", NULL, opt(log_debug_ind));
		break;
	case debug_logfile_ind:
		set_logfile_priority();
		break;
	default:
		break;
	}
}

static void reset_opt_value(int index)
{
	struct dlm_option *o = &dlm_options[index];

	/* config priority: cli, config file, default */
	if (o->cli_set) {
		o->use_int = o->cli_int;
		o->use_uint = o->cli_uint;
		o->use_str = o->cli_str;
	} else if (o->file_set) {
		o->use_int = o->file_int;
		o->use_uint = o->file_uint;
		o->use_str = o->file_str;
	} else {
		o->use_int = o->default_int;
		o->use_uint = o->default_uint;
		o->use_str = (char *)o->default_str;
	}

	/*
	 * A reset value is not treated the same as an unchanged value.
	 *
	 * e.g.
	 * 1. option abc defaults to 0, and dlm.conf also sets abc=0.
	 * 2. abc is then removed from dlm.conf.
	 * 3. This function still calls reload_setting(); the call is not
	 *    skipped just because the effective value did not change.
	 */
	reload_setting(index);
	return;
}
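/* Illustrative only: how the precedence in reset_opt_value() resolves a
   single option.  Suppose a hypothetical integer option "log_debug"
   defaults to 0, dlm.conf sets log_debug=1, and a value was also given on
   the command line:

	cli_set   -> use_int = cli_int		(command line wins)
	file_set  -> use_int = file_int = 1	(only without a cli value)
	otherwise -> use_int = default_int = 0

   A dynamic value applied at runtime via set_opt_online() overrides all
   of these until a "restore" resets it through reset_dynamic(). */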
void set_opt_file(int update)
{
	unsigned int uval = 0;
	struct dlm_option *o;
	FILE *file;
	char line[MAX_LINE];
	char str[MAX_LINE];
	int i, val = 0, ind;
	char scanned_dlm_opt[dlm_options_max];

	if (!path_exists(CONF_FILE_PATH))
		return;

	file = fopen(CONF_FILE_PATH, "r");
	if (!file)
		return;

	/* In update mode there is a little extra bother when an option was
	 * set at some point but has since been removed or commented out */
	memset(scanned_dlm_opt, 0, sizeof(scanned_dlm_opt));
	scanned_dlm_opt[help_ind] = 1;
	scanned_dlm_opt[version_ind] = 1;

	while (fgets(line, MAX_LINE, file)) {
		if (line[0] == '#')
			continue;
		if (line[0] == '\n')
			continue;

		memset(str, 0, sizeof(str));

		for (i = 0; i < MAX_LINE; i++) {
			if (line[i] == ' ')
				break;
			if (line[i] == '=')
				break;
			if (line[i] == '\0')
				break;
			if (line[i] == '\n')
				break;
			if (line[i] == '\t')
				break;
			str[i] = line[i];
		}

		ind = get_ind_name(str);
		if (ind < 0)
			continue;

		o = &dlm_options[ind];
		if (!o->name)
			continue;

		scanned_dlm_opt[ind] = 1;

		/* In update flow, bypass the item which doesn't support
		   reload. */
		if (update && !o->reload)
			continue;

		o->file_set++;

		if (!o->req_arg) {
			/* currently only "help" & "version" are no_arg type,
			   ignore them */
			continue;
		} else if (o->req_arg == req_arg_int) {
			get_val_int(line, &val);
			if (update && (o->file_int == val))
				continue;

			o->file_int = val;

			if (!o->cli_set)
				o->use_int = o->file_int;

			log_debug("config file %s = %d cli_set %d use %d",
				  o->name, o->file_int, o->cli_set, o->use_int);
		} else if (o->req_arg == req_arg_uint) {
			get_val_uint(line, &uval);
			if (update && (o->file_uint == uval))
				continue;

			o->file_uint = uval;

			if (!o->cli_set)
				o->use_uint = o->file_uint;

			log_debug("config file %s = %u cli_set %d use %u",
				  o->name, o->file_uint, o->cli_set, o->use_uint);
		} else if (o->req_arg == req_arg_bool) {
			get_val_int(line, &val);
			val = val ? 1 : 0;
			if (update && (o->file_int == val))
				continue;

			o->file_int = val;

			if (!o->cli_set)
				o->use_int = o->file_int;

			log_debug("config file %s = %d cli_set %d use %d",
				  o->name, o->file_int, o->cli_set, o->use_int);
		} else if (o->req_arg == req_arg_str) {
			memset(str, 0, sizeof(str));
			get_val_str(line, str);
			if (update && !strcmp(o->file_str, str))
				continue;

			if (o->file_str)
				free(o->file_str);
			o->file_str = strdup(str);

			if (!o->cli_set)
				o->use_str = o->file_str;

			log_debug("config file %s = %s cli_set %d use %s",
				  o->name, o->file_str, o->cli_set, o->use_str);
		}

		if (update)
			reload_setting(ind);
	}

	if (update) {
		/* handle commented out options */
		for (i = 0; i < dlm_options_max; i++) {
			if (scanned_dlm_opt[i])
				continue;
			o = &dlm_options[i];
			if (!o->reload || !o->file_set)
				continue;
			/* previously set from the config file, since removed
			   or commented out: fall back to cli/default value */
			o->file_set = 0;
			reset_opt_value(i);
		}
	}

	fclose(file);
}

static void reset_dynamic(int index)
{
	struct dlm_option *o = &dlm_options[index];

	if (!o->reload)
		return;

	o->dynamic_set = 0;
	o->dynamic_int = 0;
	if (o->dynamic_str) {
		free(o->dynamic_str);
		o->dynamic_str = NULL;
	}
	o->dynamic_uint = 0;

	reset_opt_value(index);
	return;
}

/* copy code from exec_command() */
void set_opt_online(char *cmd_str, int cmd_len)
{
	int i, ind, val = 0;
	int av_count = 0;
	int arg_len;
	unsigned int uval = 0;
	struct dlm_option *o;
	char str[MAX_LINE];
	char arg[ONE_ARG_LEN];
	char *av[MAX_AV_COUNT + 1]; /* +1 for NULL */

	if (cmd_len > RUN_COMMAND_LEN)
		return;

	for (i = 0; i < MAX_AV_COUNT + 1; i++)
		av[i] = NULL;

	if (!cmd_str[0])
		return;

	/* this should already be done, but make sure */
	cmd_str[cmd_len - 1] = '\0';

	memset(&arg, 0, sizeof(arg));
	arg_len = 0;
	cmd_len = strlen(cmd_str);

	for (i = 0; i < cmd_len; i++) {
		if (!cmd_str[i])
			break;

		if (av_count == MAX_AV_COUNT)
			break;

		if (cmd_str[i] == '\\') {
			if (i == (cmd_len - 1))
				break;
			i++;

			if (cmd_str[i] == '\\') {
				arg[arg_len++] = cmd_str[i];
				continue;
			}
			if (isspace(cmd_str[i])) {
				arg[arg_len++] = cmd_str[i];
				continue;
			} else {
				break;
			}
		}

		if (isalnum(cmd_str[i]) || ispunct(cmd_str[i])) {
			arg[arg_len++] = cmd_str[i];
		} else if (isspace(cmd_str[i])) {
			if (arg_len)
				av[av_count++] = strdup(arg);

			memset(arg, 0, sizeof(arg));
			arg_len = 0;
		} else {
			break;
		}
	}

	if ((av_count < MAX_AV_COUNT) && arg_len) {
		av[av_count++] = strdup(arg);
	}

	/*
	for (i = 0; i < MAX_AV_COUNT + 1; i++) {
		if (!av[i])
			break;
		syslog(LOG_ERR, "command av[%d] \"%s\"", i, av[i]);
	}
	*/

	if (!av_count)
		return;

	if (!strcmp(av[0], "restore_all")) {
		for (i = 0; i < dlm_options_max; i++)
			reset_dynamic(i);
		return;
	}

	for (i = 0; i < av_count; i++) {
		ind = get_ind_name(av[i]);
		if (ind < 0)
			continue;

		o = &dlm_options[ind];
		if (!o || !o->reload)
			continue;

		get_val_str(av[i], str);
		if (!strcmp(str, "restore")) {
			reset_dynamic(ind);
			continue;
		}

		o->dynamic_set++;

		if (!o->req_arg || o->req_arg == req_arg_int) {
			get_val_int(av[i], &val);

			if (!o->req_arg)
				val = val ? 1 : 0;

			o->dynamic_int = val;

			log_debug("config dynamic %s = %d previous use %d",
				  o->name, o->dynamic_int, o->use_int);

			o->use_int = o->dynamic_int;
		} else if (o->req_arg == req_arg_uint) {
			get_val_uint(av[i], &uval);
			o->dynamic_uint = uval;

			log_debug("config dynamic %s = %u previous use %u",
				  o->name, o->dynamic_uint, o->use_uint);

			o->use_uint = o->dynamic_uint;
		} else if (o->req_arg == req_arg_bool) {
			get_val_int(av[i], &val);
			o->dynamic_int = val ? 1 : 0;

			log_debug("config dynamic %s = %d previous use %d",
				  o->name, o->dynamic_int, o->use_int);

			o->use_int = o->dynamic_int;
		} else if (o->req_arg == req_arg_str) {
			memset(str, 0, sizeof(str));
			get_val_str(av[i], str);

			o->dynamic_str = strdup(str);

			log_debug("config dynamic %s = %s previous use %s",
				  o->name, o->dynamic_str, o->use_str);

			o->use_str = o->dynamic_str;
		}

		reload_setting(ind);
	}
	return;
}

==== dlm-4.3.0/dlm_controld/cpg.c ====

/*
 * Copyright 2004-2012 Red Hat, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v2 or (at your option) any later version.
 */

/*
 * . application in the kernel or userspace asks to join
 *   a lockspace
 *
 * . dlm_new_lockspace() in the kernel sets up basic
 *   lockspace structures, then sends a uevent to
 *   dlm_controld in userspace (KOBJ_ONLINE), asking
 *   dlm_controld to join the corosync group ("cpg")
 *   for the lockspace.  dlm_new_lockspace() blocks
 *   waiting for a reply from dlm_controld (the reply is
 *   a write to the "event_done" sysfs file).
 *   When the successful reply comes back to dlm-kernel
 *   from dlm_controld, dlm-kernel knows it is now a member
 *   of the lockspace membership (represented in userspace
 *   by the corosync cpg), and can do locking with the
 *   other members.  Before sending event_done to the kernel,
 *   dlm_controld tells dlm-kernel who the other lockspace
 *   members are via configfs settings.
 *
 * . When dlm_controld gets a request from dlm-kernel to
 *   join a lockspace, it runs dlm_join_lockspace() which
 *   calls cpg_join() to join the corosync group representing
 *   the members of the lockspace.  dlm_controld will get
 *   callbacks from corosync when membership of this cpg
 *   changes (joins/leaves/failures).  After calling
 *   cpg_join(), dlm_controld waits for the first
 *   corosync membership callback indicating it is now
 *   a member of the cpg.  The callback function for
 *   cpg membership changes is confchg_cb().  Corosync
 *   guarantees that all members of the cpg see the
 *   same sequence of confchg callbacks (e.g. if a number
 *   of nodes are joining/leaving/failing at once).
 *   When the first confchg arrives after cpg_join(),
 *   dlm_controld sets up the current members for dlm-kernel
 *   via configfs, then writes to event_done in sysfs to start
 *   dlm-kernel running.
 *
 * . When a cpg member joins/leaves/fails, dlm_controld
 *   on all current members gets a confchg callback
 *   showing the new members.  dlm_controld then stops
 *   dlm-kernel locking activity for that lockspace by
 *   writing 0 to the "control" sysfs file.
 *   dlm_controld then sends/recvs cpg messages to all
 *   other cpg members to act as a barrier to ensure all
 *   members have stopped locking activity in the kernel
 *   (apply_changes()).  When all have done this,
 *   dlm_controld on all the members then sets up the
 *   new members in the kernel (via configfs) and tells
 *   dlm-kernel to start the lockspace again (start_kernel()).
 *
 * . When dlm-kernel is started after being stopped, it does
 *   lockspace recovery based on changes to the membership.
 *   When recovery is done, normal locking activity resumes.
 *
 * Replacing dlm_controld is a matter of doing the following
 * steps by either manually setting up sysfs and configfs,
 * or having a new daemon to do it:
 *
 * - decide who the lockspace members are
 * - stop dlm-kernel before changing lockspace members (write to sysfs)
 * - wait for previous step on all before making changes
 * - tell dlm-kernel member nodeids/IPs in configfs (write to configfs)
 * - start dlm-kernel (write to sysfs)
 *
 * echo 0/1 into /sys/kernel/dlm/foo/control and /sys/kernel/dlm/foo/event_done
 * echo/mkdir/write values into /sys/kernel/config/dlm/cluster/comms/ and
 * /sys/kernel/config/dlm/cluster/spaces/foo/
 */

#include "dlm_daemon.h"
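/* A minimal sketch of the manual replacement sequence described above, for
   a hypothetical lockspace "foo" and member nodeid 2.  Illustrative only;
   write_str() is a stand-in for open/write/close, and the real daemon does
   this via the helpers in action.c. */
#if 0
	/* stop dlm-kernel before changing lockspace members */
	write_str("/sys/kernel/dlm/foo/control", "0");
	/* ... wait for all nodes to do the same ... */

	/* tell dlm-kernel member nodeids/addresses via configfs */
	mkdir("/sys/kernel/config/dlm/cluster/comms/2", 0777);
	write_str("/sys/kernel/config/dlm/cluster/comms/2/nodeid", "2");
	/* addr takes a padded struct sockaddr_storage blob */
	mkdir("/sys/kernel/config/dlm/cluster/spaces/foo/nodes/2", 0777);
	write_str("/sys/kernel/config/dlm/cluster/spaces/foo/nodes/2/nodeid", "2");

	/* start dlm-kernel again; recovery begins */
	write_str("/sys/kernel/dlm/foo/control", "1");
#endif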
#define log_limit(ls, fmt, args...) ({ \
	static uint32_t __change_nr; \
	if (ls->change_seq > __change_nr) { \
		__change_nr = ls->change_seq; \
		log_group(ls, fmt, ##args); \
	} \
})

/* retries are once a second */
#define log_retry(ls, fmt, args...) ({ \
	if (ls->wait_retry < 60) \
		log_group(ls, fmt, ##args); \
	else if (ls->wait_retry == 60) \
		log_erros(ls, fmt, ##args); \
	else if (!(ls->wait_retry % 3600)) \
		log_erros(ls, fmt, ##args); \
})

/* per lockspace cpg: ls->node_history */

struct node {
	struct list_head list;
	int nodeid;

	uint64_t lockspace_add_time;
	uint64_t lockspace_rem_time;
	uint64_t lockspace_fail_time;
	uint32_t lockspace_add_seq;
	uint32_t lockspace_rem_seq;
	uint32_t lockspace_fail_seq;
	int lockspace_member;
	int lockspace_fail_reason;

	uint32_t last_match_seq;

	uint64_t start_time;
	int check_fs;
	int fs_notified;

	int need_fencing;
	uint32_t fence_queries;	/* for debug */
	uint64_t fail_walltime;
	uint64_t fail_monotime;
};

/* per lockspace confchg: ls->changes */

#define CGST_WAIT_CONDITIONS 1
#define CGST_WAIT_MESSAGES   2

struct change {
	struct list_head list;
	struct list_head members;
	struct list_head removed; /* nodes removed by this change */
	int member_count;
	int joined_count;
	int remove_count;
	int failed_count;
	int state;
	int we_joined;
	uint32_t seq;		/* used as a reference for debugging, and for queries */
	uint32_t combined_seq;	/* for queries */
	uint64_t create_time;
};

/* per lockspace change member: cg->members */

struct member {
	struct list_head list;
	int nodeid;
	int start;	/* 1 if we received a start message for this change */
	int added;	/* 1 if added by this change */
	int failed;	/* 1 if failed in this change */
	int disallowed;
	uint32_t start_flags;
};

struct ls_info {
	uint32_t ls_info_size;
	uint32_t id_info_size;
	uint32_t id_info_count;
	uint32_t started_count;
	int member_count;
	int joined_count;
	int remove_count;
	int failed_count;
};

struct id_info {
	int nodeid;
};

static void ls_info_in(struct ls_info *li)
{
	li->ls_info_size  = le32_to_cpu(li->ls_info_size);
	li->id_info_size  = le32_to_cpu(li->id_info_size);
	li->id_info_count = le32_to_cpu(li->id_info_count);
	li->started_count = le32_to_cpu(li->started_count);
	li->member_count  = le32_to_cpu(li->member_count);
	li->joined_count  = le32_to_cpu(li->joined_count);
	li->remove_count  = le32_to_cpu(li->remove_count);
	li->failed_count  = le32_to_cpu(li->failed_count);
}

static void id_info_in(struct id_info *id)
{
	id->nodeid = le32_to_cpu(id->nodeid);
}

static void ids_in(struct ls_info *li, struct id_info *ids)
{
	struct id_info *id;
	int i;

	id = ids;
	for (i = 0; i < li->id_info_count; i++) {
		id_info_in(id);
		id = (struct id_info *)((char *)id + li->id_info_size);
	}
}

static struct member *find_memb(struct change *cg, int nodeid)
{
	struct member *memb;

	list_for_each_entry(memb, &cg->members, list) {
		if (memb->nodeid == nodeid)
			return memb;
	}
	return NULL;
}

static struct lockspace *find_ls_handle(cpg_handle_t h)
{
	struct lockspace *ls;

	list_for_each_entry(ls, &lockspaces, list) {
		if (ls->cpg_handle == h)
			return ls;
	}
	return NULL;
}

static struct lockspace *find_ls_ci(int ci)
{
	struct lockspace *ls;

	list_for_each_entry(ls, &lockspaces, list) {
		if (ls->cpg_client == ci)
			return ls;
	}
	return NULL;
}

static void free_cg(struct change *cg)
{
	struct member *memb, *safe;

	list_for_each_entry_safe(memb, safe, &cg->members, list) {
		list_del(&memb->list);
		free(memb);
	}
	list_for_each_entry_safe(memb, safe, &cg->removed, list) {
		list_del(&memb->list);
		free(memb);
	}
	free(cg);
}

static void free_ls(struct lockspace *ls)
{
	struct change *cg, *cg_safe;
	struct node *node, *node_safe;
	list_for_each_entry_safe(cg, cg_safe, &ls->changes, list) {
		list_del(&cg->list);
		free_cg(cg);
	}

	if (ls->started_change)
		free_cg(ls->started_change);

	list_for_each_entry_safe(node, node_safe, &ls->node_history, list) {
		list_del(&node->list);
		free(node);
	}

	free(ls);
}

/* Problem scenario:

   nodes A,B,C are in fence domain
   node C has gfs foo mounted
   node C fails
   nodes A,B begin fencing C (slow, not completed)
   node B mounts gfs foo

   We may end up having gfs foo mounted and being used on B before
   C has been fenced.  C could wake up and corrupt the fs.

   So, we need to prevent any new gfs mounts while there are any
   outstanding, incomplete fencing operations.

   We also need to check that the specific failed nodes we know about have
   been fenced (since fenced may not even have been notified that the node
   has failed yet).

   So, check that:
   1. has fenced fenced the node since we saw it fail?
   2. fenced has no outstanding fencing ops

   For 1:
   - node X fails
   - we see node X fail and X has non-zero start_time,
     set need_fencing and record the fail time
   - wait for X to be removed from all dlm cpg's (probably not necessary)
   - check that the fencing time is later than the recorded time above

   Tracking fencing state when there are spurious partitions/merges...

   from a spurious leave/join of node X, a lockspace will see:
   - node X is a lockspace member
   - node X fails, may be waiting for all cpgs to see failure or for
     fencing to complete
   - node X joins the lockspace - we want to process the change as usual,
     but don't want to disrupt the code waiting for the fencing, and we
     want to continue running properly once the remerged node is properly
     reset

   ls->node_history
   when we see a node not in this list, add entry for it with zero
   start_time
   record the time we get a good start message from the node, start_time
   clear start_time if the node leaves
   if node fails with non-zero start_time, set need_fencing
   when a node is fenced, clear start_time and clear need_fencing
   if a node remerges after this, no good start message, no new start_time
   set
   if a node fails with zero start_time, it doesn't need fencing
   if a node remerges before it's been fenced, no good start message, no
   new start_time set */

static struct node *get_node_history(struct lockspace *ls, int nodeid)
{
	struct node *node;

	list_for_each_entry(node, &ls->node_history, list) {
		if (node->nodeid == nodeid)
			return node;
	}
	return NULL;
}

static struct node *get_node_history_create(struct lockspace *ls, int nodeid)
{
	struct node *node;

	node = get_node_history(ls, nodeid);
	if (node)
		return node;

	node = malloc(sizeof(struct node));
	if (!node)
		return NULL;
	memset(node, 0, sizeof(struct node));
	node->nodeid = nodeid;

	list_add_tail(&node->list, &ls->node_history);
	return node;
}

static void node_history_lockspace_add(struct lockspace *ls, int nodeid,
				       struct change *cg, uint64_t now)
{
	struct node *node;

	node = get_node_history_create(ls, nodeid);
	if (!node) {
		log_error("node_history_lockspace_add no nodeid %d", nodeid);
		return;
	}

	node->lockspace_add_time = now;
	node->lockspace_add_seq = cg->seq;
	node->lockspace_member = 1;
}

static void node_history_lockspace_left(struct lockspace *ls, int nodeid,
					struct change *cg, uint64_t now)
{
	struct node *node;

	node = get_node_history(ls, nodeid);
	if (!node) {
		log_error("node_history_lockspace_left no nodeid %d", nodeid);
		return;
	}

	node->start_time = 0;

	node->lockspace_rem_time = now;
	node->lockspace_rem_seq = cg->seq;	/* for queries */
	node->lockspace_member = 0;
}
static void node_history_lockspace_fail(struct lockspace *ls, int nodeid,
					struct change *cg, int reason,
					uint64_t now)
{
	struct node *node;

	node = get_node_history(ls, nodeid);
	if (!node) {
		log_error("node_history_lockspace_fail no nodeid %d", nodeid);
		return;
	}

	if (opt(enable_fencing_ind) && node->start_time) {
		node->need_fencing = 1;
		node->fence_queries = 0;
	}

	if (ls->fs_registered) {
		log_group(ls, "check_fs nodeid %d set", nodeid);
		node->check_fs = 1;
	}

	node->lockspace_rem_time = now;
	node->lockspace_rem_seq = cg->seq;	/* for queries */
	node->lockspace_member = 0;

	node->lockspace_fail_time = now;
	node->lockspace_fail_seq = node->lockspace_rem_seq;
	node->lockspace_fail_reason = reason;	/* for queries */

	node->fail_monotime = now;
	node->fail_walltime = time(NULL);
}

static void node_history_start(struct lockspace *ls, int nodeid)
{
	struct node *node;

	node = get_node_history(ls, nodeid);
	if (!node) {
		log_error("node_history_start no nodeid %d", nodeid);
		return;
	}

	node->start_time = monotime();
}

/* wait for cluster ringid and cpg ringid to be the same so we know our
   information from each service is based on the same node state */

static int check_ringid_done(struct lockspace *ls)
{
	/* If we've received a confchg due to a nodedown, but not the
	   corresponding ringid callback, then we should wait for the
	   ringid callback.  Once we have both conf and ring callbacks,
	   we can compare cpg/quorum ringids.

	   Otherwise, there's a possible problem if we receive a confchg
	   before both ringid callback and quorum callback.  Then we'd
	   get through this function by comparing the old, matching
	   ringids.

	   (We seem to usually get the quorum callback before any cpg
	   callbacks, in which case we wouldn't need cpg_ringid_wait,
	   but that's probably not guaranteed.) */

	if (ls->cpg_ringid_wait) {
		log_group(ls, "check_ringid wait cluster %llu cpg %u:%llu",
			  (unsigned long long)cluster_ringid_seq,
			  ls->cpg_ringid.nodeid,
			  (unsigned long long)ls->cpg_ringid.seq);
		return 0;
	}

	if (cluster_ringid_seq != ls->cpg_ringid.seq) {
		log_group(ls, "check_ringid cluster %llu cpg %u:%llu",
			  (unsigned long long)cluster_ringid_seq,
			  ls->cpg_ringid.nodeid,
			  (unsigned long long)ls->cpg_ringid.seq);
		return 0;
	}

	log_limit(ls, "check_ringid done cluster %llu cpg %u:%llu",
		  (unsigned long long)cluster_ringid_seq,
		  ls->cpg_ringid.nodeid,
		  (unsigned long long)ls->cpg_ringid.seq);
	return 1;
}

static int check_fencing_done(struct lockspace *ls)
{
	struct node *node;
	uint64_t fence_monotime;
	int wait_count = 0;
	int rv, in_progress;

	if (!opt(enable_fencing_ind)) {
		log_group(ls, "check_fencing disabled");
		return 1;
	}

	list_for_each_entry(node, &ls->node_history, list) {
		if (!node->need_fencing)
			continue;

		rv = fence_node_time(node->nodeid, &fence_monotime);
		if (rv < 0) {
			log_error("fenced_node_time error %d", rv);
			continue;
		}

		if (fence_monotime >= node->fail_monotime) {
			log_group(ls, "check_fencing %d done start %llu fail %llu fence %llu",
				  node->nodeid,
				  (unsigned long long)node->start_time,
				  (unsigned long long)node->fail_monotime,
				  (unsigned long long)fence_monotime);
			node->need_fencing = 0;
			node->start_time = 0;
			continue;
		} else {
			if (!node->fence_queries) {
				log_group(ls, "check_fencing %d wait start %llu fail %llu",
					  node->nodeid,
					  (unsigned long long)node->start_time,
					  (unsigned long long)node->fail_monotime);
				node->fence_queries++;
			}
			wait_count++;
			continue;
		}
	}

	if (wait_count) {
		log_limit(ls, "check_fencing wait_count %d", wait_count);
		return 0;
	}

	/* now check if there are any outstanding fencing ops (for nodes
	   we may not have seen in any lockspace), and return 0 if there
	   are any */

	rv = fence_in_progress(&in_progress);
	if (rv < 0) {
log_error("fenced_domain_info error %d", rv); return 0; } if (in_progress) { log_limit(ls, "check_fencing in progress %d", in_progress); return 0; } log_group(ls, "check_fencing done"); return 1; } /* wait for local fs_controld to ack each failed node */ static int check_fs_done(struct lockspace *ls) { struct node *node; int wait_count = 0; /* no corresponding fs for this lockspace */ if (!ls->fs_registered) return 1; list_for_each_entry(node, &ls->node_history, list) { if (!node->check_fs) continue; if (node->fs_notified) { log_group(ls, "check_fs nodeid %d clear", node->nodeid); node->check_fs = 0; node->fs_notified = 0; } else { log_group(ls, "check_fs nodeid %d needs fs notify", node->nodeid); wait_count++; } } if (wait_count) return 0; log_group(ls, "check_fs done"); return 1; } static int member_ids[MAX_NODES]; static int member_count; static int renew_ids[MAX_NODES]; static int renew_count; static void format_member_ids(struct lockspace *ls) { struct change *cg = list_first_entry(&ls->changes, struct change, list); struct member *memb; memset(member_ids, 0, sizeof(member_ids)); member_count = 0; list_for_each_entry(memb, &cg->members, list) member_ids[member_count++] = memb->nodeid; } /* list of nodeids that have left and rejoined since last start_kernel; is any member of startcg in the left list of any other cg's? (if it is, then it presumably must be flagged added in another) */ static void format_renew_ids(struct lockspace *ls) { struct change *cg, *startcg; struct member *memb, *leftmemb; startcg = list_first_entry(&ls->changes, struct change, list); memset(renew_ids, 0, sizeof(renew_ids)); renew_count = 0; list_for_each_entry(memb, &startcg->members, list) { list_for_each_entry(cg, &ls->changes, list) { if (cg == startcg) continue; list_for_each_entry(leftmemb, &cg->removed, list) { if (memb->nodeid == leftmemb->nodeid) { renew_ids[renew_count++] = memb->nodeid; } } } } } static void start_kernel(struct lockspace *ls) { struct change *cg = list_first_entry(&ls->changes, struct change, list); if (!ls->kernel_stopped) { log_error("start_kernel cg %u not stopped", cg->seq); return; } log_group(ls, "start_kernel cg %u member_count %d", cg->seq, cg->member_count); /* needs to happen before setting control which starts recovery */ if (ls->joining) set_sysfs_id(ls->name, ls->global_id); if (ls->nodir) set_sysfs_nodir(ls->name, 1); format_member_ids(ls); format_renew_ids(ls); set_configfs_members(ls, ls->name, member_count, member_ids, renew_count, renew_ids); set_sysfs_control(ls->name, 1); ls->kernel_stopped = 0; if (ls->joining) { set_sysfs_event_done(ls->name, 0); ls->joining = 0; } } void cpg_stop_kernel(struct lockspace *ls) { if (!ls->kernel_stopped) { log_group(ls, "%s", __func__); set_sysfs_control(ls->name, 0); ls->kernel_stopped = 1; } } static void stop_kernel(struct lockspace *ls, uint32_t seq) { log_group(ls, "%s seq %u", __func__, seq); cpg_stop_kernel(ls); } /* the first condition is that the local lockspace is stopped which we don't need to check for because stop_kernel(), which is synchronous, was done when the change was created */ /* the fencing/quorum/fs conditions need to account for all the changes that have occured since the last change applied to dlm-kernel, not just the latest change */ /* we know that the cluster_quorate value here is consistent with the cpg events because the ringid's are in sync per the check_ringid_done */ static int wait_conditions_done(struct lockspace *ls) { if (!check_ringid_done(ls)) { if (ls->wait_debug != DLMC_LS_WAIT_RINGID) { 
static int wait_conditions_done(struct lockspace *ls)
{
	if (!check_ringid_done(ls)) {
		if (ls->wait_debug != DLMC_LS_WAIT_RINGID) {
			ls->wait_debug = DLMC_LS_WAIT_RINGID;
			ls->wait_retry = 0;
		}
		ls->wait_retry++;
		/* the check function logs a message */
		poll_lockspaces++;
		return 0;
	}

	if (opt(enable_quorum_lockspace_ind) && !cluster_quorate) {
		if (ls->wait_debug != DLMC_LS_WAIT_QUORUM) {
			ls->wait_debug = DLMC_LS_WAIT_QUORUM;
			ls->wait_retry = 0;
		}
		ls->wait_retry++;
		log_retry(ls, "wait for quorum");
		poll_lockspaces++;
		return 0;
	}

	if (!check_fencing_done(ls)) {
		if (ls->wait_debug != DLMC_LS_WAIT_FENCING) {
			ls->wait_debug = DLMC_LS_WAIT_FENCING;
			ls->wait_retry = 0;
		}
		ls->wait_retry++;
		log_retry(ls, "wait for fencing");
		poll_lockspaces++;
		return 0;
	}

	if (!check_fs_done(ls)) {
		if (ls->wait_debug != DLMC_LS_WAIT_FSDONE) {
			ls->wait_debug = DLMC_LS_WAIT_FSDONE;
			ls->wait_retry = 0;
		}
		ls->wait_retry++;
		log_retry(ls, "wait for fsdone");
		poll_fs++;
		return 0;
	}

	ls->wait_debug = 0;
	ls->wait_retry = 0;

	return 1;
}

static int wait_messages_done(struct lockspace *ls)
{
	struct change *cg = list_first_entry(&ls->changes, struct change, list);
	struct member *memb;
	int need = 0, total = 0;

	list_for_each_entry(memb, &cg->members, list) {
		if (!memb->start)
			need++;
		total++;
	}

	if (need) {
		log_group(ls, "wait_messages cg %u need %d of %d",
			  cg->seq, need, total);
		ls->wait_debug = need;
		return 0;
	}

	log_group(ls, "wait_messages cg %u got all %d", cg->seq, total);
	ls->wait_debug = 0;
	return 1;
}

static void cleanup_changes(struct lockspace *ls)
{
	struct change *cg = list_first_entry(&ls->changes, struct change, list);
	struct change *safe;

	list_del(&cg->list);
	if (ls->started_change)
		free_cg(ls->started_change);
	ls->started_change = cg;

	ls->started_count++;
	if (!ls->started_count)
		ls->started_count++;

	cg->combined_seq = cg->seq;	/* for queries */

	list_for_each_entry_safe(cg, safe, &ls->changes, list) {
		ls->started_change->combined_seq = cg->seq; /* for queries */
		list_del(&cg->list);
		free_cg(cg);
	}
}

/* There's a stream of confchg and messages.  At one of these messages,
   the low node needs to store plocks and new nodes need to begin saving
   plock messages.  A second message is needed to say that the plocks are
   ready to be read.

   When the last start message is recvd for a change, the low node stores
   plocks and the new nodes begin saving messages.  When the store is
   done, low node sends plocks_stored message.  When new nodes recv this,
   they read the plocks and their saved messages.  plocks_stored message
   should identify a specific change, like start messages do; if it
   doesn't match ls->started_change, then it's ignored.

   If a confchg adding a new node arrives after plocks are stored but
   before plocks_stored msg recvd, then the message is ignored.  The low
   node will send another plocks_stored message for the latest change
   (although it may be able to reuse the ckpt if no plock state has
   changed). */
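/* Timeline sketch (illustrative), with low node L storing plocks and a
   newly added node N:

	1. last start message for the change arrives
	   -> L stores plock state; N begins saving incoming plock messages
	2. L sends plocks_stored, identifying the change like a start
	   message does
	3. N matches it against ls->started_change, reads the stored
	   plocks, then replays its saved messages

   A plocks_stored message that does not match ls->started_change is
   ignored, and L later sends another one for the latest change. */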
*/ static void set_plock_data_node(struct lockspace *ls) { struct change *cg = list_first_entry(&ls->changes, struct change, list); struct member *memb; int low = 0; list_for_each_entry(memb, &cg->members, list) { if (!(memb->start_flags & DLM_MFLG_HAVEPLOCK)) continue; if (!low || memb->nodeid < low) low = memb->nodeid; } log_dlock(ls, "set_plock_data_node from %d to %d", ls->plock_data_node, low); ls->plock_data_node = low; } static struct id_info *get_id_struct(struct id_info *ids, int count, int size, int nodeid) { struct id_info *id = ids; int i; for (i = 0; i < count; i++) { if (id->nodeid == nodeid) return id; id = (struct id_info *)((char *)id + size); } return NULL; } /* do the change details in the message match the details of the given change */ static int match_change(struct lockspace *ls, struct change *cg, struct dlm_header *hd, struct ls_info *li, struct id_info *ids) { struct id_info *id; struct member *memb; struct node *node; uint64_t t; uint32_t seq = hd->msgdata; int i, members_mismatch; /* We can ignore messages if we're not in the list of members. The one known time this will happen is after we've joined the cpg, we can get messages for changes prior to the change in which we're added. */ id = get_id_struct(ids, li->id_info_count, li->id_info_size,our_nodeid); if (!id) { log_group(ls, "match_change %d:%u skip %u we are not in members", hd->nodeid, seq, cg->seq); return 0; } memb = find_memb(cg, hd->nodeid); if (!memb) { log_group(ls, "match_change %d:%u skip %u sender not member", hd->nodeid, seq, cg->seq); return 0; } if (memb->start_flags & DLM_MFLG_NACK) { log_group(ls, "match_change %d:%u skip %u is nacked", hd->nodeid, seq, cg->seq); return 0; } if (memb->start && hd->type == DLM_MSG_START) { log_group(ls, "match_change %d:%u skip %u already start", hd->nodeid, seq, cg->seq); return 0; } /* a node's start can't match a change if the node joined the cluster more recently than the change was created */ node = get_node_history(ls, hd->nodeid); if (!node) { log_group(ls, "match_change %d:%u skip cg %u no node history", hd->nodeid, seq, cg->seq); return 0; } t = cluster_add_time(node->nodeid); if (t > cg->create_time) { log_group(ls, "match_change %d:%u skip cg %u created %llu " "cluster add %llu", hd->nodeid, seq, cg->seq, (unsigned long long)cg->create_time, (unsigned long long)t); /* nacks can apply to older cg's */ if (!(hd->flags & DLM_MFLG_NACK)) { return 0; } else { log_group(ls, "match_change %d:%u unskip cg %u for nack", hd->nodeid, seq, cg->seq); } } if (node->last_match_seq > cg->seq) { log_group(ls, "match_change %d:%u skip cg %u last matched cg %u", hd->nodeid, seq, cg->seq, node->last_match_seq); return 0; } /* verify this is the right change by matching the counts and the nodeids of the current members */ if (li->member_count != cg->member_count || li->joined_count != cg->joined_count || li->remove_count != cg->remove_count || li->failed_count != cg->failed_count) { log_group(ls, "match_change %d:%u skip %u expect counts " "%d %d %d %d", hd->nodeid, seq, cg->seq, cg->member_count, cg->joined_count, cg->remove_count, cg->failed_count); return 0; } members_mismatch = 0; id = ids; for (i = 0; i < li->id_info_count; i++) { memb = find_memb(cg, id->nodeid); if (!memb) { log_group(ls, "match_change %d:%u skip %u no memb %d", hd->nodeid, seq, cg->seq, id->nodeid); members_mismatch = 1; break; } id = (struct id_info *)((char *)id + li->id_info_size); } if (members_mismatch) return 0; /* Not completely sure if this is a valid assertion or not, i.e. 
not sure if we really never want to nack our first and only cg. I have seen one case in which a node incorrectly accepted nacks for cg seq 1 and ls change_seq 1. (It was the secondary effect of another bug.) Or, it's possible that this should apply a little more broadly as: don't nack our most recent cg, i.e. cg->seq == ls->change_seq (1 or otherwise). I'm hoping to find a test case that will exercise this to clarify the situation here, and then update this comment. */ if (cg->seq == 1 && ls->change_seq == 1 && (hd->flags & DLM_MFLG_NACK)) { log_group(ls, "match_change %d:%u skip cg %u for nack", hd->nodeid, seq, cg->seq); return 0; } node->last_match_seq = cg->seq; log_group(ls, "match_change %d:%u matches cg %u", hd->nodeid, seq, cg->seq); return 1; } /* Unfortunately, there's no really simple way to match a message with the specific change that it was sent for. We hope that by passing all the details of the change in the message, we will be able to uniquely match it to the correct change. */ /* A start message will usually be for the first (current) change on our list. In some cases it will be for a non-current change, and we can ignore it: 1. A,B,C get confchg1 adding C 2. C sends start for confchg1 3. A,B,C get confchg2 adding D 4. A,B,C,D recv start from C for confchg1 - ignored 5. C,D send start for confchg2 6. A,B send start for confchg2 7. A,B,C,D recv all start messages for confchg2, and start kernel In step 4, how do the nodes know whether the start message from C is for confchg1 or confchg2? Hopefully by comparing the counts and members. */ static struct change *find_change(struct lockspace *ls, struct dlm_header *hd, struct ls_info *li, struct id_info *ids) { struct change *cg; list_for_each_entry_reverse(cg, &ls->changes, list) { if (!match_change(ls, cg, hd, li, ids)) continue; return cg; } log_group(ls, "find_change %d:%u no match", hd->nodeid, hd->msgdata); return NULL; } static int is_added(struct lockspace *ls, int nodeid) { struct change *cg; struct member *memb; list_for_each_entry(cg, &ls->changes, list) { memb = find_memb(cg, nodeid); if (memb && memb->added) return 1; } return 0; } static void receive_start(struct lockspace *ls, struct dlm_header *hd, int len) { struct change *cg; struct member *memb; struct ls_info *li; struct id_info *ids; uint32_t seq = hd->msgdata; int added; log_group(ls, "receive_start %d:%u len %d", hd->nodeid, seq, len); li = (struct ls_info *)((char *)hd + sizeof(struct dlm_header)); ids = (struct id_info *)((char *)li + sizeof(struct ls_info)); ls_info_in(li); ids_in(li, ids); cg = find_change(ls, hd, li, ids); if (!cg) return; memb = find_memb(cg, hd->nodeid); if (!memb) { /* this should never happen since match_change checks it */ log_error("receive_start no member %d", hd->nodeid); return; } memb->start_flags = hd->flags; added = is_added(ls, hd->nodeid); if (added && li->started_count && ls->started_count) { log_error("receive_start %d:%u add node with started_count %u", hd->nodeid, seq, li->started_count); /* see comment in fence/fenced/cpg.c */ memb->disallowed = 1; return; } if (memb->start_flags & DLM_MFLG_NACK) { log_group(ls, "receive_start %d:%u is NACK", hd->nodeid, seq); return; } node_history_start(ls, hd->nodeid); memb->start = 1; } static void receive_plocks_done(struct lockspace *ls, struct dlm_header *hd, int len) { struct ls_info *li; struct id_info *ids; log_dlock(ls, "receive_plocks_done %d:%u flags %x plocks_data %u need %d save %d", hd->nodeid, hd->msgdata, hd->flags, hd->msgdata2, ls->need_plocks,
ls->save_plocks); if (!ls->need_plocks) return; if (ls->need_plocks && !ls->save_plocks) return; if (!ls->started_change) { /* don't think this should happen */ log_elock(ls, "receive_plocks_done %d:%u no started_change", hd->nodeid, hd->msgdata); return; } li = (struct ls_info *)((char *)hd + sizeof(struct dlm_header)); ids = (struct id_info *)((char *)li + sizeof(struct ls_info)); ls_info_in(li); ids_in(li, ids); if (!match_change(ls, ls->started_change, hd, li, ids)) { /* don't think this should happen */ log_elock(ls, "receive_plocks_done %d:%u no match_change", hd->nodeid, hd->msgdata); /* remove/free anything we've saved from receive_plocks_data messages that weren't for us */ clear_plocks_data(ls); return; } if (ls->recv_plocks_data_count != hd->msgdata2) { log_elock(ls, "receive_plocks_done plocks_data %u recv %u", hd->msgdata2, ls->recv_plocks_data_count); } process_saved_plocks(ls); ls->need_plocks = 0; ls->save_plocks = 0; log_dlock(ls, "receive_plocks_done %d:%u plocks_data_count %u", hd->nodeid, hd->msgdata, ls->recv_plocks_data_count); } static void send_info(struct lockspace *ls, struct change *cg, int type, uint32_t flags, uint32_t msgdata2) { struct dlm_header *hd; struct ls_info *li; struct id_info *id; struct member *memb; char *buf; int len, id_count; id_count = cg->member_count; len = sizeof(struct dlm_header) + sizeof(struct ls_info) + id_count * sizeof(struct id_info); buf = malloc(len); if (!buf) { log_error("send_info len %d no mem", len); return; } memset(buf, 0, len); hd = (struct dlm_header *)buf; li = (struct ls_info *)(buf + sizeof(*hd)); id = (struct id_info *)(buf + sizeof(*hd) + sizeof(*li)); /* fill in header (dlm_send_message handles part of header) */ hd->type = type; hd->msgdata = cg->seq; hd->flags = flags; hd->msgdata2 = msgdata2; if (ls->joining) hd->flags |= DLM_MFLG_JOINING; if (!ls->need_plocks) hd->flags |= DLM_MFLG_HAVEPLOCK; /* fill in ls_info */ li->ls_info_size = cpu_to_le32(sizeof(struct ls_info)); li->id_info_size = cpu_to_le32(sizeof(struct id_info)); li->id_info_count = cpu_to_le32(id_count); li->started_count = cpu_to_le32(ls->started_count); li->member_count = cpu_to_le32(cg->member_count); li->joined_count = cpu_to_le32(cg->joined_count); li->remove_count = cpu_to_le32(cg->remove_count); li->failed_count = cpu_to_le32(cg->failed_count); /* fill in id_info entries */ list_for_each_entry(memb, &cg->members, list) { id->nodeid = cpu_to_le32(memb->nodeid); id++; } dlm_send_message(ls, buf, len); free(buf); } /* fenced used the DUPLICATE_CG flag instead of sending nacks like we do here. I think the nacks didn't work for fenced for some reason, but I don't remember why (possibly because the node blocked doing the fencing hadn't created the cg to nack yet). 
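What a nack accomplishes here: send_nacks() below sends a start message with DLM_MFLG_NACK for any older cg that has the same counts and members as the current one; receive_start records the flag without setting the member's start, so the obsolete cg can never complete and is freed by cleanup_changes once the current change finishes.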
*/ static void send_start(struct lockspace *ls, struct change *cg) { log_group(ls, "send_start %d:%u counts %u %d %d %d %d", our_nodeid, cg->seq, ls->started_count, cg->member_count, cg->joined_count, cg->remove_count, cg->failed_count); send_info(ls, cg, DLM_MSG_START, 0, 0); } static void send_plocks_done(struct lockspace *ls, struct change *cg, uint32_t plocks_data) { log_dlock(ls, "send_plocks_done %d:%u counts %u %d %d %d %d plocks_data %u", our_nodeid, cg->seq, ls->started_count, cg->member_count, cg->joined_count, cg->remove_count, cg->failed_count, plocks_data); send_info(ls, cg, DLM_MSG_PLOCKS_DONE, 0, plocks_data); } static int same_members(struct change *cg1, struct change *cg2) { struct member *memb; list_for_each_entry(memb, &cg1->members, list) { if (!find_memb(cg2, memb->nodeid)) return 0; } return 1; } static void send_nacks(struct lockspace *ls, struct change *startcg) { struct change *cg; list_for_each_entry(cg, &ls->changes, list) { if (cg->seq < startcg->seq && cg->member_count == startcg->member_count && cg->joined_count == startcg->joined_count && cg->remove_count == startcg->remove_count && cg->failed_count == startcg->failed_count && same_members(cg, startcg)) { log_group(ls, "send nack old cg %u new cg %u", cg->seq, startcg->seq); send_info(ls, cg, DLM_MSG_START, DLM_MFLG_NACK, 0); } } } static int nodes_added(struct lockspace *ls) { struct change *cg; list_for_each_entry(cg, &ls->changes, list) { if (cg->joined_count) return 1; } return 0; } static void prepare_plocks(struct lockspace *ls) { struct change *cg = list_first_entry(&ls->changes, struct change, list); uint32_t plocks_data = 0; struct member *memb; if (!opt(enable_plock_ind) || ls->disable_plock) return; log_dlock(ls, "prepare_plocks"); /* if we're the only node in the lockspace, then we are the data_node and we don't need plocks */ if (cg->member_count == 1) { list_for_each_entry(memb, &cg->members, list) { if (memb->nodeid != our_nodeid) { log_elock(ls, "prepare_plocks other member %d", memb->nodeid); } } ls->plock_data_node = our_nodeid; ls->need_plocks = 0; return; } /* the low node that indicated it had plock state in its last start message is the data_node */ set_plock_data_node(ls); /* there is no node with plock state, so there's no syncing to do */ if (!ls->plock_data_node) { ls->need_plocks = 0; ls->save_plocks = 0; return; } /* We save all plock messages received after our own confchg and apply them after we receive the plocks_done message from the data_node. 
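Applying them immediately would not work, since they would modify plock state we have not read yet; queueing them and replaying them in order on top of the state received from the data_node preserves the ordering.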
*/ if (ls->need_plocks) { log_dlock(ls, "save_plocks start"); ls->save_plocks = 1; return; } if (ls->plock_data_node != our_nodeid) return; if (nodes_added(ls)) send_all_plocks_data(ls, cg->seq, &plocks_data); send_plocks_done(ls, cg, plocks_data); } static void apply_changes(struct lockspace *ls) { struct change *cg; if (list_empty(&ls->changes)) return; cg = list_first_entry(&ls->changes, struct change, list); switch (cg->state) { case CGST_WAIT_CONDITIONS: if (wait_conditions_done(ls)) { send_nacks(ls, cg); send_start(ls, cg); cg->state = CGST_WAIT_MESSAGES; } break; case CGST_WAIT_MESSAGES: if (wait_messages_done(ls)) { set_protocol_stateful(); start_kernel(ls); prepare_plocks(ls); cleanup_changes(ls); } break; default: log_error("apply_changes invalid state %d", cg->state); } } void process_lockspace_changes(void) { struct lockspace *ls, *safe; poll_lockspaces = 0; poll_fs = 0; list_for_each_entry_safe(ls, safe, &lockspaces, list) { if (!list_empty(&ls->changes)) apply_changes(ls); } } static int add_change(struct lockspace *ls, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries, struct change **cg_out) { struct change *cg; struct member *memb; int i, error; uint64_t now = monotime(); cg = malloc(sizeof(struct change)); if (!cg) goto fail_nomem; memset(cg, 0, sizeof(struct change)); INIT_LIST_HEAD(&cg->members); INIT_LIST_HEAD(&cg->removed); cg->state = CGST_WAIT_CONDITIONS; cg->create_time = now; cg->seq = ++ls->change_seq; if (!cg->seq) cg->seq = ++ls->change_seq; cg->member_count = member_list_entries; cg->joined_count = joined_list_entries; cg->remove_count = left_list_entries; for (i = 0; i < member_list_entries; i++) { memb = malloc(sizeof(struct member)); if (!memb) goto fail_nomem; memset(memb, 0, sizeof(struct member)); memb->nodeid = member_list[i].nodeid; list_add_tail(&memb->list, &cg->members); } for (i = 0; i < left_list_entries; i++) { memb = malloc(sizeof(struct member)); if (!memb) goto fail_nomem; memset(memb, 0, sizeof(struct member)); memb->nodeid = left_list[i].nodeid; if (left_list[i].reason == CPG_REASON_NODEDOWN || left_list[i].reason == CPG_REASON_PROCDOWN) { memb->failed = 1; cg->failed_count++; } list_add_tail(&memb->list, &cg->removed); if (left_list[i].reason == CPG_REASON_NODEDOWN) ls->cpg_ringid_wait = 1; if (memb->failed) { node_history_lockspace_fail(ls, memb->nodeid, cg, left_list[i].reason, now); } else { node_history_lockspace_left(ls, memb->nodeid, cg, now); } log_group(ls, "add_change cg %u remove nodeid %d reason %s", cg->seq, memb->nodeid, reason_str(left_list[i].reason)); if (left_list[i].reason == CPG_REASON_PROCDOWN) kick_node_from_cluster(memb->nodeid); } for (i = 0; i < joined_list_entries; i++) { memb = find_memb(cg, joined_list[i].nodeid); if (!memb) { log_error("no member %d", joined_list[i].nodeid); error = -ENOENT; goto fail; } memb->added = 1; if (memb->nodeid == our_nodeid) { cg->we_joined = 1; } else { node_history_lockspace_add(ls, memb->nodeid, cg, now); } log_group(ls, "add_change cg %u joined nodeid %d", cg->seq, memb->nodeid); } if (cg->we_joined) { log_group(ls, "add_change cg %u we joined", cg->seq); list_for_each_entry(memb, &cg->members, list) { node_history_lockspace_add(ls, memb->nodeid, cg, now); } } log_group(ls, "add_change cg %u counts member %d joined %d remove %d " "failed %d", cg->seq, cg->member_count, cg->joined_count, cg->remove_count, cg->failed_count); 
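/* changes are kept newest-first: list_add() prepends, so the list_first_entry() calls above and elsewhere always see the most recent change, while find_change() walks the list in reverse to try older changes first */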
list_add(&cg->list, &ls->changes); *cg_out = cg; return 0; fail_nomem: log_error("no memory"); error = -ENOMEM; fail: free_cg(cg); return error; } static int we_left(const struct cpg_address *left_list, size_t left_list_entries) { int i; for (i = 0; i < left_list_entries; i++) { if (left_list[i].nodeid == our_nodeid) return 1; } return 0; } static void confchg_cb(cpg_handle_t handle, const struct cpg_name *group_name, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries) { struct lockspace *ls; struct change *cg; struct member *memb; int rv; log_config(group_name, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries); ls = find_ls_handle(handle); if (!ls) { log_error("confchg_cb no lockspace for cpg %s", group_name->value); return; } if (ls->leaving && we_left(left_list, left_list_entries)) { /* we called cpg_leave(), and this should be the final cpg callback we receive */ log_group(ls, "confchg for our leave"); stop_kernel(ls, 0); set_configfs_members(ls, ls->name, 0, NULL, 0, NULL); set_sysfs_event_done(ls->name, 0); cpg_finalize(ls->cpg_handle); client_dead(ls->cpg_client); purge_plocks(ls, our_nodeid, 1); list_del(&ls->list); free_ls(ls); return; } rv = add_change(ls, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, &cg); if (rv) return; stop_kernel(ls, cg->seq); list_for_each_entry(memb, &cg->removed, list) purge_plocks(ls, memb->nodeid, 0); apply_changes(ls); #if 0 deadlk_confchg(ls, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries); #endif } /* after our join confchg, we want to ignore plock messages (see need_plocks checks below) until the point in time where the ckpt_node saves plock state (final start message received); at this time we want to shift from ignoring plock messages to saving plock messages to apply on top of the plock state that we read. 
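The resulting plock handling states, as checked in deliver_cb and prepare_plocks: need_plocks 1 save_plocks 0: ignore incoming plock messages; need_plocks 1 save_plocks 1: save them for later replay; need_plocks 0: apply plock messages directly.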
*/ static void deliver_cb(cpg_handle_t handle, const struct cpg_name *group_name, uint32_t nodeid, uint32_t pid, void *data, size_t len) { struct lockspace *ls; struct dlm_header *hd; int ignore_plock; int rv; int enable_plock = opt(enable_plock_ind); int plock_ownership = opt(plock_ownership_ind); ls = find_ls_handle(handle); if (!ls) { log_error("deliver_cb no ls for cpg %s", group_name->value); return; } if (len < sizeof(struct dlm_header)) { log_error("deliver_cb short message %zd", len); return; } hd = (struct dlm_header *)data; dlm_header_in(hd); rv = dlm_header_validate(hd, nodeid); if (rv < 0) return; ignore_plock = 0; switch (hd->type) { case DLM_MSG_START: receive_start(ls, hd, len); break; case DLM_MSG_PLOCK: if (ls->disable_plock) break; if (ls->need_plocks && !ls->save_plocks) { ignore_plock = 1; break; } if (enable_plock) receive_plock(ls, hd, len); else log_error("msg %d nodeid %d enable_plock %d", hd->type, nodeid, enable_plock); break; case DLM_MSG_PLOCK_OWN: if (ls->disable_plock) break; if (ls->need_plocks && !ls->save_plocks) { ignore_plock = 1; break; } if (enable_plock && plock_ownership) receive_own(ls, hd, len); else log_error("msg %d nodeid %d enable_plock %d owner %d", hd->type, nodeid, enable_plock, plock_ownership); break; case DLM_MSG_PLOCK_DROP: if (ls->disable_plock) break; if (ls->need_plocks && !ls->save_plocks) { ignore_plock = 1; break; } if (enable_plock && plock_ownership) receive_drop(ls, hd, len); else log_error("msg %d nodeid %d enable_plock %d owner %d", hd->type, nodeid, enable_plock, plock_ownership); break; case DLM_MSG_PLOCK_SYNC_LOCK: case DLM_MSG_PLOCK_SYNC_WAITER: if (ls->disable_plock) break; if (ls->need_plocks && !ls->save_plocks) { ignore_plock = 1; break; } if (enable_plock && plock_ownership) receive_sync(ls, hd, len); else log_error("msg %d nodeid %d enable_plock %d owner %d", hd->type, nodeid, enable_plock, plock_ownership); break; case DLM_MSG_PLOCKS_DATA: if (ls->disable_plock) break; if (enable_plock) receive_plocks_data(ls, hd, len); else log_error("msg %d nodeid %d enable_plock %d", hd->type, nodeid, enable_plock); break; case DLM_MSG_PLOCKS_DONE: if (ls->disable_plock) break; if (enable_plock) receive_plocks_done(ls, hd, len); else log_error("msg %d nodeid %d enable_plock %d", hd->type, nodeid, enable_plock); break; #if 0 case DLM_MSG_DEADLK_CYCLE_START: if (opt(enable_deadlk)) receive_cycle_start(ls, hd, len); else log_error("msg %d nodeid %d enable_deadlk %d", hd->type, nodeid, opt(enable_deadlk)); break; case DLM_MSG_DEADLK_CYCLE_END: if (opt(enable_deadlk)) receive_cycle_end(ls, hd, len); else log_error("msg %d nodeid %d enable_deadlk %d", hd->type, nodeid, opt(enable_deadlk)); break; case DLM_MSG_DEADLK_CHECKPOINT_READY: if (opt(enable_deadlk)) receive_checkpoint_ready(ls, hd, len); else log_error("msg %d nodeid %d enable_deadlk %d", hd->type, nodeid, opt(enable_deadlk)); break; case DLM_MSG_DEADLK_CANCEL_LOCK: if (opt(enable_deadlk)) receive_cancel_lock(ls, hd, len); else log_error("msg %d nodeid %d enable_deadlk %d", hd->type, nodeid, opt(enable_deadlk)); break; #endif default: log_error("unknown msg type %d", hd->type); } if (ignore_plock) log_plock(ls, "msg %s nodeid %d need_plock ignore", msg_name(hd->type), nodeid); apply_changes(ls); } /* save ringid to compare with cman's. also save member_list to double check with cman's member list? 
they should match */ static void totem_cb(cpg_handle_t handle, struct cpg_ring_id ring_id, uint32_t member_list_entries, const uint32_t *member_list) { struct lockspace *ls; char name[128]; ls = find_ls_handle(handle); if (!ls) { log_error("totem_cb no lockspace for handle"); return; } memset(&name, 0, sizeof(name)); sprintf(name, "dlm:ls:%s", ls->name); log_ringid(name, &ring_id, member_list, member_list_entries); ls->cpg_ringid.nodeid = ring_id.nodeid; ls->cpg_ringid.seq = ring_id.seq; ls->cpg_ringid_wait = 0; apply_changes(ls); } static cpg_model_v1_data_t cpg_callbacks = { .cpg_deliver_fn = deliver_cb, .cpg_confchg_fn = confchg_cb, .cpg_totem_confchg_fn = totem_cb, .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, }; static void process_cpg_lockspace(int ci) { struct lockspace *ls; cs_error_t error; ls = find_ls_ci(ci); if (!ls) { log_error("process_lockspace_cpg no lockspace for ci %d", ci); return; } error = cpg_dispatch(ls->cpg_handle, CS_DISPATCH_ALL); if (error != CS_OK && error != CS_ERR_BAD_HANDLE) { log_error("cpg_dispatch error %d", error); return; } } /* received an "online" uevent from dlm-kernel */ int dlm_join_lockspace(struct lockspace *ls) { cs_error_t error; cpg_handle_t h; struct cpg_name name; int i = 0, fd, ci, rv; error = cpg_model_initialize(&h, CPG_MODEL_V1, (cpg_model_data_t *)&cpg_callbacks, NULL); if (error != CS_OK) { log_error("cpg_model_initialize error %d", error); rv = -1; goto fail_free; } cpg_fd_get(h, &fd); ci = client_add(fd, process_cpg_lockspace, NULL); list_add(&ls->list, &lockspaces); ls->cpg_handle = h; ls->cpg_client = ci; ls->cpg_fd = fd; ls->kernel_stopped = 1; ls->need_plocks = 1; ls->joining = 1; memset(&name, 0, sizeof(name)); sprintf(name.value, "dlm:ls:%s", ls->name); name.length = strlen(name.value) + 1; /* TODO: allow global_id to be set in cluster.conf? 
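Until then, the id is derived from the cpg name below with cpgname_to_crc() (see crc.c), so every node computes the same global_id for a given lockspace name without any extra coordination.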
*/ ls->global_id = cpgname_to_crc(name.value, name.length); log_group(ls, "cpg_join %s ...", name.value); retry: error = cpg_join(h, &name); if (error == CS_ERR_TRY_AGAIN) { sleep(1); if (!(++i % 10)) log_error("cpg_join error retrying"); goto retry; } if (error != CS_OK) { log_error("cpg_join error %d", error); cpg_finalize(h); rv = -1; goto fail; } return 0; fail: list_del(&ls->list); client_dead(ci); cpg_finalize(h); fail_free: set_sysfs_event_done(ls->name, rv); free_ls(ls); return rv; } /* received an "offline" uevent from dlm-kernel */ int dlm_leave_lockspace(struct lockspace *ls) { cs_error_t error; struct cpg_name name; int i = 0; ls->leaving = 1; memset(&name, 0, sizeof(name)); sprintf(name.value, "dlm:ls:%s", ls->name); name.length = strlen(name.value) + 1; retry: error = cpg_leave(ls->cpg_handle, &name); if (error == CS_ERR_TRY_AGAIN) { sleep(1); if (!(++i % 10)) log_error("cpg_leave error retrying"); goto retry; } if (error != CS_OK) log_error("cpg_leave error %d", error); return 0; } int set_fs_notified(struct lockspace *ls, int nodeid) { struct node *node; /* this shouldn't happen */ node = get_node_history(ls, nodeid); if (!node) { log_error("set_fs_notified no nodeid %d", nodeid); return -ESRCH; } if (!find_memb(ls->started_change, nodeid)) { log_group(ls, "set_fs_notified %d not in ls", nodeid); return 0; } /* this can happen, we haven't seen a nodedown for this node yet, but we should soon */ if (!node->check_fs) { log_group(ls, "set_fs_notified %d zero check_fs", nodeid); return -EAGAIN; } log_group(ls, "set_fs_notified nodeid %d", nodeid); node->fs_notified = 1; return 0; } int set_lockspace_info(struct lockspace *ls, struct dlmc_lockspace *lockspace) { struct change *cg, *last = NULL; strncpy(lockspace->name, ls->name, DLM_LOCKSPACE_LEN + 1); lockspace->name[DLM_LOCKSPACE_LEN] = '\0'; lockspace->global_id = ls->global_id; if (ls->joining) lockspace->flags |= DLMC_LF_JOINING; if (ls->leaving) lockspace->flags |= DLMC_LF_LEAVING; if (ls->kernel_stopped) lockspace->flags |= DLMC_LF_KERNEL_STOPPED; if (ls->fs_registered) lockspace->flags |= DLMC_LF_FS_REGISTERED; if (ls->need_plocks) lockspace->flags |= DLMC_LF_NEED_PLOCKS; if (ls->save_plocks) lockspace->flags |= DLMC_LF_SAVE_PLOCKS; if (!ls->started_change) goto next; cg = ls->started_change; lockspace->cg_prev.member_count = cg->member_count; lockspace->cg_prev.joined_count = cg->joined_count; lockspace->cg_prev.remove_count = cg->remove_count; lockspace->cg_prev.failed_count = cg->failed_count; lockspace->cg_prev.combined_seq = cg->combined_seq; lockspace->cg_prev.seq = cg->seq; next: if (list_empty(&ls->changes)) goto out; list_for_each_entry(cg, &ls->changes, list) last = cg; cg = list_first_entry(&ls->changes, struct change, list); lockspace->cg_next.member_count = cg->member_count; lockspace->cg_next.joined_count = cg->joined_count; lockspace->cg_next.remove_count = cg->remove_count; lockspace->cg_next.failed_count = cg->failed_count; lockspace->cg_next.combined_seq = last->seq; lockspace->cg_next.seq = cg->seq; lockspace->cg_next.wait_condition = ls->wait_debug; if (cg->state == CGST_WAIT_MESSAGES) lockspace->cg_next.wait_messages = 1; out: return 0; } static int _set_node_info(struct lockspace *ls, struct change *cg, int nodeid, struct dlmc_node *node) { struct member *m = NULL; struct node *n; node->nodeid = nodeid; if (cg) m = find_memb(cg, nodeid); if (!m) goto history; node->flags |= DLMC_NF_MEMBER; if (m->start) node->flags |= DLMC_NF_START; if (m->disallowed) node->flags |= DLMC_NF_DISALLOWED; history: n = 
get_node_history(ls, nodeid); if (!n) goto out; if (n->need_fencing) node->flags |= DLMC_NF_NEED_FENCING; if (n->check_fs) node->flags |= DLMC_NF_CHECK_FS; node->added_seq = n->lockspace_add_seq; node->removed_seq = n->lockspace_rem_seq; node->fail_reason = n->lockspace_fail_reason; node->fail_walltime = n->fail_walltime; node->fail_monotime = n->fail_monotime; out: return 0; } int set_node_info(struct lockspace *ls, int nodeid, struct dlmc_node *node) { struct change *cg; if (!list_empty(&ls->changes)) { cg = list_first_entry(&ls->changes, struct change, list); return _set_node_info(ls, cg, nodeid, node); } return _set_node_info(ls, ls->started_change, nodeid, node); } int set_lockspaces(int *count, struct dlmc_lockspace **lss_out) { struct lockspace *ls; struct dlmc_lockspace *lss, *lsp; int ls_count = 0; list_for_each_entry(ls, &lockspaces, list) ls_count++; lss = malloc(ls_count * sizeof(struct dlmc_lockspace)); if (!lss) return -ENOMEM; memset(lss, 0, ls_count * sizeof(struct dlmc_lockspace)); lsp = lss; list_for_each_entry(ls, &lockspaces, list) { set_lockspace_info(ls, lsp++); } *count = ls_count; *lss_out = lss; return 0; } int set_lockspace_nodes(struct lockspace *ls, int option, int *node_count, struct dlmc_node **nodes_out) { struct change *cg; struct node *n; struct dlmc_node *nodes = NULL, *nodep; struct member *memb; int count = 0; if (option == DLMC_NODES_ALL) { if (!list_empty(&ls->changes)) cg = list_first_entry(&ls->changes, struct change,list); else cg = ls->started_change; list_for_each_entry(n, &ls->node_history, list) count++; } else if (option == DLMC_NODES_MEMBERS) { if (!ls->started_change) goto out; cg = ls->started_change; count = cg->member_count; } else if (option == DLMC_NODES_NEXT) { if (list_empty(&ls->changes)) goto out; cg = list_first_entry(&ls->changes, struct change, list); count = cg->member_count; } else goto out; nodes = malloc(count * sizeof(struct dlmc_node)); if (!nodes) return -ENOMEM; memset(nodes, 0, count * sizeof(struct dlmc_node)); nodep = nodes; if (option == DLMC_NODES_ALL) { list_for_each_entry(n, &ls->node_history, list) _set_node_info(ls, cg, n->nodeid, nodep++); } else { list_for_each_entry(memb, &cg->members, list) _set_node_info(ls, cg, memb->nodeid, nodep++); } out: *node_count = count; *nodes_out = nodes; return 0; } dlm-4.3.0/dlm_controld/crc.c000066400000000000000000000103451461150666200156770ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include "dlm_daemon.h" static const uint32_t crc_32_tab[] = { 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d }; /** * * Copied from: * * gfs2_disk_hash - hash an array of data * @data: the data to be hashed * @len: the length of data to be hashed * * This function must produce the same results as the one in the kernel: * crc32_le(0xFFFFFFFF, data, len) ^ 0xFFFFFFFF * * Take some data and convert it to a 32-bit hash. * * The hash function is a 32-bit CRC of the data. The algorithm uses * the crc_32_tab table above. 
* * This may not be the fastest hash function, but it does a fair bit better * at providing uniform results than the others I've looked at. That's * really important for efficient directories. * * Returns: the hash */ uint32_t cpgname_to_crc(const char *data, int len) { uint32_t hash = 0xFFFFFFFF; for (; len--; data++) hash = crc_32_tab[(hash ^ *data) & 0xFF] ^ (hash >> 8); hash = ~hash; return hash; } dlm-4.3.0/dlm_controld/daemon_cpg.c000066400000000000000000002123021461150666200172210ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include "dlm_daemon.h" /* protocol_version flags */ #define PV_STATEFUL 0x0001 /* retries are once a second */ #define log_retry(cur_count, fmt, args...) ({ \ if (cur_count < 60) \ log_debug(fmt, ##args); \ else if (cur_count == 60) \ log_error(fmt, ##args); \ else if (!(cur_count % 3600)) \ log_error(fmt, ##args); \ }) struct protocol_version { uint16_t major; uint16_t minor; uint16_t patch; uint16_t flags; }; struct protocol { union { struct protocol_version dm_ver; uint16_t daemon_max[4]; }; union { struct protocol_version km_ver; uint16_t kernel_max[4]; }; union { struct protocol_version dr_ver; uint16_t daemon_run[4]; }; union { struct protocol_version kr_ver; uint16_t kernel_run[4]; }; }; /* fence_result flags */ #define FR_FIPU 0x00000001 #define FR_CLEAR_STARTUP 0x00000002 #define FR_CLEAR_FIPU 0x00000004 struct fence_result { uint32_t version; uint32_t flags; uint32_t nodeid; uint32_t result; uint64_t fence_walltime; char unused[1000]; }; struct node_daemon { struct list_head list; int nodeid; int killed; int daemon_member; int left_reason; int recover_setup; int fence_in_progress_unknown; int need_fence_clear; int need_fencing; int delay_fencing; int stateful_merge; int fence_pid; int fence_pid_wait; int fence_result_wait; int fence_actor_done; /* for status/debug */ int fence_actor_last; /* for status/debug */ int fence_actors[MAX_NODES]; int fence_actors_orig[MAX_NODES]; struct protocol proto; struct fence_config fence_config; uint64_t daemon_add_time; uint64_t daemon_rem_time; uint64_t fail_walltime; uint64_t fail_monotime; uint64_t fence_walltime; uint64_t fence_monotime; }; #define REASON_STARTUP_FENCING -1 static cpg_handle_t cpg_handle_daemon; static int cpg_fd_daemon; static struct protocol our_protocol; static struct list_head daemon_nodes; static struct list_head startup_nodes; static struct cpg_address daemon_member[MAX_NODES]; static struct cpg_address daemon_joined[MAX_NODES]; static struct cpg_address daemon_remove[MAX_NODES]; static int daemon_member_count; static int daemon_joined_count; static int daemon_remove_count; static int daemon_ringid_wait; static struct cpg_ring_id daemon_ringid; static int daemon_fence_pid; static uint32_t last_join_seq; static uint32_t send_fipu_seq; static int wait_clear_fipu; static int fence_in_progress_unknown = 1; #define MAX_ZOMBIES 16 static int zombie_pids[MAX_ZOMBIES]; static int zombie_count; static int fence_result_pid; static unsigned int fence_result_try; static int stateful_merge_wait; /* cluster is stuck in waiting for manual intervention */ static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime); static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime); void log_config(const struct 
cpg_name *group_name, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries) { char m_buf[128]; char j_buf[32]; char l_buf[32]; size_t i, len, pos; int ret; memset(m_buf, 0, sizeof(m_buf)); memset(j_buf, 0, sizeof(j_buf)); memset(l_buf, 0, sizeof(l_buf)); len = sizeof(m_buf); pos = 0; for (i = 0; i < member_list_entries; i++) { ret = snprintf(m_buf + pos, len - pos, " %d", member_list[i].nodeid); if (ret >= len - pos) break; pos += ret; } len = sizeof(j_buf); pos = 0; for (i = 0; i < joined_list_entries; i++) { ret = snprintf(j_buf + pos, len - pos, " %d", joined_list[i].nodeid); if (ret >= len - pos) break; pos += ret; } len = sizeof(l_buf); pos = 0; for (i = 0; i < left_list_entries; i++) { ret = snprintf(l_buf + pos, len - pos, " %d", left_list[i].nodeid); if (ret >= len - pos) break; pos += ret; } log_debug("%s conf %zu %zu %zu memb%s join%s left%s", group_name->value, member_list_entries, joined_list_entries, left_list_entries, strlen(m_buf) ? m_buf : " 0", strlen(j_buf) ? j_buf : " 0", strlen(l_buf) ? l_buf : " 0"); } void log_ringid(const char *name, struct cpg_ring_id *ringid, const uint32_t *member_list, size_t member_list_entries) { char m_buf[128]; size_t i, len, pos; int ret; memset(m_buf, 0, sizeof(m_buf)); len = sizeof(m_buf); pos = 0; for (i = 0; i < member_list_entries; i++) { ret = snprintf(m_buf + pos, len - pos, " %u", member_list[i]); if (ret >= len - pos) break; pos += ret; } log_debug("%s ring %u:%llu %zu memb%s", name, ringid->nodeid, (unsigned long long)ringid->seq, member_list_entries, m_buf); } const char *reason_str(int reason) { switch (reason) { case REASON_STARTUP_FENCING: return "startup"; case CPG_REASON_JOIN: return "join"; case CPG_REASON_LEAVE: return "leave"; case CPG_REASON_NODEDOWN: return "nodedown"; case CPG_REASON_NODEUP: return "nodeup"; case CPG_REASON_PROCDOWN: return "procdown"; default: return "unknown"; }; } const char *msg_name(int type) { switch (type) { case DLM_MSG_PROTOCOL: return "protocol"; case DLM_MSG_FENCE_RESULT: return "fence_result"; case DLM_MSG_FENCE_CLEAR: return "fence_clear"; case DLM_MSG_START: return "start"; case DLM_MSG_PLOCK: return "plock"; case DLM_MSG_PLOCK_OWN: return "plock_own"; case DLM_MSG_PLOCK_DROP: return "plock_drop"; case DLM_MSG_PLOCK_SYNC_LOCK: return "plock_sync_lock"; case DLM_MSG_PLOCK_SYNC_WAITER: return "plock_sync_waiter"; case DLM_MSG_PLOCKS_DATA: return "plocks_data"; case DLM_MSG_PLOCKS_DONE: return "plocks_done"; case DLM_MSG_DEADLK_CYCLE_START: return "deadlk_cycle_start"; case DLM_MSG_DEADLK_CYCLE_END: return "deadlk_cycle_end"; case DLM_MSG_DEADLK_CHECKPOINT_READY: return "deadlk_checkpoint_ready"; case DLM_MSG_DEADLK_CANCEL_LOCK: return "deadlk_cancel_lock"; default: return "unknown"; } } static int _send_message(cpg_handle_t h, void *buf, int len, int type) { struct iovec iov; cs_error_t error; int retries = 0; iov.iov_base = buf; iov.iov_len = len; retry: error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1); if (error == CS_ERR_TRY_AGAIN) { retries++; usleep(1000); if (!(retries % 100)) log_error("cpg_mcast_joined retry %d %s", retries, msg_name(type)); goto retry; } if (error != CS_OK) { log_error("cpg_mcast_joined error %d handle %llx %s", error, (unsigned long long)h, msg_name(type)); return -1; } if (retries) log_debug("cpg_mcast_joined retried %d %s", retries, msg_name(type)); return 0; } /* header fields caller needs to set: type, to_nodeid, 
flags, msgdata */ void dlm_send_message(struct lockspace *ls, char *buf, int len) { struct dlm_header *hd = (struct dlm_header *) buf; int type = hd->type; hd->version[0] = cpu_to_le16(our_protocol.daemon_run[0]); hd->version[1] = cpu_to_le16(our_protocol.daemon_run[1]); hd->version[2] = cpu_to_le16(our_protocol.daemon_run[2]); hd->type = cpu_to_le16(hd->type); hd->nodeid = cpu_to_le32(our_nodeid); hd->to_nodeid = cpu_to_le32(hd->to_nodeid); hd->global_id = cpu_to_le32(ls->global_id); hd->flags = cpu_to_le32(hd->flags); hd->msgdata = cpu_to_le32(hd->msgdata); hd->msgdata2 = cpu_to_le32(hd->msgdata2); _send_message(ls->cpg_handle, buf, len, type); } int dlm_send_message_daemon(char *buf, int len) { struct dlm_header *hd = (struct dlm_header *) buf; int type = hd->type; hd->version[0] = cpu_to_le16(our_protocol.daemon_run[0]); hd->version[1] = cpu_to_le16(our_protocol.daemon_run[1]); hd->version[2] = cpu_to_le16(our_protocol.daemon_run[2]); hd->type = cpu_to_le16(hd->type); hd->nodeid = cpu_to_le32(our_nodeid); hd->to_nodeid = cpu_to_le32(hd->to_nodeid); hd->flags = cpu_to_le32(hd->flags); hd->msgdata = cpu_to_le32(hd->msgdata); hd->msgdata2 = cpu_to_le32(hd->msgdata2); return _send_message(cpg_handle_daemon, buf, len, type); } void dlm_header_in(struct dlm_header *hd) { hd->version[0] = le16_to_cpu(hd->version[0]); hd->version[1] = le16_to_cpu(hd->version[1]); hd->version[2] = le16_to_cpu(hd->version[2]); hd->type = le16_to_cpu(hd->type); hd->nodeid = le32_to_cpu(hd->nodeid); hd->to_nodeid = le32_to_cpu(hd->to_nodeid); hd->global_id = le32_to_cpu(hd->global_id); hd->flags = le32_to_cpu(hd->flags); hd->msgdata = le32_to_cpu(hd->msgdata); hd->msgdata2 = le32_to_cpu(hd->msgdata2); } static void run_info_out(struct run_info *info) { info->dest_nodeid = cpu_to_le32(info->dest_nodeid); info->start_nodeid = cpu_to_le32(info->start_nodeid); info->local_pid = cpu_to_le32(info->local_pid); info->local_result = cpu_to_le32(info->local_result); info->need_replies = cpu_to_le32(info->need_replies); info->reply_count = cpu_to_le32(info->reply_count); info->fail_count = cpu_to_le32(info->fail_count); info->flags = cpu_to_le32(info->flags); } static void run_info_in(struct run_info *info) { info->dest_nodeid = le32_to_cpu(info->dest_nodeid); info->start_nodeid = le32_to_cpu(info->start_nodeid); info->local_pid = le32_to_cpu(info->local_pid); info->local_result = le32_to_cpu(info->local_result); info->need_replies = le32_to_cpu(info->need_replies); info->reply_count = le32_to_cpu(info->reply_count); info->fail_count = le32_to_cpu(info->fail_count); info->flags = le32_to_cpu(info->flags); } static void run_request_out(struct run_request *req) { run_info_out(&req->info); } static void run_request_in(struct run_request *req) { run_info_in(&req->info); } static void run_reply_out(struct run_reply *rep) { run_info_out(&rep->info); } static void run_reply_in(struct run_reply *rep) { run_info_in(&rep->info); } int dlm_header_validate(struct dlm_header *hd, int nodeid) { if (hd->version[0] != our_protocol.daemon_run[0] || hd->version[1] != our_protocol.daemon_run[1]) { log_error("reject message from %d version %u.%u.%u vs %u.%u.%u", nodeid, hd->version[0], hd->version[1], hd->version[2], our_protocol.daemon_run[0], our_protocol.daemon_run[1], our_protocol.daemon_run[2]); return -1; } if (hd->nodeid != nodeid) { log_error("bad msg nodeid %d %d", hd->nodeid, nodeid); return -1; } return 0; } static struct node_daemon *get_node_daemon(int nodeid) { struct node_daemon *node; list_for_each_entry(node, &daemon_nodes, 
list) { if (node->nodeid == nodeid) return node; } return NULL; } static int nodes_need_fencing(void) { struct node_daemon *node; list_for_each_entry(node, &daemon_nodes, list) { if (node->need_fencing) return 1; } return 0; } static int nodeid_needs_fencing(int nodeid) { struct node_daemon *node; node = get_node_daemon(nodeid); if (!node) { log_error("nodeid_needs_fencing %d not found", nodeid); return 0; } return node->need_fencing; } static int all_daemon_members_fipu(void) { struct node_daemon *node; list_for_each_entry(node, &daemon_nodes, list) { if (!node->daemon_member) continue; if (!node->fence_in_progress_unknown) return 0; } list_for_each_entry(node, &daemon_nodes, list) { if (!node->daemon_member) continue; node->fence_in_progress_unknown = 0; } return 1; } int fence_node_time(int nodeid, uint64_t *last_fenced) { struct node_daemon *node; node = get_node_daemon(nodeid); if (!node) return -1; *last_fenced = node->fence_monotime; return 0; } int fence_in_progress(int *in_progress) { if (fence_in_progress_unknown) { *in_progress = 1; } else if (!list_empty(&startup_nodes)) { *in_progress = 2; } else if (nodes_need_fencing()) { *in_progress = 3; } else { *in_progress = 0; } return 0; } void add_startup_node(int nodeid) { struct node_daemon *node; node = malloc(sizeof(struct node_daemon)); if (!node) { log_error("add_startup_node no mem"); return; } memset(node, 0, sizeof(struct node_daemon)); node->nodeid = nodeid; list_add_tail(&node->list, &startup_nodes); } static int clear_startup_node(int nodeid, int all) { struct node_daemon *node, *safe; int count = 0; list_for_each_entry_safe(node, safe, &startup_nodes, list) { if (all || node->nodeid == nodeid) { list_del(&node->list); free(node); count++; } } return count; } static struct node_daemon *add_node_daemon(int nodeid) { struct node_daemon *node; struct fence_config *fc; int rv; node = get_node_daemon(nodeid); if (node) return node; node = malloc(sizeof(struct node_daemon)); if (!node) { log_error("add_node_daemon no mem"); return NULL; } memset(node, 0, sizeof(struct node_daemon)); node->nodeid = nodeid; list_add_tail(&node->list, &daemon_nodes); /* TODO: allow the config to be reread */ fc = &node->fence_config; fc->nodeid = nodeid; /* explicit config file setting */ rv = fence_config_init(fc, (unsigned int)nodeid, (char *)CONF_FILE_PATH); if (!rv) goto out; /* no config file setting, so use default */ if (rv == -ENOENT) { fc->dev[0] = &fence_all_device; goto out; } log_error("fence config %d error %d", nodeid, rv); out: return node; } /* A clean daemon member is a node that has joined the daemon cpg from a "clean state", i.e. not a stateful merge. It would not have joined the daemon cpg if it found uncontrolled dlm kernel state (check_uncontrolled_lockspaces). We would not have accepted and saved its protocol in node->proto.daemon if it was a stateful merge. */ static int is_clean_daemon_member(int nodeid) { struct node_daemon *node; node = get_node_daemon(nodeid); if (node && node->daemon_member && node->proto.daemon_max[0]) return 1; return 0; } static int in_daemon_list(int nodeid, struct cpg_address *daemon_list, int count) { int i; for (i = 0; i < count; i++) { if (daemon_list[i].nodeid == nodeid) return 1; } return 0; } /* save in node->fence_actors[] any nodeid present when the node failed which therefore saw it fail, knows it needs fencing, and can request fencing for it if it becomes the low actor. A node added in the same change with the removed node does not qualify.
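For example: members are 1,2,3 and a confchg removes node 3 while adding node 4. The actors recorded for 3 are 1 and 2; node 4 joined in the same change and did not see the failure, so it does not qualify. get_fence_actor() picks the lowest surviving actor, so 1 requests the fencing, and 2 takes over if 1 goes away.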
*/ static int set_fence_actors(struct node_daemon *node, int all_memb) { int i, nodeid, count = 0, low = 0; memset(node->fence_actors, 0, sizeof(node->fence_actors)); memset(node->fence_actors_orig, 0, sizeof(node->fence_actors_orig)); for (i = 0; i < daemon_member_count; i++) { nodeid = daemon_member[i].nodeid; if (!all_memb && in_daemon_list(nodeid, daemon_joined, daemon_joined_count)) continue; node->fence_actors[count++] = nodeid; if (!low || nodeid < low) low = nodeid; } /* keep a copy of the original set so they can be retried if all fail */ memcpy(node->fence_actors_orig, node->fence_actors, sizeof(node->fence_actors)); log_debug("set_fence_actors for %d low %d count %d", node->nodeid, low, count); return low; } static int get_fence_actor(struct node_daemon *node) { int i, low, low_i; retry: low = 0; for (i = 0; i < MAX_NODES; i++) { if (!node->fence_actors[i]) continue; if (!low || node->fence_actors[i] < low) { low = node->fence_actors[i]; low_i = i; } } if (low && !in_daemon_list(low, daemon_member, daemon_member_count)) { log_debug("get_fence_actor for %d low actor %d is gone", node->nodeid, low); node->fence_actors[low_i] = 0; goto retry; } node->fence_actor_last = low; return low; } /* if an actor fails to fence, it will send that result, and others will clear it from the actors, which will cause the next lowest actor to try */ static void clear_fence_actor(int nodeid, int actor) { struct node_daemon *node; int remaining = 0; int i; node = get_node_daemon(nodeid); if (!node) return; for (i = 0; i < MAX_NODES; i++) { if (node->fence_actors[i] == actor) node->fence_actors[i] = 0; else if (node->fence_actors[i]) remaining++; } if (!remaining && opt(repeat_failed_fencing_ind)) { log_debug("clear_fence_actor %d restoring original actors to retry", actor); memcpy(node->fence_actors, node->fence_actors_orig, sizeof(node->fence_actors)); } } static void clear_zombies(void) { int i, rv, result = 0; for (i = 0; i < MAX_ZOMBIES; i++) { if (!zombie_count) break; if (!zombie_pids[i]) continue; rv = fence_result(-1, zombie_pids[i], &result); if (rv == -EAGAIN) continue; log_debug("cleared zombie %d rv %d result %d", zombie_pids[i], rv, result); zombie_pids[i] = 0; zombie_count--; } } static void add_zombie(int pid) { int i; for (i = 0; i < MAX_ZOMBIES; i++) { if (!zombie_pids[i]) { zombie_pids[i] = pid; zombie_count++; return; } } } static void fence_pid_cancel(int nodeid, int pid) { int rv, result = 0; log_debug("fence_pid_cancel nodeid %d pid %d sigkill", nodeid, pid); kill(pid, SIGKILL); usleep(500000); rv = fence_result(nodeid, pid, &result); if (rv == -EAGAIN) add_zombie(pid); log_debug("fence_pid_cancel nodeid %d pid %d rv %d result %d", nodeid, pid, rv, result); } static void kick_stateful_merge_members(void) { struct node_daemon *node; list_for_each_entry(node, &daemon_nodes, list) { if (!node->killed && node->stateful_merge) { log_error("daemon node %d kill stateful merge member", node->nodeid); kick_node_from_cluster(node->nodeid); node->killed = 1; } } } /* * fence_in_progress_unknown (fipu) * * If current daemon members are fencing someone, and a new node * joins, that new node needs to wait for the previous members to * finish any fencing they're doing before it can start a lockspace. * * The previous members may be fencing the last node that was using * the lockspace the new node is going to use, so if it doesn't wait, * it could start using a lockspace with an unfenced user. 
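* * For example, if nodes 1 and 2 are still fencing failed node 3, which was * the last user of some lockspace, a newly joined node 4 must not start that * lockspace until the fence completes, or node 3 could still be active in it.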
* * So, a node starts with fence_in_progress_unknown set to * indicate that other nodes may be fencing someone, and it won't * start any lockspaces until it is clear. * * When using startup_fencing: * * . When all nodes start up together, all have fipu set, * and will go through startup fencing, which will eventually * result in all nodes either being clean daemon members or fenced, * so everyone will clear fipu by seeing that. * * . The more common case is when a new node joins other previously * running nodes. The new node needs to be told that the others * have no outstanding fencing ops before it can clear fipu. * A previous member does send_fence_clear(0) to a new node once * all fencing is complete. The two flags in send_fence_clear are * usually sent together but may sometimes be in separate messages: * send_fence_clear(0, CLEAR_STARTUP) to clear startup_nodes right away * send_fence_clear(0, CLEAR_FIPU) to clear fipu once all fencing is done * * When not using startup_fencing: * * . When all nodes start up together, all have fipu set, and all * will be waiting to receive_fence_clear from a previous node * in order to clear it. The nodes need to detect this situation, * and when they do, they will know that everyone is in startup, * so there can be no pending fencing on a previous node, so all * can clear fipu. To detect this case, when a node starts up * with !startup_fencing, it sends a special send_fence_clear(-ENODATA, FIPU) * message about itself to indicate it has fipu set and needs it cleared. * After sending this, it checks to see if all present nodes have sent * this same message about themselves. If so, then this startup * case has been detected, and all will clear fipu. * * . New nodes that join after this startup initialization will be * handled the same as when startup_fencing is set (above). * * * startup_fencing * --------------- * * case A * all nodes start up, * all have fipu set, * all wait for startup_nodes to be empty, (joined or moved to need_fencing) * all wait for no daemon_nodes to need_fencing, (joined or were fenced) * all clear fipu * * later, * * case B * new node starts, * new node has fipu set, * cur node sees need_fence_clear on new node * cur node sees no pending fencing ops, * cur node send_fence_clear(0) to new node, * new node clears startup_nodes and fipu * * !startup_fencing * ---------------- * * case C * all nodes start up, * all have fipu set, * all send_fence_clear(-ENODATA,FIPU), * all receive_fence_clear(-ENODATA,FIPU) from everyone, * all_daemon_members_fipu() is 1, * all clear fipu * * later same as case B above */ static void daemon_fence_work(void) { struct node_daemon *node, *safe; int gone_count = 0, part_count = 0, merge_count = 0, clean_count = 0; int rv, nodeid, pid, need, low = 0, actor, result; int retry = 0; uint32_t flags; if (!daemon_fence_allow) return; if (daemon_ringid_wait) { /* We've seen a nodedown confchg callback, but not the corresponding ringid callback.
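Like totem_cb does for a lockspace, the daemon totem callback is expected to update daemon_ringid and clear this flag; setting retry below keeps this function polling until then.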
*/ log_retry(retry_fencing, "fence work wait for cpg ringid"); retry = 1; goto out; } if (cluster_ringid_seq != daemon_ringid.seq) { /* wait for ringids to be in sync */ log_retry(retry_fencing, "fence work wait for cluster ringid"); retry = 1; goto out; } if (opt(enable_quorum_fencing_ind) && !cluster_quorate) { /* wait for quorum before doing any fencing, but if there is none, send_fence_clear below can unblock new nodes */ log_retry(retry_fencing, "fence work wait for quorum"); retry = 1; goto out_fipu; } /* * Count different types of nodes * gone: node not a member * part: member we've not received a proto message from * merge: member we received a stateful proto message from * clean: member we received a clean/new proto message from * * A node always views itself as a clean member, not a merge member. */ list_for_each_entry(node, &daemon_nodes, list) { if (!node->daemon_member) { gone_count++; } else { if (!low || node->nodeid < low) low = node->nodeid; if (node->stateful_merge) merge_count++; else if (!node->proto.daemon_max[0]) part_count++; else clean_count++; } } /* * Wait for stateful merged members to be removed before moving * on to fencing. Kill stateful merged members to clear them. * This section is only relevant to non-two-node, even splits. * * With two node splits, they race to fence each other and * whichever fences successfully then kills corosync on the other * (in the case where corosync is still running on the fenced node). * * With an odd split, the partition that maintained quorum will * kill stateful merged nodes when their proto message is received. * * With an even split, e.g. 2/2, we don't want both sets to * be fencing each other right after merge, when both sides * have quorum again and see the other side as statefully merged. * So, delay fencing until the stateful nodes are cleared on one * side (by way of the low nodeid killing stateful merged members). * * When there are 3 or more partitions that merge, none may see * enough clean nodes, so the cluster would be stuck here waiting * for someone to manually reset/restart enough nodes to produce * sufficient clean nodes (>= merged). */ if (!cluster_two_node && merge_count) { log_retry(retry_fencing, "fence work wait to clear merge %d clean %d part %d gone %d", merge_count, clean_count, part_count, gone_count); if ((clean_count >= merge_count) && !part_count && (low == our_nodeid)) kick_stateful_merge_members(); if ((clean_count < merge_count) && !part_count) stateful_merge_wait = 1; retry = 1; goto out; } if (stateful_merge_wait) stateful_merge_wait = 0; /* * startup fencing */ list_for_each_entry_safe(node, safe, &startup_nodes, list) { if (is_clean_daemon_member(node->nodeid)) { log_debug("fence startup %d skip member", node->nodeid); list_del(&node->list); free(node); continue; } if (!opt(enable_startup_fencing_ind)) continue; if (!fence_delay_begin) { log_debug("fence startup %d wait for initial delay", node->nodeid); continue; } if (monotime() - fence_delay_begin < opt(post_join_delay_ind)) { log_debug("fence startup %d delay %d from %llu", node->nodeid, opt(post_join_delay_ind), (unsigned long long)fence_delay_begin); retry = 1; continue; } /* clear this entry and create a daemon_nodes entry with need_fencing and the fence loops below will handle it */ nodeid = node->nodeid; list_del(&node->list); free(node); node = add_node_daemon(nodeid); if (!node) { log_debug("fence startup %d add failed", nodeid); continue; } if (node->need_fencing) { /* don't think this should happen? 
*/ log_error("fence startup %d already set", nodeid); continue; } node->need_fencing = 1; node->delay_fencing = 0; node->fence_monotime = 0; node->fence_walltime = 0; node->fence_actor_last = 0; node->fence_actor_done = 0; node->fence_pid_wait = 0; node->fence_pid = 0; node->fence_result_wait = 0; node->fence_config.pos = 0; node->left_reason = REASON_STARTUP_FENCING; node->fail_monotime = cluster_joined_monotime - 1; node->fail_walltime = cluster_joined_walltime - 1; low = set_fence_actors(node, 1); log_debug("fence startup nodeid %d act %d", node->nodeid, low); } /* * request fencing */ list_for_each_entry(node, &daemon_nodes, list) { if (!node->need_fencing) continue; if (node->fence_pid_wait) continue; if (node->fence_result_wait) { log_debug("fence request %d result_wait", node->nodeid); continue; } if (is_clean_daemon_member(node->nodeid)) { /* * node has rejoined in clean state */ log_debug("fence request %d skip for is_clean_daemon_member", node->nodeid); node->need_fencing = 0; node->delay_fencing = 0; node->fence_walltime = time(NULL); node->fence_monotime = monotime(); node->fence_actor_done = node->nodeid; continue; } if (!opt(enable_concurrent_fencing_ind) && daemon_fence_pid) { /* run one agent at a time in case they need the same switch */ log_retry(retry_fencing, "fence request %d delay for other pid %d", node->nodeid, daemon_fence_pid); node->delay_fencing = 1; retry = 1; continue; } /* use post_join_delay to avoid fencing a node in the short time between it joining the cluster (giving cluster quorum) and joining the daemon cpg, which allows it to bypass fencing */ if (monotime() - fence_delay_begin < opt(post_join_delay_ind)) { log_debug("fence request %d delay %d from %llu", node->nodeid, opt(post_join_delay_ind), (unsigned long long)fence_delay_begin); node->delay_fencing = 1; retry = 1; continue; } node->delay_fencing = 0; /* get_fence_actor picks the low nodeid that existed when node failed and is still around. if the current actor fails, get_fence_actor will not find it in the members list, will clear it, and return the next actor */ actor = get_fence_actor(node); if (!actor) { log_error("fence request %d no actor", node->nodeid); continue; } if (actor != our_nodeid) { log_debug("fence request %d defer to %d", node->nodeid, actor); continue; } log_debug("fence request %d pos %d", node->nodeid, node->fence_config.pos); rv = fence_request(node->nodeid, node->fail_walltime, node->fail_monotime, &node->fence_config, node->left_reason, &pid); if (rv < 0) { send_fence_result(node->nodeid, rv, 0, time(NULL)); node->fence_result_wait = 1; continue; } node->fence_pid_wait = 1; node->fence_pid = pid; daemon_fence_pid = pid; } /* * check outstanding fence requests */ list_for_each_entry(node, &daemon_nodes, list) { if (!node->need_fencing) continue; if (node->delay_fencing) continue; if (node->fence_result_wait) { log_debug("fence wait %d result_wait", node->nodeid); continue; } if (!node->fence_pid_wait) { /* another node is the actor */ log_debug("fence wait %d for done", node->nodeid); continue; } if (!node->fence_pid) { /* shouldn't happen */ log_error("fence wait %d zero pid", node->nodeid); node->fence_pid_wait = 0; continue; } nodeid = node->nodeid; pid = node->fence_pid; if (is_clean_daemon_member(nodeid)) { /* * node has rejoined in clean state so we can * abort outstanding fence op for it. all nodes * will see and do this, so we don't need to send * a fence result. 
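* For illustration: every node observes is_clean_daemon_member() for the * rejoined node and independently clears need_fencing; any local agent pid * is cleaned up via fence_pid_cancel() below, so no fence result message * is exchanged.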
*/ log_debug("fence wait %d pid %d skip for is_clean_daemon_member", nodeid, pid); node->need_fencing = 0; node->delay_fencing = 0; node->fence_walltime = time(NULL); node->fence_monotime = monotime(); node->fence_actor_done = nodeid; node->fence_pid_wait = 0; node->fence_pid = 0; daemon_fence_pid = 0; fence_pid_cancel(nodeid, pid); continue; } retry = 1; rv = fence_result(nodeid, pid, &result); if (rv == -EAGAIN) { /* agent pid is still running */ if (fence_result_pid != pid) { fence_result_try = 0; fence_result_pid = pid; } fence_result_try++; log_retry(fence_result_try, "fence wait %d pid %d running", nodeid, pid); continue; } node->fence_pid_wait = 0; node->fence_pid = 0; daemon_fence_pid = 0; if (rv < 0) { /* shouldn't happen */ log_error("fence wait %d pid %d error %d", nodeid, pid, rv); continue; } log_debug("fence wait %d pid %d result %d", nodeid, pid, result); if (!result) { /* agent exit 0, if there's another agent to run in parallel, set it to run next, otherwise success */ rv = fence_config_next_parallel(&node->fence_config); if (rv < 0) { send_fence_result(nodeid, 0, 0, time(NULL)); node->fence_result_wait = 1; } } else { /* agent exit 1, if there's another agent to run at next priority, set it to run next, otherwise fail */ rv = fence_config_next_priority(&node->fence_config); if (rv < 0) { send_fence_result(nodeid, result, 0, time(NULL)); node->fence_result_wait = 1; } } } /* * clear fence_in_progress_unknown */ out_fipu: if (opt(enable_startup_fencing_ind) && fence_in_progress_unknown && list_empty(&startup_nodes) && !wait_clear_fipu && !nodes_need_fencing()) { /* * case A in comment above * all nodes are starting and have fipu set, they all do * startup fencing, and eventually see unknown nodes become * members or get fenced, so all clear fipu for themselves. */ fence_in_progress_unknown = 0; log_debug("fence_in_progress_unknown 0 startup"); } if (!fence_in_progress_unknown) { /* * case B in comment above * some cur nodes have fipu clear, new nodes have fipu set. * A current node needs to send_fence_clear to the new nodes * once all fencing is done so they clear fipu. */ low = 0; need = 0; list_for_each_entry(node, &daemon_nodes, list) { if (node->need_fencing) need++; if (!node->daemon_member || node->need_fence_clear) continue; if (!low || node->nodeid < low) low = node->nodeid; } list_for_each_entry(node, &daemon_nodes, list) { if (!node->daemon_member || !node->need_fence_clear) continue; if (node->nodeid == our_nodeid) { node->need_fence_clear = 0; continue; } if (low != our_nodeid) continue; flags = 0; if (node->need_fence_clear & FR_CLEAR_STARTUP) { flags |= FR_CLEAR_STARTUP; node->need_fence_clear &= ~FR_CLEAR_STARTUP; } if ((node->need_fence_clear & FR_CLEAR_FIPU) && !need) { flags |= FR_CLEAR_FIPU; node->need_fence_clear &= ~FR_CLEAR_FIPU; } if (!flags) continue; send_fence_clear(node->nodeid, 0, flags, 0); } } if (!opt(enable_startup_fencing_ind) && fence_in_progress_unknown) { /* * case C in comment above * all nodes are starting and have fipu set. All expect a * previous node to send_fence_clear so they can clear fipu. * But there are no previous nodes. They need to detect this * condition. Each node does send_fence_clear(ENODATA,FIPU). * When all have received this from all, condition is * detected and all clear fipu. 
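* A hypothetical three node example: nodes 1, 2 and 3 all start with * !startup_fencing and fipu set; each sends send_fence_clear(-ENODATA,FIPU) * about itself; once node 1 has received that message from 1, 2 and 3 * (including its own), all_daemon_members_fipu() returns 1 there and node 1 * clears fipu; nodes 2 and 3 do the same independently.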
*/ if (all_daemon_members_fipu()) { fence_in_progress_unknown = 0; log_debug("fence_in_progress_unknown 0 all_fipu"); } else if (last_join_seq > send_fipu_seq) { /* the seq numbers keep us from spamming this msg */ send_fence_clear(our_nodeid, -ENODATA, FR_FIPU, 0); log_debug("send_fence_clear %d fipu", our_nodeid); send_fipu_seq = last_join_seq; } } /* * clean up a zombie pid from an agent we killed */ if (zombie_count) clear_zombies(); /* * setting retry_fencing will cause the main daemon poll loop * to timeout in 1 second and call this function again. */ out: if (retry) retry_fencing++; else retry_fencing = 0; } void process_fencing_changes(void) { daemon_fence_work(); } static void receive_fence_clear(struct dlm_header *hd, int len) { struct fence_result *fr; struct node_daemon *node; int count; fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header)); fr->flags = le32_to_cpu(fr->flags); fr->nodeid = le32_to_cpu(fr->nodeid); fr->result = le32_to_cpu(fr->result); fr->fence_walltime = le64_to_cpu(fr->fence_walltime); if (len < sizeof(struct dlm_header) + sizeof(struct fence_result)) { log_error("receive_fence_clear invalid len %d from %d", len, hd->nodeid); return; } node = get_node_daemon(fr->nodeid); if (!node) { log_error("receive_fence_clear from %d no daemon node %d", hd->nodeid, fr->nodeid); return; } log_debug("receive_fence_clear from %d for %d result %d flags %x", hd->nodeid, fr->nodeid, fr->result, fr->flags); /* * A node sends this message about itself indicating that it's in * startup with fipu set. The only time we care about node->fipu * is when all nodes are fipu in startup. node->need_fence_clear * and node->fipu are not related; they address different cases. */ if ((fr->result == -ENODATA) && (fr->flags & FR_FIPU)) { if (!fence_in_progress_unknown) return; node->fence_in_progress_unknown = 1; return; } /* * A previous member sends this to new members to tell them that * they can clear startup_nodes and clear fipu. These two flags * may come in separate messages if there is a pending fencing op * when the new member joins (CLEAR_STARTUP will come right away, * but CLEAR_FIPU will come once the fencing op is done.) * * We need wait_clear_fipu after emptying startup_nodes to avoid * thinking we've finished startup fencing in case A above, and * clearing fipu ourselves.
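* For example (hypothetical): node 4 joins while node 2 is still being * fenced; a current member sends CLEAR_STARTUP to node 4 right away so it * empties startup_nodes, then sends CLEAR_FIPU in a later message once the * fence op for node 2 completes; wait_clear_fipu keeps node 4 from * clearing fipu on its own in between.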
*/ if (!fr->result && (node->nodeid == our_nodeid)) { if ((fr->flags & FR_CLEAR_STARTUP) && !list_empty(&startup_nodes)) { count = clear_startup_node(0, 1); log_debug("clear_startup_nodes %d", count); wait_clear_fipu = 1; } if ((fr->flags & FR_CLEAR_FIPU) && fence_in_progress_unknown) { fence_in_progress_unknown = 0; log_debug("fence_in_progress_unknown 0 recv"); wait_clear_fipu = 0; } } /* this node doesn't need these flags any more */ if (!fr->result) { if (fr->flags & FR_CLEAR_STARTUP) node->need_fence_clear &= ~FR_CLEAR_STARTUP; if (fr->flags & FR_CLEAR_FIPU) node->need_fence_clear &= ~FR_CLEAR_FIPU; } } static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime) { struct dlm_header *hd; struct fence_result *fr; char *buf; int len; len = sizeof(struct dlm_header) + sizeof(struct fence_result); buf = malloc(len); if (!buf) { log_error("send_fence_clear no mem %d", len); return; } memset(buf, 0, len); hd = (struct dlm_header *)buf; fr = (struct fence_result *)(buf + sizeof(*hd)); hd->type = cpu_to_le16(DLM_MSG_FENCE_CLEAR); hd->nodeid = cpu_to_le32(our_nodeid); fr->flags = cpu_to_le32(flags); fr->nodeid = cpu_to_le32(nodeid); fr->result = cpu_to_le32(result); fr->fence_walltime = cpu_to_le64(walltime); _send_message(cpg_handle_daemon, buf, len, DLM_MSG_FENCE_CLEAR); } static void receive_fence_result(struct dlm_header *hd, int len) { struct fence_result *fr; struct node_daemon *node; uint64_t now; int count; fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header)); fr->flags = le32_to_cpu(fr->flags); fr->nodeid = le32_to_cpu(fr->nodeid); fr->result = le32_to_cpu(fr->result); fr->fence_walltime = le64_to_cpu(fr->fence_walltime); if (len < sizeof(struct dlm_header) + sizeof(struct fence_result)) { log_error("receive_fence_result invalid len %d from %d", len, hd->nodeid); return; } count = clear_startup_node(fr->nodeid, 0); if (count) { log_debug("receive_fence_result %d from %d clear startup", fr->nodeid, hd->nodeid); return; } node = get_node_daemon(fr->nodeid); if (!node) { log_error("receive_fence_result %d from %d result %d no daemon node", fr->nodeid, hd->nodeid, fr->result); return; } if (!node->need_fencing) { /* should never happen ... will happen if a manual fence_ack is done for a node that doesn't need it */ log_error("receive_fence_result %d from %d result %d no need_fencing", fr->nodeid, hd->nodeid, fr->result); return; } if ((hd->nodeid == our_nodeid) && !node->fence_result_wait && (fr->result != -ECANCELED)) { /* should never happen */ log_error("receive_fence_result %d from %d result %d no fence_result_wait", fr->nodeid, hd->nodeid, fr->result); /* should we ignore and return here? */ } if (node->daemon_member && (!fr->result || (fr->result == -ECANCELED))) { /* * The node was successfully fenced, but is still a member. * This will happen when there is a partition, storage fencing * is started, a merge causes the node to become a member * again, and storage fencing completes successfully. If we * received a proto message from the node after the merge, then * we will have detected a stateful merge, and we may have * already killed it. 
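* (Hypothetical sequence: nodes 1 and 2 partition; node 1 begins storage * fencing of node 2; the partition heals, so node 2 is a daemon member * again by the time the successful fence result arrives; node 1 then * kicks node 2 from the cluster rather than trusting the merge.)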
*/ log_error("receive_fence_result %d from %d result %d node is daemon_member", fr->nodeid, hd->nodeid, fr->result); kick_node_from_cluster(fr->nodeid); } if ((hd->nodeid == our_nodeid) && (fr->result != -ECANCELED)) node->fence_result_wait = 0; now = monotime(); log_error("fence status %d receive %d from %d walltime %llu local %llu", fr->nodeid, fr->result, hd->nodeid, (unsigned long long)fr->fence_walltime, (unsigned long long)now); if (!fr->result || (fr->result == -ECANCELED)) { node->need_fencing = 0; node->delay_fencing = 0; node->fence_walltime = fr->fence_walltime; node->fence_monotime = now; node->fence_actor_done = hd->nodeid; } else { /* causes the next lowest nodeid to request fencing */ clear_fence_actor(fr->nodeid, hd->nodeid); } if ((fr->result == -ECANCELED) && node->fence_pid_wait && node->fence_pid) { fence_pid_cancel(node->nodeid, node->fence_pid); node->fence_pid_wait = 0; node->fence_pid = 0; daemon_fence_pid = 0; } } static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime) { struct dlm_header *hd; struct fence_result *fr; char *buf; int len; len = sizeof(struct dlm_header) + sizeof(struct fence_result); buf = malloc(len); if (!buf) { log_error("send_fence_result no mem %d", len); return; } memset(buf, 0, len); hd = (struct dlm_header *)buf; fr = (struct fence_result *)(buf + sizeof(*hd)); hd->type = cpu_to_le16(DLM_MSG_FENCE_RESULT); hd->nodeid = cpu_to_le32(our_nodeid); fr->flags = cpu_to_le32(flags); fr->nodeid = cpu_to_le32(nodeid); fr->result = cpu_to_le32(result); fr->fence_walltime = cpu_to_le64(walltime); _send_message(cpg_handle_daemon, buf, len, DLM_MSG_FENCE_RESULT); } void fence_ack_node(int nodeid) { send_fence_result(nodeid, -ECANCELED, 0, time(NULL)); } void set_protocol_stateful(void) { our_protocol.dr_ver.flags |= PV_STATEFUL; } static void pv_in(struct protocol_version *pv) { pv->major = le16_to_cpu(pv->major); pv->minor = le16_to_cpu(pv->minor); pv->patch = le16_to_cpu(pv->patch); pv->flags = le16_to_cpu(pv->flags); } static void pv_out(struct protocol_version *pv) { pv->major = cpu_to_le16(pv->major); pv->minor = cpu_to_le16(pv->minor); pv->patch = cpu_to_le16(pv->patch); pv->flags = cpu_to_le16(pv->flags); } static void protocol_in(struct protocol *proto) { pv_in(&proto->dm_ver); pv_in(&proto->km_ver); pv_in(&proto->dr_ver); pv_in(&proto->kr_ver); } static void protocol_out(struct protocol *proto) { pv_out(&proto->dm_ver); pv_out(&proto->km_ver); pv_out(&proto->dr_ver); pv_out(&proto->kr_ver); } /* go through member list saved in last confchg, see if we have received a proto message from each */ static int all_protocol_messages(void) { struct node_daemon *node; int i; if (!daemon_member_count) return 0; for (i = 0; i < daemon_member_count; i++) { node = get_node_daemon(daemon_member[i].nodeid); if (!node) { log_error("all_protocol_messages no node %d", daemon_member[i].nodeid); return 0; } if (!node->proto.daemon_max[0]) return 0; } return 1; } static int pick_min_protocol(struct protocol *proto) { uint16_t mind[4]; uint16_t mink[4]; struct node_daemon *node; int i; memset(&mind, 0, sizeof(mind)); memset(&mink, 0, sizeof(mink)); /* first choose the minimum major */ for (i = 0; i < daemon_member_count; i++) { node = get_node_daemon(daemon_member[i].nodeid); if (!node) { log_error("pick_min_protocol no node %d", daemon_member[i].nodeid); return -1; } if (!mind[0] || node->proto.daemon_max[0] < mind[0]) mind[0] = node->proto.daemon_max[0]; if (!mink[0] || node->proto.kernel_max[0] < mink[0]) mink[0] = 
node->proto.kernel_max[0]; } if (!mind[0] || !mink[0]) { log_error("pick_min_protocol zero major number"); return -1; } /* second pick the minimum minor with the chosen major */ for (i = 0; i < daemon_member_count; i++) { node = get_node_daemon(daemon_member[i].nodeid); if (!node) continue; if (mind[0] == node->proto.daemon_max[0]) { if (!mind[1] || node->proto.daemon_max[1] < mind[1]) mind[1] = node->proto.daemon_max[1]; } if (mink[0] == node->proto.kernel_max[0]) { if (!mink[1] || node->proto.kernel_max[1] < mink[1]) mink[1] = node->proto.kernel_max[1]; } } if (!mind[1] || !mink[1]) { log_error("pick_min_protocol zero minor number"); return -1; } /* third pick the minimum patch with the chosen major.minor */ for (i = 0; i < daemon_member_count; i++) { node = get_node_daemon(daemon_member[i].nodeid); if (!node) continue; if (mind[0] == node->proto.daemon_max[0] && mind[1] == node->proto.daemon_max[1]) { if (!mind[2] || node->proto.daemon_max[2] < mind[2]) mind[2] = node->proto.daemon_max[2]; } if (mink[0] == node->proto.kernel_max[0] && mink[1] == node->proto.kernel_max[1]) { if (!mink[2] || node->proto.kernel_max[2] < mink[2]) mink[2] = node->proto.kernel_max[2]; } } if (!mind[2] || !mink[2]) { log_error("pick_min_protocol zero patch number"); return -1; } memcpy(&proto->daemon_run, &mind, sizeof(mind)); memcpy(&proto->kernel_run, &mink, sizeof(mink)); return 0; } static void receive_protocol(struct dlm_header *hd, int len) { struct protocol *p; struct node_daemon *node; int new = 0; p = (struct protocol *)((char *)hd + sizeof(struct dlm_header)); protocol_in(p); if (len < sizeof(struct dlm_header) + sizeof(struct protocol)) { log_error("receive_protocol invalid len %d from %d", len, hd->nodeid); return; } /* zero is an invalid version value */ if (!p->daemon_max[0] || !p->daemon_max[1] || !p->daemon_max[2] || !p->kernel_max[0] || !p->kernel_max[1] || !p->kernel_max[2]) { log_error("receive_protocol invalid max value from %d " "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, p->daemon_max[0], p->daemon_max[1], p->daemon_max[2], p->kernel_max[0], p->kernel_max[1], p->kernel_max[2]); return; } /* the run values will be zero until a version is set, after which none of the run values can be zero */ if (p->daemon_run[0] && (!p->daemon_run[1] || !p->daemon_run[2] || !p->kernel_run[0] || !p->kernel_run[1] || !p->kernel_run[2])) { log_error("receive_protocol invalid run value from %d " "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, p->daemon_run[0], p->daemon_run[1], p->daemon_run[2], p->kernel_run[0], p->kernel_run[1], p->kernel_run[2]); return; } /* save this node's proto so we can tell when we've got all, and use it to select a minimum protocol from all */ node = get_node_daemon(hd->nodeid); if (!node) { log_error("receive_protocol no node %d", hd->nodeid); return; } if (!node->daemon_member) { log_error("receive_protocol node %d not member", hd->nodeid); return; } log_debug("receive_protocol %d max %u.%u.%u.%x run %u.%u.%u.%x", hd->nodeid, p->daemon_max[0], p->daemon_max[1], p->daemon_max[2], p->daemon_max[3], p->daemon_run[0], p->daemon_run[1], p->daemon_run[2], p->daemon_run[3]); if (memcmp(&node->proto, p, sizeof(struct protocol))) { log_debug("daemon node %d prot max %u.%u.%u.%x run %u.%u.%u.%x", hd->nodeid, node->proto.daemon_max[0], node->proto.daemon_max[1], node->proto.daemon_max[2], node->proto.daemon_max[3], node->proto.daemon_run[0], node->proto.daemon_run[1], node->proto.daemon_run[2], node->proto.daemon_run[3]); new = 1; } /* checking zero node->daemon_max[0] is a way to tell if 
we've received an acceptable (non-stateful) proto message from the node since we saw it join the daemon cpg */ if (hd->nodeid != our_nodeid && !node->proto.daemon_max[0] && (p->dr_ver.flags & PV_STATEFUL) && (our_protocol.dr_ver.flags & PV_STATEFUL)) { log_error("daemon node %d stateful merge", hd->nodeid); log_debug("daemon node %d join %llu left %llu local quorum %llu killed %d", hd->nodeid, (unsigned long long)node->daemon_add_time, (unsigned long long)node->daemon_rem_time, (unsigned long long)cluster_quorate_monotime, node->killed); node->stateful_merge = 1; if (cluster_quorate && node->daemon_rem_time && cluster_quorate_monotime < node->daemon_rem_time) { if (!node->killed) { if (cluster_two_node) { /* * When there are two nodes and two_node mode * is used, both will have quorum throughout * the partition and subsequent stateful merge. * * - both will race to fence each other in * response to the partition * * - both can attempt to kill the cluster * on the other in response to the stateful * merge here * * - we don't want both nodes to kill the cluster * on the other, which can happen if the merge * occurs before power fencing is successful, * or can happen before/during/after storage * fencing * * - if nodeA successfully fences nodeB (due * to the partition), we want nodeA to kill * the cluster on nodeB in response to the * merge (we don't want nodeB to kill nodeA * in response to the merge). * * So, a node that has successfully fenced the * other will kill the cluster on it. If fencing * is still running, we wait until it's * successful to kill the cluster on the node * being fenced. */ if (nodeid_needs_fencing(hd->nodeid)) { /* when fencing completes successfully, we'll see the node is a daemon member and kill it */ log_debug("daemon node %d delay kill for stateful merge", hd->nodeid); } else { log_error("daemon node %d kill due to stateful merge", hd->nodeid); kick_node_from_cluster(hd->nodeid); } } else { log_error("daemon node %d kill due to stateful merge", hd->nodeid); kick_node_from_cluster(hd->nodeid); } } node->killed = 1; } /* don't save this proto into node->proto; we need to come through here based on zero daemon_max[0] for other proto messages like this one from the same node */ return; } if (new) { memcpy(&node->proto, p, sizeof(struct protocol)); log_debug("daemon node %d save max %u.%u.%u.%x run %u.%u.%u.%x", node->nodeid, node->proto.daemon_max[0], node->proto.daemon_max[1], node->proto.daemon_max[2], node->proto.daemon_max[3], node->proto.daemon_run[0], node->proto.daemon_run[1], node->proto.daemon_run[2], node->proto.daemon_run[3]); } /* if we have zero run values, and this msg has non-zero run values, then adopt them as ours; otherwise save this proto message */ if (our_protocol.daemon_run[0]) return; if (p->daemon_run[0]) { our_protocol.daemon_run[0] = p->daemon_run[0]; our_protocol.daemon_run[1] = p->daemon_run[1]; our_protocol.daemon_run[2] = p->daemon_run[2]; our_protocol.kernel_run[0] = p->kernel_run[0]; our_protocol.kernel_run[1] = p->kernel_run[1]; our_protocol.kernel_run[2] = p->kernel_run[2]; log_debug("run protocol from nodeid %d", hd->nodeid); } } static void send_protocol(struct protocol *proto) { struct dlm_header *hd; struct protocol *pr; char *buf; int len; len = sizeof(struct dlm_header) + sizeof(struct protocol); buf = malloc(len); if (!buf) { log_error("send_protocol no mem %d", len); return; } memset(buf, 0, len); hd = (struct dlm_header *)buf; pr = (struct protocol *)(buf + sizeof(*hd)); hd->type = cpu_to_le16(DLM_MSG_PROTOCOL); hd->nodeid =
cpu_to_le32(our_nodeid); memcpy(pr, proto, sizeof(struct protocol)); protocol_out(pr); _send_message(cpg_handle_daemon, buf, len, DLM_MSG_PROTOCOL); free(buf); } int set_protocol(void) { struct protocol proto; struct pollfd pollfd; cs_error_t error; int sent_proposal = 0; int rv; memset(&pollfd, 0, sizeof(pollfd)); pollfd.fd = cpg_fd_daemon; pollfd.events = POLLIN; while (1) { if (our_protocol.daemon_run[0]) break; if (!sent_proposal && all_protocol_messages()) { /* propose a protocol; look through info from all nodes and pick the min for both daemon and kernel, and propose that */ sent_proposal = 1; /* copy our max values */ memcpy(&proto, &our_protocol, sizeof(struct protocol)); rv = pick_min_protocol(&proto); if (rv < 0) return rv; log_debug("set_protocol member_count %d propose " "daemon %u.%u.%u kernel %u.%u.%u", daemon_member_count, proto.daemon_run[0], proto.daemon_run[1], proto.daemon_run[2], proto.kernel_run[0], proto.kernel_run[1], proto.kernel_run[2]); send_protocol(&proto); } /* only process messages/events from daemon cpg until protocol is established */ rv = poll(&pollfd, 1, -1); if (rv == -1 && errno == EINTR) { if (daemon_quit) return -1; continue; } if (rv < 0) { log_error("set_protocol poll errno %d", errno); return -1; } if (pollfd.revents & POLLIN) { /* * don't use process_cpg_daemon() because we only want to * dispatch one thing at a time, and we only want to * handle protocol related things here. */ error = cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ONE); if (error != CS_OK && error != CS_ERR_BAD_HANDLE) log_error("daemon cpg_dispatch one error %d", error); } if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) { log_error("set_protocol poll revents %u", pollfd.revents); return -1; } } if (our_protocol.daemon_run[0] != our_protocol.daemon_max[0] || our_protocol.daemon_run[1] > our_protocol.daemon_max[1]) { log_error("incompatible daemon protocol run %u.%u.%u max %u.%u.%u", our_protocol.daemon_run[0], our_protocol.daemon_run[1], our_protocol.daemon_run[2], our_protocol.daemon_max[0], our_protocol.daemon_max[1], our_protocol.daemon_max[2]); return -1; } if (our_protocol.kernel_run[0] != our_protocol.kernel_max[0] || our_protocol.kernel_run[1] > our_protocol.kernel_max[1]) { log_error("incompatible kernel protocol run %u.%u.%u max %u.%u.%u", our_protocol.kernel_run[0], our_protocol.kernel_run[1], our_protocol.kernel_run[2], our_protocol.kernel_max[0], our_protocol.kernel_max[1], our_protocol.kernel_max[2]); return -1; } log_debug("daemon run %u.%u.%u max %u.%u.%u " "kernel run %u.%u.%u max %u.%u.%u", our_protocol.daemon_run[0], our_protocol.daemon_run[1], our_protocol.daemon_run[2], our_protocol.daemon_max[0], our_protocol.daemon_max[1], our_protocol.daemon_max[2], our_protocol.kernel_run[0], our_protocol.kernel_run[1], our_protocol.kernel_run[2], our_protocol.kernel_max[0], our_protocol.kernel_max[1], our_protocol.kernel_max[2]); send_protocol(&our_protocol); return 0; } static void deliver_cb_daemon(cpg_handle_t handle, const struct cpg_name *group_name, uint32_t nodeid, uint32_t pid, void *data, size_t len) { struct dlm_header *hd; if (len < sizeof(*hd)) { log_error("deliver_cb short message %zd", len); return; } hd = (struct dlm_header *)data; dlm_header_in(hd); if (!daemon_fence_allow && hd->type != DLM_MSG_PROTOCOL) { /* don't think this will happen; if it does we may need to verify that it's correct to ignore these messages instead of saving them to process after allow is set */ log_debug("deliver_cb_daemon ignore non proto msg %d", hd->type); return; }
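/* Dispatch on message type. Each receive_* handler below converts its payload from the little-endian wire format produced by its send_* counterpart and checks the message length before acting on the contents. */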
switch (hd->type) { case DLM_MSG_PROTOCOL: receive_protocol(hd, len); break; case DLM_MSG_FENCE_RESULT: receive_fence_result(hd, len); break; case DLM_MSG_FENCE_CLEAR: receive_fence_clear(hd, len); break; case DLM_MSG_RUN_REQUEST: receive_run_request(hd, len); break; case DLM_MSG_RUN_REPLY: receive_run_reply(hd, len); break; default: log_error("deliver_cb_daemon unknown msg type %d", hd->type); } daemon_fence_work(); } int receive_run_reply(struct dlm_header *hd, int len) { struct run_reply *rep = (struct run_reply *)hd; struct run *run; int i; run_reply_in(rep); log_debug("receive_run_reply %s from %d result %d", rep->uuid, hd->nodeid, rep->info.local_result); if (!opt(enable_helper_ind)) { log_debug("receive_run_reply %s helper not enabled", rep->uuid); return 0; } run = find_run(rep->uuid); if (!run) { log_debug("receive_run_reply no uuid %s", rep->uuid); return 0; } /* * Only the starting node keeps track of results. */ if (run->info.start_nodeid != our_nodeid) return 0; if (len != sizeof(struct run_reply)) { log_debug("receive_run_reply %s bad len %d expect %zu", rep->uuid, len, sizeof(struct run_reply)); run->info.reply_count++; run->info.need_replies--; return 0; } for (i = 0; i < run->node_count; i++) { if (run->node_results[i].nodeid != hd->nodeid) continue; /* shouldn't happen? */ if (run->node_results[i].replied) break; run->node_results[i].result = rep->info.local_result; run->node_results[i].replied = 1; if (rep->info.local_result) run->info.fail_count++; run->info.reply_count++; run->info.need_replies--; /* log_debug("run reply_count % need_replies %d fail_count %d", run->info.reply_count, run->info.need_replies, run->info.fail_count); */ break; } return 0; } int receive_run_request(struct dlm_header *hd, int len) { struct run_request *req = (struct run_request *)hd; struct run *run = NULL; run_request_in(req); log_debug("receive_run_request %s from %d", req->uuid, hd->nodeid); if (len != sizeof(struct run_request)) { log_debug("receive_run_request %s bad len %d", req->uuid, len); /* todo: send reply with failed */ return 0; } if (req->info.dest_nodeid && (req->info.dest_nodeid != our_nodeid)) return 0; if (req->info.start_nodeid == our_nodeid) { if (!(req->info.flags & DLMC_FLAG_RUN_START_NODE_RECV)) { log_debug("receive_run_request ignore self"); return 0; } if (!opt(enable_helper_ind)) { log_debug("receive_run_request %s helper not enabled", req->uuid); return 0; } run = find_run(req->uuid); if (!run) { log_debug("receive_run_request from self no uuid %s", req->uuid); return 0; } log_debug("receive_run_request %s to helper", req->uuid); send_helper_run_request(req); return 0; } if (!opt(enable_helper_ind) && run) { log_debug("receive_run_request %s helper not enabled", req->uuid); run->info.reply_count++; run->info.need_replies--; /* todo: send reply with failed */ return 0; } if (!(run = malloc(sizeof(struct run)))) { log_error("receive_run_request %s no mem", req->uuid); /* todo: send reply with failed */ return 0; } memset(run, 0, sizeof(struct run)); memcpy(run->uuid, req->uuid, RUN_UUID_LEN); memcpy(run->command, req->command, RUN_COMMAND_LEN); run->info.start_nodeid = req->info.start_nodeid; run->info.dest_nodeid = req->info.dest_nodeid; run->info.flags = req->info.flags; list_add(&run->list, &run_ops); log_error("run request %s %.128s", run->uuid, run->command); log_debug("receive_run_request %s to helper", req->uuid); send_helper_run_request(req); /* todo: if no helper, send reply with failed */ return 0; } int send_run_request(struct run *run, struct run_request 
*req) { struct node_daemon *node; int i = 0; int rv; list_for_each_entry(node, &daemon_nodes, list) { if (!node->daemon_member) continue; /* * When this starting node does not run the command, * there is no reply for our nodeid. */ if ((node->nodeid == our_nodeid) && (run->info.flags & DLMC_FLAG_RUN_START_NODE_NONE)) continue; /* * The command is only run on one specific node, and * only a reply from that node is needed. */ if (run->info.dest_nodeid && (node->nodeid != run->info.dest_nodeid)) continue; run->node_count++; run->node_results[i].nodeid = node->nodeid; i++; } run->info.need_replies = run->node_count; log_debug("send_run_request %s for %d nodes", req->uuid, run->node_count); run_request_out(req); rv = dlm_send_message_daemon((char *)req, sizeof(struct run_request)); return rv; } int send_run_reply(struct run *run, struct run_reply *rep) { int rv; log_debug("send_run_reply %s result %d", rep->uuid, rep->info.local_result); run_reply_out(rep); rv = dlm_send_message_daemon((char *)rep, sizeof(struct run_reply)); /* * If we are not the starting node, clear the run operation. */ if (rep->info.start_nodeid != our_nodeid) clear_run(run); return rv; } static void confchg_cb_daemon(cpg_handle_t handle, const struct cpg_name *group_name, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries) { struct node_daemon *node; uint64_t now, now_wall; int nodedown = 0, procdown = 0, leave = 0; int check_joined_count = 0, check_remove_count = 0, check_member_count = 0; int we_joined = 0; int i, reason, low; now = monotime(); now_wall = time(NULL); log_config(group_name, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries); memset(&daemon_member, 0, sizeof(daemon_member)); daemon_member_count = member_list_entries; for (i = 0; i < member_list_entries; i++) { daemon_member[i] = member_list[i]; /* add struct for nodes we've not seen before */ add_node_daemon(member_list[i].nodeid); } memset(&daemon_joined, 0, sizeof(daemon_joined)); daemon_joined_count = joined_list_entries; for (i = 0; i < joined_list_entries; i++) { daemon_joined[i] = joined_list[i]; if (joined_list[i].nodeid == our_nodeid) we_joined = 1; } memset(&daemon_remove, 0, sizeof(daemon_remove)); daemon_remove_count = left_list_entries; for (i = 0; i < left_list_entries; i++) { daemon_remove[i] = left_list[i]; if (left_list[i].reason == CPG_REASON_NODEDOWN) nodedown++; else if (left_list[i].reason == CPG_REASON_PROCDOWN) procdown++; else if (left_list[i].reason == CPG_REASON_LEAVE) leave++; } if (nodedown || procdown || leave) log_debug("%s left reason nodedown %d procdown %d leave %d", group_name->value, nodedown, procdown, leave); if (nodedown) daemon_ringid_wait = 1; if (joined_list_entries) send_protocol(&our_protocol); list_for_each_entry(node, &daemon_nodes, list) { if (in_daemon_list(node->nodeid, daemon_member, daemon_member_count)) { if (node->daemon_member) continue; check_joined_count++; /* node joined daemon cpg */ node->daemon_member = 1; node->daemon_add_time = now; fence_delay_begin = now; last_join_seq++; /* a joining node shows prev members in joined list */ if (!we_joined) node->need_fence_clear = FR_CLEAR_STARTUP|FR_CLEAR_FIPU; if (node->need_fencing) { /* need_fencing will be cleared if we accept a valid proto from it (is_clean_daemon_member) */ log_error("daemon joined %d needs fencing", node->nodeid); } else { log_debug("daemon joined 
%d", node->nodeid); } } else { if (!node->daemon_member) continue; check_remove_count++; /* node left daemon cpg */ node->daemon_member = 0; node->daemon_rem_time = now; node->killed = 0; node->stateful_merge = 0; /* If we never accepted a valid proto from this node, then it never fully joined and there's no need to recover it. Similarly, node_history_lockspace_fail only sets need_fencing in the lockspace if node->start_time was non-zero. */ if (node->proto.daemon_max[0]) { /* tell loop below to look at this node */ node->recover_setup = 1; } else { log_debug("daemon remove %d no proto skip recover", node->nodeid); } memset(&node->proto, 0, sizeof(struct protocol)); } } list_for_each_entry(node, &daemon_nodes, list) { if (node->daemon_member) check_member_count++; } /* when we join, all previous members look like they are joining */ if (!we_joined && (daemon_joined_count != check_joined_count || daemon_remove_count != check_remove_count || daemon_member_count != check_member_count)) { log_error("daemon counts joined %d check %d remove %d check %d member %d check %d", daemon_joined_count, check_joined_count, daemon_remove_count, check_remove_count, daemon_member_count, check_member_count); } /* set up recovery work for nodes that just failed (recover_setup set above) */ list_for_each_entry(node, &daemon_nodes, list) { if (!node->recover_setup) continue; node->recover_setup = 0; reason = 0; low = 0; if (!opt(enable_fencing_ind)) continue; if (node->need_fencing) { log_error("daemon remove %d already needs fencing", node->nodeid); continue; } for (i = 0; i < left_list_entries; i++) { if (left_list[i].nodeid != node->nodeid) continue; reason = left_list[i].reason; break; } if (reason == CPG_REASON_NODEDOWN || reason == CPG_REASON_PROCDOWN) { if (node->fence_pid_wait || node->fence_pid) { /* sanity check, should never happen */ log_error("daemon remove %d pid_wait %d pid %d", node->nodeid, node->fence_pid_wait, node->fence_pid); } node->need_fencing = 1; node->delay_fencing = 0; node->fence_monotime = 0; node->fence_walltime = 0; node->fence_actor_last = 0; node->fence_actor_done = 0; node->fence_pid_wait = 0; node->fence_pid = 0; node->fence_result_wait = 0; node->fence_config.pos = 0; node->left_reason = reason; node->fail_monotime = now; node->fail_walltime = now_wall; low = set_fence_actors(node, 0); } log_debug("daemon remove %d %s need_fencing %d low %d", node->nodeid, reason_str(reason), node->need_fencing, low); } daemon_fence_work(); } static void totem_cb_daemon(cpg_handle_t handle, struct cpg_ring_id ring_id, uint32_t member_list_entries, const uint32_t *member_list) { daemon_ringid.nodeid = ring_id.nodeid; daemon_ringid.seq = ring_id.seq; daemon_ringid_wait = 0; log_ringid("dlm:controld", &ring_id, member_list, member_list_entries); daemon_fence_work(); } static cpg_model_v1_data_t cpg_callbacks_daemon = { .cpg_deliver_fn = deliver_cb_daemon, .cpg_confchg_fn = confchg_cb_daemon, .cpg_totem_confchg_fn = totem_cb_daemon, .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, }; void process_cpg_daemon(int ci) { cs_error_t error; error = cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ALL); if (error != CS_OK && error != CS_ERR_BAD_HANDLE) log_error("daemon cpg_dispatch error %d", error); } int setup_cpg_daemon(void) { cs_error_t error; struct cpg_name name; int i = 0; /* daemon 1.1.1 was cluster3/STABLE3/RHEL6 which is incompatible with cluster4/RHEL7 */ memset(&our_protocol, 0, sizeof(our_protocol)); if (opt(enable_fscontrol_ind)) our_protocol.daemon_max[0] = 2; else our_protocol.daemon_max[0]
= 3; our_protocol.daemon_max[1] = 1; our_protocol.daemon_max[2] = 1; our_protocol.kernel_max[0] = 1; our_protocol.kernel_max[1] = 1; our_protocol.kernel_max[2] = 1; error = cpg_model_initialize(&cpg_handle_daemon, CPG_MODEL_V1, (cpg_model_data_t *)&cpg_callbacks_daemon, NULL); if (error != CS_OK) { log_error("daemon cpg_initialize error %d", error); return -1; } cpg_fd_get(cpg_handle_daemon, &cpg_fd_daemon); memset(&name, 0, sizeof(name)); sprintf(name.value, "dlm:controld"); name.length = strlen(name.value) + 1; log_debug("cpg_join %s ...", name.value); retry: error = cpg_join(cpg_handle_daemon, &name); if (error == CS_ERR_TRY_AGAIN) { sleep(1); if (!(++i % 10)) log_error("daemon cpg_join error retrying"); goto retry; } if (error != CS_OK) { log_error("daemon cpg_join error %d", error); goto fail; } log_debug("setup_cpg_daemon %d", cpg_fd_daemon); return cpg_fd_daemon; fail: cpg_finalize(cpg_handle_daemon); return -1; } static void stop_lockspaces(void) { struct lockspace *ls; list_for_each_entry(ls, &lockspaces, list) { cpg_stop_kernel(ls); } } void close_cpg_daemon(void) { struct lockspace *ls; cs_error_t error; struct cpg_name name; int i = 0; if (!cpg_handle_daemon) { stop_lockspaces(); return; } if (cluster_down) goto fin; memset(&name, 0, sizeof(name)); sprintf(name.value, "dlm:controld"); name.length = strlen(name.value) + 1; log_debug("cpg_leave %s ...", name.value); retry: error = cpg_leave(cpg_handle_daemon, &name); if (error == CS_ERR_TRY_AGAIN) { sleep(1); if (!(++i % 10)) log_error("daemon cpg_leave error retrying"); goto retry; } if (error != CS_OK) log_error("daemon cpg_leave error %d", error); fin: list_for_each_entry(ls, &lockspaces, list) { /* stop kernel ls lock activity before configfs cleanup */ cpg_stop_kernel(ls); if (ls->cpg_handle) cpg_finalize(ls->cpg_handle); } cpg_finalize(cpg_handle_daemon); } void init_daemon(void) { INIT_LIST_HEAD(&daemon_nodes); INIT_LIST_HEAD(&startup_nodes); } static int print_state_daemon_node(struct node_daemon *node, char *str) { snprintf(str, DLMC_STATE_MAXSTR-1, "member=%d " "killed=%d " "left_reason=%s " "need_fencing=%d " "delay_fencing=%d " "fence_pid=%d " "fence_pid_wait=%d " "fence_result_wait=%d " "fence_actor_last=%d " "fence_actor_done=%d " "add_time=%llu " "rem_time=%llu " "fail_walltime=%llu " "fail_monotime=%llu " "fence_walltime=%llu " "fence_monotime=%llu ", node->daemon_member, node->killed, reason_str(node->left_reason), node->need_fencing, node->delay_fencing, node->fence_pid, node->fence_pid_wait, node->fence_result_wait, node->fence_actor_last, node->fence_actor_done, (unsigned long long)node->daemon_add_time, (unsigned long long)node->daemon_rem_time, (unsigned long long)node->fail_walltime, (unsigned long long)node->fail_monotime, (unsigned long long)node->fence_walltime, (unsigned long long)node->fence_monotime); return strlen(str) + 1; } void send_state_daemon_nodes(int fd) { struct node_daemon *node; struct dlmc_state st; char str[DLMC_STATE_MAXSTR]; int str_len; list_for_each_entry(node, &daemon_nodes, list) { memset(&st, 0, sizeof(st)); st.type = DLMC_STATE_DAEMON_NODE; st.nodeid = node->nodeid; memset(str, 0, sizeof(str)); str_len = print_state_daemon_node(node, str); st.str_len = str_len; send(fd, &st, sizeof(st), MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); } } void send_state_startup_nodes(int fd) { struct node_daemon *node; struct dlmc_state st; char str[DLMC_STATE_MAXSTR]; int str_len; list_for_each_entry(node, &startup_nodes, list) { memset(&st, 0, sizeof(st)); st.type = 
DLMC_STATE_STARTUP_NODE; st.nodeid = node->nodeid; memset(str, 0, sizeof(str)); str_len = print_state_daemon_node(node, str); st.str_len = str_len; send(fd, &st, sizeof(st), MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); } } static int print_state_daemon(char *str) { snprintf(str, DLMC_STATE_MAXSTR-1, "member_count=%d " "joined_count=%d " "remove_count=%d " "daemon_ringid=%llu " "cluster_ringid=%llu " "quorate=%d " "fence_pid=%d " "fence_in_progress_unknown=%d " "zombie_count=%d " "monotime=%llu " "stateful_merge_wait=%d ", daemon_member_count, daemon_joined_count, daemon_remove_count, (unsigned long long)daemon_ringid.seq, (unsigned long long)cluster_ringid_seq, cluster_quorate, daemon_fence_pid, fence_in_progress_unknown, zombie_count, (unsigned long long)monotime(), stateful_merge_wait); return strlen(str) + 1; } void send_state_daemon(int fd) { struct dlmc_state st; char str[DLMC_STATE_MAXSTR]; int str_len; memset(&st, 0, sizeof(st)); st.type = DLMC_STATE_DAEMON; st.nodeid = our_nodeid; memset(str, 0, sizeof(str)); str_len = print_state_daemon(str); st.str_len = str_len; send(fd, &st, sizeof(st), MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); } dlm-4.3.0/dlm_controld/deadlock.c000066400000000000000000001110501461150666200166710ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include "dlm_daemon.h" #include "libdlm.h" static SaCkptHandleT global_ckpt_h; static SaCkptCallbacksT callbacks = { 0, 0 }; static SaVersionT version = { 'B', 1, 1 }; static char section_buf[10 * 1024 * 1024]; /* 10MB of pack_lock's enough? 
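(For scale, a rough estimate: struct pack_lock below is about 40 bytes with typical alignment, so this buffer holds on the order of 260,000 packed locks per section.)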
*/ static uint32_t section_len; static uint32_t section_max; struct node { struct list_head list; int nodeid; int checkpoint_ready; /* we've read its ckpt */ int in_cycle; /* participating in cycle */ }; enum { LOCAL_COPY = 1, MASTER_COPY = 2, }; /* from linux/fs/dlm/dlm_internal.h */ #define DLM_LKSTS_WAITING 1 #define DLM_LKSTS_GRANTED 2 #define DLM_LKSTS_CONVERT 3 struct pack_lock { uint64_t xid; uint32_t id; int nodeid; uint32_t remid; int ownpid; uint32_t exflags; uint32_t flags; int8_t status; int8_t grmode; int8_t rqmode; int8_t copy; }; struct dlm_rsb { struct list_head list; struct list_head locks; char name[DLM_RESNAME_MAXLEN]; int len; }; /* information is saved in the lkb, and lkb->lock, from the perspective of the local or master copy, not the process copy */ struct dlm_lkb { struct list_head list; /* r->locks */ struct pack_lock lock; /* data from debugfs/checkpoint */ int home; /* node where the lock owner lives */ struct dlm_rsb *rsb; /* lock is on resource */ struct trans *trans; /* lock owned by this transaction */ struct list_head trans_list; /* tr->locks */ struct trans *waitfor_trans; /* the trans that's holding the lock that's blocking us */ }; /* waitfor pointers alloc'ed 4 at a time */ #define TR_NALLOC 4 struct trans { struct list_head list; struct list_head locks; uint64_t xid; int others_waiting_on_us; /* count of trans's pointing to us in waitfor */ int waitfor_alloc; int waitfor_count; /* count of in-use waitfor slots and num of trans's we're waiting on */ struct trans **waitfor; /* waitfor_alloc trans pointers */ }; static const int __dlm_compat_matrix[8][8] = { /* UN NL CR CW PR PW EX PD */ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */ {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */ {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */ {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */ {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */ {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ }; static inline int dlm_modes_compat(int mode1, int mode2) { return __dlm_compat_matrix[mode1 + 1][mode2 + 1]; } static const char *status_str(int lksts) { switch (lksts) { case DLM_LKSTS_WAITING: return "W"; case DLM_LKSTS_GRANTED: return "G"; case DLM_LKSTS_CONVERT: return "C"; } return "?"; } static void free_resources(struct lockspace *ls) { struct dlm_rsb *r, *r_safe; struct dlm_lkb *lkb, *lkb_safe; list_for_each_entry_safe(r, r_safe, &ls->resources, list) { list_for_each_entry_safe(lkb, lkb_safe, &r->locks, list) { list_del(&lkb->list); if (!list_empty(&lkb->trans_list)) list_del(&lkb->trans_list); free(lkb); } list_del(&r->list); free(r); } } static void free_transactions(struct lockspace *ls) { struct trans *tr, *tr_safe; list_for_each_entry_safe(tr, tr_safe, &ls->transactions, list) { list_del(&tr->list); if (tr->waitfor) free(tr->waitfor); free(tr); } } static void disable_deadlock(void) { log_error("FIXME: deadlock detection disabled"); } void setup_deadlock(void) { SaAisErrorT rv; if (!cfgd_enable_deadlk) return; rv = saCkptInitialize(&global_ckpt_h, &callbacks, &version); if (rv != SA_AIS_OK) log_error("ckpt init error %d", rv); } static struct dlm_rsb *get_resource(struct lockspace *ls, char *name, int len) { struct dlm_rsb *r; list_for_each_entry(r, &ls->resources, list) { if (r->len == len && !strncmp(r->name, name, len)) return r; } r = malloc(sizeof(struct dlm_rsb)); if (!r) { log_error("get_resource: no memory"); disable_deadlock(); return NULL; } memset(r, 0, sizeof(struct dlm_rsb)); memcpy(r->name, name, len); r->len = len; INIT_LIST_HEAD(&r->locks); list_add(&r->list,
&ls->resources); return r; } static struct dlm_lkb *create_lkb(void) { struct dlm_lkb *lkb; lkb = malloc(sizeof(struct dlm_lkb)); if (!lkb) { log_error("create_lkb: no memory"); disable_deadlock(); } else { memset(lkb, 0, sizeof(struct dlm_lkb)); INIT_LIST_HEAD(&lkb->list); INIT_LIST_HEAD(&lkb->trans_list); } return lkb; } static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb) { list_add(&lkb->list, &r->locks); lkb->rsb = r; } /* from linux/fs/dlm/dlm_internal.h */ #define IFL_MSTCPY 0x00010000 /* called on a lock that's just been read from debugfs */ static void set_copy(struct pack_lock *lock) { uint32_t id, remid; if (!lock->nodeid) lock->copy = LOCAL_COPY; else if (lock->flags & IFL_MSTCPY) lock->copy = MASTER_COPY; else { /* process copy lock is converted to a partial master copy lock that will be combined with the real master copy */ lock->copy = MASTER_COPY; id = lock->id; remid = lock->remid; lock->id = remid; lock->remid = id; lock->nodeid = our_nodeid; } } /* xid is always zero in the real master copy, xid should always be non-zero in the partial master copy (what was a process copy) */ /* TODO: confirm or enforce that the partial will always have non-zero xid */ static int partial_master_copy(struct pack_lock *lock) { return (lock->xid != 0); } static struct dlm_lkb *get_lkb(struct dlm_rsb *r, struct pack_lock *lock) { struct dlm_lkb *lkb; if (lock->copy != MASTER_COPY) goto out; list_for_each_entry(lkb, &r->locks, list) { if (lkb->lock.nodeid == lock->nodeid && lkb->lock.id == lock->id) return lkb; } out: return create_lkb(); } static struct dlm_lkb *add_lock(struct lockspace *ls, struct dlm_rsb *r, int from_nodeid, struct pack_lock *lock) { struct dlm_lkb *lkb; lkb = get_lkb(r, lock); if (!lkb) return NULL; switch (lock->copy) { case LOCAL_COPY: lkb->lock.xid = lock->xid; lkb->lock.nodeid = lock->nodeid; lkb->lock.id = lock->id; lkb->lock.remid = lock->remid; lkb->lock.ownpid = lock->ownpid; lkb->lock.exflags = lock->exflags; lkb->lock.flags = lock->flags; lkb->lock.status = lock->status; lkb->lock.grmode = lock->grmode; lkb->lock.rqmode = lock->rqmode; lkb->lock.copy = LOCAL_COPY; lkb->home = from_nodeid; log_group(ls, "add %s local nodeid %d id %x remid %x xid %llx", r->name, lock->nodeid, lock->id, lock->remid, (unsigned long long)lock->xid); break; case MASTER_COPY: if (partial_master_copy(lock)) { lkb->lock.xid = lock->xid; lkb->lock.nodeid = lock->nodeid; lkb->lock.id = lock->id; lkb->lock.remid = lock->remid; lkb->lock.copy = MASTER_COPY; } else { /* only set xid from partial master copy above */ lkb->lock.nodeid = lock->nodeid; lkb->lock.id = lock->id; lkb->lock.remid = lock->remid; lkb->lock.copy = MASTER_COPY; /* set other fields from real master copy */ lkb->lock.ownpid = lock->ownpid; lkb->lock.exflags = lock->exflags; lkb->lock.flags = lock->flags; lkb->lock.status = lock->status; lkb->lock.grmode = lock->grmode; lkb->lock.rqmode = lock->rqmode; } lkb->home = lock->nodeid; log_group(ls, "add %s master nodeid %d id %x remid %x xid %llx", r->name, lock->nodeid, lock->id, lock->remid, (unsigned long long)lock->xid); break; } if (list_empty(&lkb->list)) add_lkb(r, lkb); return lkb; } static void parse_r_name(char *line, char *name) { char *p; int i = 0; int begin = 0; for (p = line; ; p++) { if (*p == '"') { if (begin) break; begin = 1; continue; } if (begin) name[i++] = *p; } } #define LOCK_LINE_MAX 1024 static int read_debugfs_locks(struct lockspace *ls) { FILE *file; char path[PATH_MAX]; char line[LOCK_LINE_MAX]; struct dlm_rsb *r; struct pack_lock lock; char 
r_name[65]; unsigned long long xid; unsigned int waiting; int r_nodeid; int r_len; int rv; snprintf(path, PATH_MAX, "/sys/kernel/debug/dlm/%s_locks", ls->name); file = fopen(path, "r"); if (!file) return -1; /* skip the header on the first line */ if (!fgets(line, LOCK_LINE_MAX, file)) { log_error("Unable to read %s: %d", path, errno); goto out; } while (fgets(line, LOCK_LINE_MAX, file)) { memset(&lock, 0, sizeof(struct pack_lock)); rv = sscanf(line, "%x %d %x %u %llu %x %x %hhd %hhd %hhd %u %d %d", &lock.id, &lock.nodeid, &lock.remid, &lock.ownpid, &xid, &lock.exflags, &lock.flags, &lock.status, &lock.grmode, &lock.rqmode, &waiting, &r_nodeid, &r_len); lock.xid = xid; /* hack to avoid warning */ if (rv != 13) { log_error("invalid debugfs line %d: %s", rv, line); goto out; } memset(r_name, 0, sizeof(r_name)); parse_r_name(line, r_name); r = get_resource(ls, r_name, r_len); if (!r) break; set_copy(&lock); add_lock(ls, r, our_nodeid, &lock); } out: fclose(file); return 0; } static int read_checkpoint_locks(struct lockspace *ls, int from_nodeid, char *numbuf, int buflen) { struct dlm_rsb *r; struct pack_lock *lock; int count = section_len / sizeof(struct pack_lock); int i; r = get_resource(ls, numbuf, buflen - 1); if (!r) return -1; lock = (struct pack_lock *) &section_buf; for (i = 0; i < count; i++) { lock->xid = le64_to_cpu(lock->xid); lock->id = le32_to_cpu(lock->id); lock->nodeid = le32_to_cpu(lock->nodeid); lock->remid = le32_to_cpu(lock->remid); lock->ownpid = le32_to_cpu(lock->ownpid); lock->exflags = le32_to_cpu(lock->exflags); lock->flags = le32_to_cpu(lock->flags); add_lock(ls, r, from_nodeid, lock); lock++; } return 0; } static int pack_lkb_list(struct list_head *q, struct pack_lock **lockp) { struct dlm_lkb *lkb; struct pack_lock *lock = *lockp; int count = 0; list_for_each_entry(lkb, q, list) { if (count + 1 > section_max) { log_error("too many locks %d for ckpt buf", count); break; } lock->xid = cpu_to_le64(lkb->lock.xid); lock->id = cpu_to_le32(lkb->lock.id); lock->nodeid = cpu_to_le32(lkb->lock.nodeid); lock->remid = cpu_to_le32(lkb->lock.remid); lock->ownpid = cpu_to_le32(lkb->lock.ownpid); lock->exflags = cpu_to_le32(lkb->lock.exflags); lock->flags = cpu_to_le32(lkb->lock.flags); lock->status = lkb->lock.status; lock->grmode = lkb->lock.grmode; lock->rqmode = lkb->lock.rqmode; lock->copy = lkb->lock.copy; lock++; count++; } return count; } static void pack_section_buf(struct lockspace *ls, struct dlm_rsb *r) { struct pack_lock *lock; int count; memset(&section_buf, 0, sizeof(section_buf)); section_max = sizeof(section_buf) / sizeof(struct pack_lock); lock = (struct pack_lock *) &section_buf; count = pack_lkb_list(&r->locks, &lock); section_len = count * sizeof(struct pack_lock); } static int _unlink_checkpoint(struct lockspace *ls, SaNameT *name) { SaCkptCheckpointHandleT h; SaCkptCheckpointDescriptorT s; SaAisErrorT rv; int ret = 0; int retries; h = (SaCkptCheckpointHandleT) ls->deadlk_ckpt_handle; log_group(ls, "unlink ckpt %llx", (unsigned long long)h); retries = 0; unlink_retry: rv = saCkptCheckpointUnlink(global_ckpt_h, name); if (rv == SA_AIS_ERR_TRY_AGAIN) { log_group(ls, "unlink ckpt retry"); sleep(1); if (retries++ < 10) goto unlink_retry; } if (rv == SA_AIS_OK) goto out_close; if (!h) goto out; log_error("unlink ckpt error %d %s", rv, ls->name); ret = -1; retries = 0; status_retry: rv = saCkptCheckpointStatusGet(h, &s); if (rv == SA_AIS_ERR_TRY_AGAIN) { log_group(ls, "unlink ckpt status retry"); sleep(1); if (retries++ < 10) goto status_retry; } if (rv != SA_AIS_OK) {
log_error("unlink ckpt status error %d %s", rv, ls->name); goto out_close; } log_group(ls, "unlink ckpt status: size %llu, max sections %u, " "max section size %llu, section count %u, mem %u", (unsigned long long)s.checkpointCreationAttributes.checkpointSize, s.checkpointCreationAttributes.maxSections, (unsigned long long)s.checkpointCreationAttributes.maxSectionSize, s.numberOfSections, s.memoryUsed); out_close: retries = 0; close_retry: rv = saCkptCheckpointClose(h); if (rv == SA_AIS_ERR_TRY_AGAIN) { log_group(ls, "unlink ckpt close retry"); sleep(1); if (retries++ < 10) goto close_retry; } if (rv != SA_AIS_OK) { log_error("unlink ckpt %llx close err %d %s", (unsigned long long)h, rv, ls->name); } out: ls->deadlk_ckpt_handle = 0; return ret; } static int unlink_checkpoint(struct lockspace *ls) { SaNameT name; int len; len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "dlmdeadlk.%s.%d", ls->name, our_nodeid); name.length = len; return _unlink_checkpoint(ls, &name); } static void read_checkpoint(struct lockspace *ls, int nodeid) { SaCkptCheckpointHandleT h; SaCkptSectionIterationHandleT itr; SaCkptSectionDescriptorT desc; SaCkptIOVectorElementT iov; SaNameT name; SaAisErrorT rv; char buf[DLM_RESNAME_MAXLEN]; int len; int retries; if (nodeid == our_nodeid) return; log_group(ls, "read_checkpoint %d", nodeid); len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "dlmdeadlk.%s.%d", ls->name, nodeid); name.length = len; retries = 0; open_retry: rv = saCkptCheckpointOpen(global_ckpt_h, &name, NULL, SA_CKPT_CHECKPOINT_READ, 0, &h); if (rv == SA_AIS_ERR_TRY_AGAIN) { log_group(ls, "read_checkpoint: %d ckpt open retry", nodeid); sleep(1); if (retries++ < 10) goto open_retry; } if (rv != SA_AIS_OK) { log_error("read_checkpoint: %d ckpt open error %d", nodeid, rv); return; } retries = 0; init_retry: rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, 0, &itr); if (rv == SA_AIS_ERR_TRY_AGAIN) { log_group(ls, "read_checkpoint: ckpt iterinit retry"); sleep(1); if (retries++ < 10) goto init_retry; } if (rv != SA_AIS_OK) { log_error("read_checkpoint: %d ckpt iterinit error %d", nodeid, rv); goto out; } while (1) { retries = 0; next_retry: rv = saCkptSectionIterationNext(itr, &desc); if (rv == SA_AIS_ERR_NO_SECTIONS) break; if (rv == SA_AIS_ERR_TRY_AGAIN) { log_group(ls, "read_checkpoint: ckpt iternext retry"); sleep(1); if (retries++ < 10) goto next_retry; } if (rv != SA_AIS_OK) { log_error("read_checkpoint: %d ckpt iternext error %d", nodeid, rv); goto out_it; } if (!desc.sectionSize) continue; iov.sectionId = desc.sectionId; iov.dataBuffer = &section_buf; iov.dataSize = desc.sectionSize; iov.dataOffset = 0; memset(&buf, 0, sizeof(buf)); snprintf(buf, sizeof(buf), "%s", desc.sectionId.id); log_group(ls, "read_checkpoint: section size %llu id %u \"%s\"", (unsigned long long)iov.dataSize, iov.sectionId.idLen, buf); retries = 0; read_retry: rv = saCkptCheckpointRead(h, &iov, 1, NULL); if (rv == SA_AIS_ERR_TRY_AGAIN) { log_group(ls, "read_checkpoint: ckpt read retry"); sleep(1); if (retries++ < 10) goto read_retry; } if (rv != SA_AIS_OK) { log_error("read_checkpoint: %d ckpt read error %d", nodeid, rv); goto out_it; } section_len = iov.readSize; if (!section_len) continue; if (section_len % sizeof(struct pack_lock)) { log_error("read_checkpoint: %d bad section len %d", nodeid, section_len); continue; } read_checkpoint_locks(ls, nodeid, (char *)desc.sectionId.id, desc.sectionId.idLen); } out_it: saCkptSectionIterationFinalize(itr); retries = 0; out: rv = saCkptCheckpointClose(h); if (rv ==
SA_AIS_ERR_TRY_AGAIN) { log_group(ls, "read_checkpoint: unlink ckpt close retry"); sleep(1); if (retries++ < 10) goto out; } if (rv != SA_AIS_OK) log_error("read_checkpoint: %d close error %d", nodeid, rv); } static void write_checkpoint(struct lockspace *ls) { SaCkptCheckpointCreationAttributesT attr; SaCkptCheckpointHandleT h; SaCkptSectionIdT section_id; SaCkptSectionCreationAttributesT section_attr; SaCkptCheckpointOpenFlagsT flags; SaNameT name; SaAisErrorT rv; char buf[DLM_RESNAME_MAXLEN]; struct dlm_rsb *r; struct dlm_lkb *lkb; int r_count, lock_count, total_size, section_size, max_section_size; int len; len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "dlmdeadlk.%s.%d", ls->name, our_nodeid); name.length = len; /* unlink an old checkpoint before we create a new one */ if (ls->deadlk_ckpt_handle) { log_error("write_checkpoint: old ckpt"); if (_unlink_checkpoint(ls, &name)) return; } /* loop through all locks to figure out sizes to set in the attr fields */ r_count = 0; lock_count = 0; total_size = 0; max_section_size = 0; list_for_each_entry(r, &ls->resources, list) { r_count++; section_size = 0; list_for_each_entry(lkb, &r->locks, list) { section_size += sizeof(struct pack_lock); lock_count++; } total_size += section_size; if (section_size > max_section_size) max_section_size = section_size; } log_group(ls, "write_checkpoint: r_count %d, lock_count %d", r_count, lock_count); log_group(ls, "write_checkpoint: total %d bytes, max_section %d bytes", total_size, max_section_size); attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS; attr.checkpointSize = total_size; attr.retentionDuration = SA_TIME_MAX; attr.maxSections = r_count + 1; /* don't know why we need +1 */ attr.maxSectionSize = max_section_size; attr.maxSectionIdSize = DLM_RESNAME_MAXLEN; flags = SA_CKPT_CHECKPOINT_READ | SA_CKPT_CHECKPOINT_WRITE | SA_CKPT_CHECKPOINT_CREATE; open_retry: rv = saCkptCheckpointOpen(global_ckpt_h, &name, &attr, flags, 0, &h); if (rv == SA_AIS_ERR_TRY_AGAIN) { log_group(ls, "write_checkpoint: ckpt open retry"); sleep(1); goto open_retry; } if (rv == SA_AIS_ERR_EXIST) { log_group(ls, "write_checkpoint: ckpt already exists"); return; } if (rv != SA_AIS_OK) { log_group(ls, "write_checkpoint: ckpt open error %d", rv); return; } log_group(ls, "write_checkpoint: open ckpt handle %llx", (unsigned long long)h); ls->deadlk_ckpt_handle = (uint64_t) h; list_for_each_entry(r, &ls->resources, list) { memset(buf, 0, sizeof(buf)); len = snprintf(buf, sizeof(buf), "%s", r->name); section_id.id = (void *)buf; section_id.idLen = len + 1; section_attr.sectionId = &section_id; section_attr.expirationTime = SA_TIME_END; pack_section_buf(ls, r); log_group(ls, "write_checkpoint: section size %u id %u \"%s\"", section_len, section_id.idLen, buf); create_retry: rv = saCkptSectionCreate(h, &section_attr, &section_buf, section_len); if (rv == SA_AIS_ERR_TRY_AGAIN) { log_group(ls, "write_checkpoint: ckpt create retry"); sleep(1); goto create_retry; } if (rv == SA_AIS_ERR_EXIST) { /* this shouldn't happen in general */ log_error("write_checkpoint: clearing old ckpt"); saCkptCheckpointClose(h); _unlink_checkpoint(ls, &name); goto open_retry; } if (rv != SA_AIS_OK) { log_error("write_checkpoint: section create %d", rv); break; } } } static void send_message(struct lockspace *ls, int type, uint32_t to_nodeid, uint32_t msgdata) { struct dlm_header *hd; int len; char *buf; len = sizeof(struct dlm_header); buf = malloc(len); if (!buf) { log_error("send_message: no memory"); disable_deadlock(); return; } memset(buf, 0, len); hd = (struct
dlm_header *)buf; hd->type = type; hd->to_nodeid = to_nodeid; hd->msgdata = msgdata; dlm_send_message(ls, buf, len); free(buf); } static void send_checkpoint_ready(struct lockspace *ls) { log_group(ls, "send_checkpoint_ready"); send_message(ls, DLM_MSG_DEADLK_CHECKPOINT_READY, 0, 0); } void send_cycle_start(struct lockspace *ls) { log_group(ls, "send_cycle_start"); send_message(ls, DLM_MSG_DEADLK_CYCLE_START, 0, 0); } static void send_cycle_end(struct lockspace *ls) { log_group(ls, "send_cycle_end"); send_message(ls, DLM_MSG_DEADLK_CYCLE_END, 0, 0); } static void send_cancel_lock(struct lockspace *ls, struct trans *tr, struct dlm_lkb *lkb) { int to_nodeid; uint32_t lkid; if (!lkb->lock.nodeid) lkid = lkb->lock.id; else lkid = lkb->lock.remid; to_nodeid = lkb->home; log_group(ls, "send_cancel_lock to nodeid %d rsb %s id %x xid %llx", to_nodeid, lkb->rsb->name, lkid, (unsigned long long)lkb->lock.xid); send_message(ls, DLM_MSG_DEADLK_CANCEL_LOCK, to_nodeid, lkid); } static void dump_resources(struct lockspace *ls) { struct dlm_rsb *r; struct dlm_lkb *lkb; log_group(ls, "Resource dump:"); list_for_each_entry(r, &ls->resources, list) { log_group(ls, "\"%s\" len %d", r->name, r->len); list_for_each_entry(lkb, &r->locks, list) { log_group(ls, " %s: nodeid %d id %08x remid %08x gr %s rq %s pid %u xid %llx", status_str(lkb->lock.status), lkb->lock.nodeid, lkb->lock.id, lkb->lock.remid, dlm_mode_str(lkb->lock.grmode), dlm_mode_str(lkb->lock.rqmode), lkb->lock.ownpid, (unsigned long long)lkb->lock.xid); } } } static void find_deadlock(struct lockspace *ls); static void run_deadlock(struct lockspace *ls) { struct node *node; int not_ready = 0; int low = -1; if (ls->all_checkpoints_ready) log_group(ls, "WARNING: run_deadlock all_checkpoints_ready"); list_for_each_entry(node, &ls->deadlk_nodes, list) { if (!node->in_cycle) continue; if (!node->checkpoint_ready) not_ready++; log_group(ls, "nodeid %d checkpoint_ready = %d", node->nodeid, node->checkpoint_ready); } if (not_ready) return; ls->all_checkpoints_ready = 1; list_for_each_entry(node, &ls->deadlk_nodes, list) { if (!node->in_cycle) continue; if (node->nodeid < low || low == -1) low = node->nodeid; } ls->deadlk_low_nodeid = low; if (low == our_nodeid) find_deadlock(ls); else log_group(ls, "defer resolution to low nodeid %d", low); } void receive_checkpoint_ready(struct lockspace *ls, struct dlm_header *hd, int len) { struct node *node; int nodeid = hd->nodeid; log_group(ls, "receive_checkpoint_ready from %d", nodeid); read_checkpoint(ls, nodeid); list_for_each_entry(node, &ls->deadlk_nodes, list) { if (node->nodeid == nodeid) { node->checkpoint_ready = 1; break; } } run_deadlock(ls); } void receive_cycle_start(struct lockspace *ls, struct dlm_header *hd, int len) { struct node *node; int nodeid = hd->nodeid; int rv; log_group(ls, "receive_cycle_start from %d", nodeid); if (ls->cycle_running) { log_group(ls, "cycle already running"); return; } ls->cycle_running = 1; gettimeofday(&ls->cycle_start_time, NULL); list_for_each_entry(node, &ls->deadlk_nodes, list) node->in_cycle = 1; rv = read_debugfs_locks(ls); if (rv < 0) { log_error("can't read dlm debugfs file: %s", strerror(errno)); return; } write_checkpoint(ls); send_checkpoint_ready(ls); } static uint64_t dt_usec(struct timeval *start, struct timeval *stop) { uint64_t dt; dt = stop->tv_sec - start->tv_sec; dt *= 1000000; dt += stop->tv_usec - start->tv_usec; return dt; } /* TODO: nodes added during a cycle - what will they do with messages they recv from other nodes running the cycle? 
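(A partial answer, judging from the code in this file: deadlk_confchg() below leaves node->in_cycle clear for nodes that join mid-cycle, so run_deadlock() ignores them; what the newly joined node itself does with cycle messages it receives is still the open question.)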
*/ void receive_cycle_end(struct lockspace *ls, struct dlm_header *hd, int len) { struct node *node; int nodeid = hd->nodeid; uint64_t usec; if (!ls->cycle_running) { log_error("receive_cycle_end %s from %d: no cycle running", ls->name, nodeid); return; } gettimeofday(&ls->cycle_end_time, NULL); usec = dt_usec(&ls->cycle_start_time, &ls->cycle_end_time); log_group(ls, "receive_cycle_end: from %d cycle time %.2f s", nodeid, usec * 1.e-6); ls->cycle_running = 0; ls->all_checkpoints_ready = 0; list_for_each_entry(node, &ls->deadlk_nodes, list) node->checkpoint_ready = 0; free_resources(ls); free_transactions(ls); unlink_checkpoint(ls); } void receive_cancel_lock(struct lockspace *ls, struct dlm_header *hd, int len) { dlm_lshandle_t h; int nodeid = hd->nodeid; uint32_t lkid = hd->msgdata; int rv; if (nodeid != our_nodeid) return; h = dlm_open_lockspace(ls->name); if (!h) { log_error("deadlock cancel %x from %d can't open lockspace %s", lkid, nodeid, ls->name); return; } log_group(ls, "receive_cancel_lock %x from %d", lkid, nodeid); rv = dlm_ls_deadlock_cancel(h, lkid, 0); if (rv < 0) { log_error("deadlock cancel %x from %x lib cancel errno %d", lkid, nodeid, errno); } dlm_close_lockspace(h); } static void node_joined(struct lockspace *ls, int nodeid) { struct node *node; node = malloc(sizeof(struct node)); if (!node) { log_error("node_joined: no memory"); disable_deadlock(); return; } memset(node, 0, sizeof(struct node)); node->nodeid = nodeid; list_add_tail(&node->list, &ls->deadlk_nodes); log_group(ls, "node %d joined deadlock cpg", nodeid); } static void node_left(struct lockspace *ls, int nodeid, int reason) { struct node *node, *safe; list_for_each_entry_safe(node, safe, &ls->deadlk_nodes, list) { if (node->nodeid != nodeid) continue; list_del(&node->list); free(node); log_group(ls, "node %d left deadlock cpg", nodeid); } } static void purge_locks(struct lockspace *ls, int nodeid); void deadlk_confchg(struct lockspace *ls, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries) { int i; if (!cfgd_enable_deadlk) return; if (!ls->deadlk_confchg_init) { ls->deadlk_confchg_init = 1; for (i = 0; i < member_list_entries; i++) node_joined(ls, member_list[i].nodeid); return; } /* nodes added during a cycle won't have node->in_cycle set so they won't be included in any of the cycle processing */ for (i = 0; i < joined_list_entries; i++) node_joined(ls, joined_list[i].nodeid); for (i = 0; i < left_list_entries; i++) node_left(ls, left_list[i].nodeid, left_list[i].reason); if (!ls->cycle_running) return; if (!left_list_entries) return; if (!ls->all_checkpoints_ready) { run_deadlock(ls); return; } for (i = 0; i < left_list_entries; i++) purge_locks(ls, left_list[i].nodeid); for (i = 0; i < left_list_entries; i++) { if (left_list[i].nodeid != ls->deadlk_low_nodeid) continue; /* this will set a new low node which will call find_deadlock */ run_deadlock(ls); break; } } /* would we ever call this after we've created the transaction lists? 
I don't think so; I think it can only be called between reading checkpoints */ static void purge_locks(struct lockspace *ls, int nodeid) { struct dlm_rsb *r; struct dlm_lkb *lkb, *safe; list_for_each_entry(r, &ls->resources, list) { list_for_each_entry_safe(lkb, safe, &r->locks, list) { if (lkb->home == nodeid) { list_del(&lkb->list); if (list_empty(&lkb->trans_list)) free(lkb); else log_group(ls, "purge %d %x on trans", nodeid, lkb->lock.id); } } } } static void add_lkb_trans(struct trans *tr, struct dlm_lkb *lkb) { list_add(&lkb->trans_list, &tr->locks); lkb->trans = tr; } static struct trans *get_trans(struct lockspace *ls, uint64_t xid) { struct trans *tr; list_for_each_entry(tr, &ls->transactions, list) { if (tr->xid == xid) return tr; } tr = malloc(sizeof(struct trans)); if (!tr) { log_error("get_trans: no memory"); disable_deadlock(); return NULL; } memset(tr, 0, sizeof(struct trans)); tr->xid = xid; tr->waitfor = NULL; tr->waitfor_alloc = 0; tr->waitfor_count = 0; INIT_LIST_HEAD(&tr->locks); list_add(&tr->list, &ls->transactions); return tr; } /* for each rsb, for each lock, find/create trans, add lkb to the trans list */ static void create_trans_list(struct lockspace *ls) { struct dlm_rsb *r; struct dlm_lkb *lkb; struct trans *tr; int r_count = 0, lkb_count = 0; list_for_each_entry(r, &ls->resources, list) { r_count++; list_for_each_entry(lkb, &r->locks, list) { lkb_count++; tr = get_trans(ls, lkb->lock.xid); if (!tr) goto out; add_lkb_trans(tr, lkb); } } out: log_group(ls, "create_trans_list: r_count %d lkb_count %d", r_count, lkb_count); } static int locks_compat(struct dlm_lkb *waiting_lkb, struct dlm_lkb *granted_lkb) { if (waiting_lkb == granted_lkb) { log_debug("waiting and granted same lock"); return 0; } if (waiting_lkb->trans->xid == granted_lkb->trans->xid) { log_debug("waiting and granted same trans %llx", (unsigned long long)waiting_lkb->trans->xid); return 0; } return dlm_modes_compat(granted_lkb->lock.grmode, waiting_lkb->lock.rqmode); } static int in_waitfor(struct trans *tr, struct trans *add_tr) { int i; for (i = 0; i < tr->waitfor_alloc; i++) { if (!tr->waitfor[i]) continue; if (tr->waitfor[i] == add_tr) return 1; } return 0; } static void add_waitfor(struct lockspace *ls, struct dlm_lkb *waiting_lkb, struct dlm_lkb *granted_lkb) { struct trans *tr = waiting_lkb->trans; int i; if (locks_compat(waiting_lkb, granted_lkb)) return; /* this shouldn't happen AFAIK */ if (tr == granted_lkb->trans) { log_group(ls, "trans %llx waiting on self", (unsigned long long)tr->xid); return; } /* don't add the same trans to the waitfor list multiple times */ if (tr->waitfor_count && in_waitfor(tr, granted_lkb->trans)) { log_group(ls, "trans %llx already waiting for trans %llx, " "waiting %x %s, granted %x %s", (unsigned long long)waiting_lkb->trans->xid, (unsigned long long)granted_lkb->trans->xid, waiting_lkb->lock.id, waiting_lkb->rsb->name, granted_lkb->lock.id, granted_lkb->rsb->name); return; } if (tr->waitfor_count == tr->waitfor_alloc) { struct trans **old_waitfor = tr->waitfor; tr->waitfor_alloc += TR_NALLOC; tr->waitfor = malloc(tr->waitfor_alloc * sizeof(struct trans *)); if (!tr->waitfor) { log_error("add_waitfor no mem %u", tr->waitfor_alloc); /* restore the old array so the trans stays consistent */ tr->waitfor = old_waitfor; tr->waitfor_alloc -= TR_NALLOC; return; } memset(tr->waitfor, 0, tr->waitfor_alloc * sizeof(struct trans *)); /* copy then free old set of pointers */ for (i = 0; i < tr->waitfor_count; i++) tr->waitfor[i] = old_waitfor[i]; if (old_waitfor) free(old_waitfor); } tr->waitfor[tr->waitfor_count++] = granted_lkb->trans; granted_lkb->trans->others_waiting_on_us++; waiting_lkb->waitfor_trans
= granted_lkb->trans; } /* for each trans, for each waiting lock, go to rsb of the lock, find granted locks on that rsb, then find the trans the granted lock belongs to, add that trans to our waitfor list */ static void create_waitfor_graph(struct lockspace *ls) { struct dlm_lkb *waiting_lkb, *granted_lkb; struct dlm_rsb *r; struct trans *tr; int depend_count = 0; list_for_each_entry(tr, &ls->transactions, list) { list_for_each_entry(waiting_lkb, &tr->locks, trans_list) { if (waiting_lkb->lock.status == DLM_LKSTS_GRANTED) continue; /* waiting_lkb status is CONVERT or WAITING */ r = waiting_lkb->rsb; list_for_each_entry(granted_lkb, &r->locks, list) { if (granted_lkb->lock.status==DLM_LKSTS_WAITING) continue; /* granted_lkb status is GRANTED or CONVERT */ add_waitfor(ls, waiting_lkb, granted_lkb); depend_count++; } } } log_group(ls, "create_waitfor_graph: depend_count %d", depend_count); } /* Assume a transaction that's not waiting on any locks will complete, release all the locks it currently holds, and exit. Other transactions that were blocked waiting on the removed transaction's now-released locks may now be unblocked, complete, release all held locks and exit. Repeat this until no more transactions can be removed. If there are transactions remaining, then they are deadlocked. */ static void remove_waitfor(struct trans *tr, struct trans *remove_tr) { int i; for (i = 0; i < tr->waitfor_alloc; i++) { if (!tr->waitfor_count) break; if (!tr->waitfor[i]) continue; if (tr->waitfor[i] == remove_tr) { tr->waitfor[i] = NULL; tr->waitfor_count--; remove_tr->others_waiting_on_us--; } } } /* remove_tr is not waiting for anything, assume it completes and goes away and remove it from any other transaction's waitfor list */ static void remove_trans(struct lockspace *ls, struct trans *remove_tr) { struct trans *tr; list_for_each_entry(tr, &ls->transactions, list) { if (tr == remove_tr) continue; if (!remove_tr->others_waiting_on_us) break; remove_waitfor(tr, remove_tr); } if (remove_tr->others_waiting_on_us) log_group(ls, "trans %llx removed others waiting %d", (unsigned long long)remove_tr->xid, remove_tr->others_waiting_on_us); } static int reduce_waitfor_graph(struct lockspace *ls) { struct trans *tr, *safe; int blocked = 0; int removed = 0; list_for_each_entry_safe(tr, safe, &ls->transactions, list) { if (tr->waitfor_count) { blocked++; continue; } remove_trans(ls, tr); list_del(&tr->list); if (tr->waitfor) free(tr->waitfor); free(tr); removed++; } log_group(ls, "reduce_waitfor_graph: %d blocked, %d removed", blocked, removed); return removed; } static void reduce_waitfor_graph_loop(struct lockspace *ls) { int removed; while (1) { removed = reduce_waitfor_graph(ls); if (!removed) break; } } static struct trans *find_trans_to_cancel(struct lockspace *ls) { struct trans *tr; list_for_each_entry(tr, &ls->transactions, list) { if (!tr->others_waiting_on_us) continue; return tr; } return NULL; } static void cancel_trans(struct lockspace *ls) { struct trans *tr; struct dlm_lkb *lkb; int removed; tr = find_trans_to_cancel(ls); if (!tr) { log_group(ls, "cancel_trans: no trans found"); return; } list_for_each_entry(lkb, &tr->locks, trans_list) { if (lkb->lock.status == DLM_LKSTS_GRANTED) continue; send_cancel_lock(ls, tr, lkb); /* When this canceled trans has multiple locks all blocked by locks held by one other trans, that other trans is only added to tr->waitfor once, and only one of these waiting locks will have waitfor_trans set. 
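(Illustration, using two hypothetical transactions A and B: if A has three waiting locks that are all blocked by locks held by B, then B appears only once in A's waitfor array, and only the first of those three waiting lkbs has waitfor_trans pointing at B.)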
So, the lkb with non-null waitfor_trans was the first one responsible for adding waitfor_trans to tr->waitfor. We could potentially forget about keeping track of lkb->waitfor_trans, forget about calling remove_waitfor() here and just set tr->waitfor_count = 0 after this loop. The loss would be that waitfor_trans->others_waiting_on_us would not get decremented. */ if (lkb->waitfor_trans) remove_waitfor(tr, lkb->waitfor_trans); } /* this shouldn't happen, if it does something's not working right */ if (tr->waitfor_count) { log_group(ls, "cancel_trans: %llx non-zero waitfor_count %d", (unsigned long long)tr->xid, tr->waitfor_count); } /* this should now remove the canceled trans since it now has a zero waitfor_count */ removed = reduce_waitfor_graph(ls); if (!removed) log_group(ls, "canceled trans not removed from graph"); /* now call reduce_waitfor_graph() in another loop and it should completely reduce */ } static void dump_trans(struct lockspace *ls, struct trans *tr) { struct dlm_lkb *lkb; struct trans *wf; int i; log_group(ls, "trans xid %llx waitfor_count %d others_waiting_on_us %d", (unsigned long long)tr->xid, tr->waitfor_count, tr->others_waiting_on_us); log_group(ls, "locks:"); list_for_each_entry(lkb, &tr->locks, trans_list) { log_group(ls, " %s: id %08x gr %s rq %s pid %u:%u \"%s\"", status_str(lkb->lock.status), lkb->lock.id, dlm_mode_str(lkb->lock.grmode), dlm_mode_str(lkb->lock.rqmode), lkb->home, lkb->lock.ownpid, lkb->rsb->name); } if (!tr->waitfor_count) return; log_group(ls, "waitfor:"); for (i = 0; i < tr->waitfor_alloc; i++) { if (!tr->waitfor[i]) continue; wf = tr->waitfor[i]; log_group(ls, " xid %llx", (unsigned long long)wf->xid); } } static void dump_all_trans(struct lockspace *ls) { struct trans *tr; log_group(ls, "Transaction dump:"); list_for_each_entry(tr, &ls->transactions, list) dump_trans(ls, tr); } static void find_deadlock(struct lockspace *ls) { if (list_empty(&ls->resources)) { log_group(ls, "no deadlock: no resources"); goto out; } if (!list_empty(&ls->transactions)) { log_group(ls, "transactions list should be empty"); goto out; } dump_resources(ls); create_trans_list(ls); create_waitfor_graph(ls); dump_all_trans(ls); reduce_waitfor_graph_loop(ls); if (list_empty(&ls->transactions)) { log_group(ls, "no deadlock: all transactions reduced"); goto out; } log_group(ls, "found deadlock"); dump_all_trans(ls); cancel_trans(ls); reduce_waitfor_graph_loop(ls); if (list_empty(&ls->transactions)) { log_group(ls, "resolved deadlock with cancel"); goto out; } log_error("deadlock resolution failed"); dump_all_trans(ls); out: send_cycle_end(ls); } dlm-4.3.0/dlm_controld/dlm.conf.5000066400000000000000000000200711461150666200165470ustar00rootroot00000000000000.TH DLM.CONF 5 2012-04-09 dlm dlm .SH NAME dlm.conf \- dlm_controld configuration file .SH SYNOPSIS .B /etc/dlm/dlm.conf .SH DESCRIPTION The configuration options in dlm.conf mirror the dlm_controld command line options. The config file additionally allows advanced fencing and lockspace configuration that is not supported on the command line. .SH Command line equivalents If an option is specified on the command line and in the config file, the command line setting overrides the config file setting. See .BR dlm_controld (8) for descriptions and dlm_controld -h for defaults.
Format: setting=value Example: log_debug=1 .br post_join_delay=10 .br protocol=tcp Options: daemon_debug(*) .br log_debug(*) .br protocol .br mark .br debug_logfile(*) .br enable_plock .br plock_debug(*) .br plock_rate_limit(*) .br plock_ownership .br drop_resources_time(*) .br drop_resources_count(*) .br drop_resources_age(*) .br post_join_delay(*) .br enable_fencing .br enable_concurrent_fencing .br enable_startup_fencing .br enable_quorum_fencing(*) .br enable_quorum_lockspace(*) .br repeat_failed_fencing(*) .br enable_helper .br Options with (*) can be reloaded, see Reload config. .SH Reload config Some dlm.conf settings can be changed while dlm_controld is running using dlm_tool reload_config. Edit dlm.conf, adding, removing, commenting or changing values, then run dlm_tool reload_config to apply the changes in dlm_controld. dlm_tool dump_config will show the new settings. .SH Fencing A fence device definition begins with a .B device line, followed by a number of .B connect lines, one for each node connected to the device. A blank line separates device definitions. Devices are used in the order they are listed. The .B device key word is followed by a unique .IR dev_name , the .I agent program to be used, and .IR args , which are agent arguments specific to the device. The .B connect key word is followed by the .I dev_name of the device section, the node ID of the connected node in the format .BI node= nodeid and .IR args , which are agent arguments specific to the node for the given device. The format of .I args is key=val on both device and connect lines, each pair separated by a space, e.g. key1=val1 key2=val2 key3=val3. Format: .B device .I " dev_name" .I agent [args] .br .B connect .I dev_name .BI node= nodeid [args] .br .B connect .I dev_name .BI node= nodeid [args] .br .B connect .I dev_name .BI node= nodeid [args] .br Example: device foo fence_foo ipaddr=1.1.1.1 login=x password=y .br connect foo node=1 port=1 .br connect foo node=2 port=2 .br connect foo node=3 port=3 device bar fence_bar ipaddr=2.2.2.2 login=x password=y .br connect bar node=1 port=1 .br connect bar node=2 port=2 .br connect bar node=3 port=3 .SS Parallel devices Some devices, like dual power or dual path, must all be turned off in parallel for fencing to succeed. To define multiple devices as being parallel to each other, use the same base dev_name with different suffixes and a colon separator between base name and suffix. Format: .B device .IR " dev_name" :1 .I agent [args] .br .B connect .IR dev_name :1 .BI node= nodeid [args] .br .B connect .IR dev_name :1 .BI node= nodeid [args] .br .B connect .IR dev_name :1 .BI node= nodeid [args] .B device .IR " dev_name" :2 .I agent [args] .br .B connect .IR dev_name :2 .BI node= nodeid [args] .br .B connect .IR dev_name :2 .BI node= nodeid [args] .br .B connect .IR dev_name :2 .BI node= nodeid [args] Example: device foo:1 fence_foo ipaddr=1.1.1.1 login=x password=y .br connect foo:1 node=1 port=1 .br connect foo:1 node=2 port=2 .br connect foo:1 node=3 port=3 device foo:2 fence_foo ipaddr=5.5.5.5 login=x password=y .br connect foo:2 node=1 port=1 .br connect foo:2 node=2 port=2 .br connect foo:2 node=3 port=3 .SS Unfencing A node may sometimes need to "unfence" itself when starting. The unfencing command reverses the effect of a previous fencing operation against it. An example would be fencing that disables a port on a SAN switch. A node could use unfencing to re-enable its switch port when starting up after rebooting.
(Care must be taken to ensure it's safe for a node to unfence itself. A node often needs to be cleanly rebooted before unfencing itself.) To specify that a node should unfence itself for a given .BR device, the .B unfence line is added after the .B connect lines. Format: .B device .I " dev_name" .I agent [args] .br .B connect .I dev_name .BI node= nodeid [args] .br .B connect .I dev_name .BI node= nodeid [args] .br .B connect .I dev_name .BI node= nodeid [args] .br .BI "unfence " dev_name Example: device foo fence_foo ipaddr=1.1.1.1 login=x password=y .br connect foo node=1 port=1 .br connect foo node=2 port=2 .br connect foo node=3 port=3 .br unfence foo .SS Simple devices In some cases, a single fence device is used for all nodes, and it requires no node-specific args. This would typically be a "bridge" fence device in which an agent is passing a fence request to another subsystem to handle. (Note that a "node=nodeid" arg is always automatically included in agent args, so a node-specific nodeid is always present to minimally identify the victim.) In such a case, a simplified, single-line fence configuration is possible, with format: .B fence_all .I agent [args] Example: fence_all dlm_stonith A fence_all configuration is not compatible with a fence device configuration (above). Unfencing can optionally be applied with: .B fence_all .I agent [args] .br .B unfence_all .SH Lockspace configuration A lockspace definition begins with a .B lockspace line, followed by a number of .B master lines. A blank line separates lockspace definitions. Format: .B lockspace .I ls_name [ls_args] .br .B master .I " ls_name" .BI node= nodeid [node_args] .br .B master .I " ls_name" .BI node= nodeid [node_args] .br .B master .I " ls_name" .BI node= nodeid [node_args] .br .SS Disabling resource directory Lockspaces usually use a resource directory to keep track of which node is the master of each resource. The dlm can operate without the resource directory, though, by statically assigning the master of a resource using a hash of the resource name. To enable, set the per-lockspace .B nodir option to 1. Example: lockspace foo nodir=1 .SS Lock-server configuration The nodir setting can be combined with node weights to create a configuration where select node(s) are the master of all resources/locks. These master nodes can be viewed as "lock servers" for the other nodes. Example of nodeid 1 as master of all resources: lockspace foo nodir=1 .br master foo node=1 Example of nodeids 1 and 2 as masters of all resources: lockspace foo nodir=1 .br master foo node=1 .br master foo node=2 Lock management will be partitioned among the available masters. There can be any number of masters defined. The designated master nodes will master all resources/locks (according to the resource name hash). When no masters are members of the lockspace, then the nodes revert to the common fully-distributed configuration. Recovery is faster, with little disruption, when a non-master node joins/leaves. There is no special mode in the dlm for this lock server configuration; it's just a natural consequence of combining the "nodir" option with node weights. When a lockspace has master nodes defined, the masters have a default weight of 1 and all non-master nodes have a weight of 0. An explicit non-zero .B weight can also be assigned to master nodes, e.g. lockspace foo nodir=1 .br master foo node=1 weight=2 .br master foo node=2 weight=1 In this case node 1 will master 2/3 of the total resources and node 2 will master the other 1/3.
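The same proportional arithmetic should extend to more than two masters (an illustrative extrapolation, not an example taken from the dlm source): lockspace foo nodir=1 .br master foo node=1 weight=2 .br master foo node=2 weight=1 .br master foo node=3 weight=1 in which case node 1 would be expected to master roughly 1/2 of the resources and nodes 2 and 3 roughly 1/4 each, by the same resource name hash distribution described above.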
.SS Node configuration Node configurations can be set by the node keyword followed by key-value pairs. .B Keys: .B mark The mark key can be used to set a specific mark value which is then used during in-kernel DLM socket creation. This can be used to match DLM-specific packets, e.g. for routing. Example of setting a per-socket mark value of 42 for nodeid 1: node id=1 mark=42 For local nodes this value has no effect. .SH SEE ALSO .BR dlm_controld (8), .BR dlm_tool (8) dlm-4.3.0/dlm_controld/dlm_controld.8000066400000000000000000000045711461150666200175410ustar00rootroot00000000000000.TH DLM_CONTROLD 8 2012-04-05 dlm dlm .SH NAME dlm_controld \- dlm cluster control daemon .SH SYNOPSIS .B dlm_controld [OPTIONS] .SH DESCRIPTION The kernel dlm requires a user daemon to manage lockspace membership. dlm_controld manages lockspace membership using corosync cpg groups, and translates membership changes into dlm kernel recovery events. dlm_controld also manages posix locks for cluster file systems using the dlm. .SH OPTIONS Command line options override a corresponding setting in .BR dlm.conf (5). .br For default settings, see dlm_controld -h. .B --daemon_debug | -D enable debugging to stderr and don't fork .B --foreground don't fork .B --log_debug | -K enable kernel dlm debugging messages .B --protocol | -r .I str dlm kernel lowcomms protocol: tcp, sctp, detect .B --debug_logfile | -L write debugging to log file .B --enable_plock | -p 0|1 enable/disable posix lock support for cluster fs .B --plock_debug | -P enable plock debugging .B --plock_rate_limit | -l .I int limit rate of plock operations (0 for none) .B --plock_ownership | -o 0|1 enable/disable plock ownership .B --drop_resources_time | -t .I int plock ownership drop resources time (milliseconds) .B --drop_resources_count | -c .I int plock ownership drop resources count .B --drop_resources_age | -a .I int plock ownership drop resources age (milliseconds) .B --post_join_delay | -j .I int seconds to delay fencing after cluster join .B --enable_fencing | -f 0|1 enable/disable fencing .B --enable_concurrent_fencing 0|1 enable/disable concurrent fencing .B --enable_startup_fencing | -s 0|1 enable/disable startup fencing .B --enable_quorum_fencing | -q 0|1 enable/disable quorum requirement for fencing .B --enable_quorum_lockspace 0|1 enable/disable quorum requirement for lockspace operations .B --enable_helper 0|1 enable/disable helper process for running commands .B --repeat_failed_fencing 0|1 enable/disable retrying after fencing fails .B --fence_all .I str fence all nodes with this agent .B --unfence_all enable unfencing self with fence_all agent .B --help | -h print this help, then exit .B --version | -V print program version information, then exit .SH SEE ALSO .BR dlm_tool (8), .BR dlm.conf (5) dlm-4.3.0/dlm_controld/dlm_controld.h000066400000000000000000000036361461150666200176200ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __DLM_CONTROLD_DOT_H__ #define __DLM_CONTROLD_DOT_H__ /* This defines the interface between dlm_controld and libdlmcontrol, and should only be used by libdlmcontrol.
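Requests begin with a struct dlmc_header (below) identifying one of the DLMC_CMD_ values; presumably the query commands are served over DLMC_QUERY_SOCK_PATH and the rest over DLMC_SOCK_PATH, but see libdlmcontrol for the authoritative usage.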
*/ #define DLMC_SOCK_PATH "dlmc_sock" #define DLMC_QUERY_SOCK_PATH "dlmc_query_sock" #define DLMC_MAGIC 0xD13CD13C #define DLMC_VERSION 0x00010001 #define DLMC_CMD_DUMP_DEBUG 1 #define DLMC_CMD_DUMP_PLOCKS 2 #define DLMC_CMD_LOCKSPACE_INFO 3 #define DLMC_CMD_NODE_INFO 4 #define DLMC_CMD_LOCKSPACES 5 #define DLMC_CMD_LOCKSPACE_NODES 6 #define DLMC_CMD_FS_REGISTER 7 #define DLMC_CMD_FS_UNREGISTER 8 #define DLMC_CMD_FS_NOTIFIED 9 #define DLMC_CMD_DEADLOCK_CHECK 10 #define DLMC_CMD_DUMP_LOG_PLOCK 11 #define DLMC_CMD_FENCE_ACK 12 #define DLMC_CMD_DUMP_STATUS 13 #define DLMC_CMD_DUMP_CONFIG 14 #define DLMC_CMD_RUN_START 15 #define DLMC_CMD_RUN_CHECK 16 #define DLMC_CMD_DUMP_RUN 17 #define DLMC_CMD_RELOAD_CONFIG 18 #define DLMC_CMD_SET_CONFIG 19 struct dlmc_header { unsigned int magic; unsigned int version; unsigned int command; unsigned int option; unsigned int len; int data; /* embedded command-specific data, for convenience */ int flags; int unused2; char name[DLM_LOCKSPACE_LEN]; /* no space for a terminating null */ }; #define DLMC_STATE_MAXSTR 4096 #define DLMC_STATE_MAXBIN 4096 #define DLMC_STATE_DAEMON 1 #define DLMC_STATE_DAEMON_NODE 2 #define DLMC_STATE_STARTUP_NODE 3 #define DLMC_STATE_RUN 4 struct dlmc_state { uint32_t type; /* DLMC_STATE_ */ uint32_t flags; int32_t nodeid; uint32_t data32; uint64_t data64; uint32_t str_len; uint32_t bin_len; }; struct dlmc_run_check_state { uint32_t check_status; }; #endif dlm-4.3.0/dlm_controld/dlm_daemon.h000066400000000000000000000370511461150666200172370ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version.
*/ #ifndef __DLM_DAEMON_DOT_H__ #define __DLM_DAEMON_DOT_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libdlmcontrol.h" #include "dlm_controld.h" #include "fence_config.h" #include "node_config.h" #include "list.h" #include "rbtree.h" #include "linux_endian.h" #ifndef EXTERN #define EXTERN extern #else #undef EXTERN #define EXTERN #endif #define DAEMON_NAME "dlm_controld" /* TODO: get CONFDIR, LOGDIR, RUNDIR from build */ #define SYS_VARDIR "/var" #define SYS_RUNDIR SYS_VARDIR "/run" #define SYS_LOGDIR SYS_VARDIR "/log" #define RUNDIR SYS_RUNDIR "/dlm_controld" #define LOGDIR SYS_LOGDIR "/dlm_controld" #define CONFDIR "/etc/dlm" #define RUN_FILE_NAME "dlm_controld.pid" #define LOG_FILE_NAME "dlm_controld.log" #define CONF_FILE_NAME "dlm.conf" #define RUN_FILE_PATH RUNDIR "/" RUN_FILE_NAME #define LOG_FILE_PATH LOGDIR "/" LOG_FILE_NAME #define CONF_FILE_PATH CONFDIR "/" CONF_FILE_NAME #define DEFAULT_LOG_MODE LOG_MODE_OUTPUT_FILE | LOG_MODE_OUTPUT_SYSLOG #define DEFAULT_SYSLOG_FACILITY LOG_LOCAL4 #define DEFAULT_SYSLOG_PRIORITY LOG_INFO #define DEFAULT_LOGFILE_PRIORITY LOG_INFO #define DEFAULT_LOGFILE LOG_FILE_PATH #define DEFAULT_NETLINK_RCVBUF (2 * 1024 * 1024) enum { no_arg = 0, req_arg_bool = 1, req_arg_int = 2, req_arg_str = 3, req_arg_uint = 4, }; enum { daemon_debug_ind = 0, foreground_ind, log_debug_ind, protocol_ind, port_ind, debug_logfile_ind, mark_ind, enable_fscontrol_ind, enable_plock_ind, plock_debug_ind, plock_rate_limit_ind, plock_ownership_ind, drop_resources_time_ind, drop_resources_count_ind, drop_resources_age_ind, post_join_delay_ind, enable_fencing_ind, enable_concurrent_fencing_ind, enable_startup_fencing_ind, repeat_failed_fencing_ind, enable_quorum_fencing_ind, enable_quorum_lockspace_ind, enable_helper_ind, help_ind, version_ind, dlm_options_max, }; struct dlm_option { const char *name; char letter; int req_arg; char reload; char dynamic; const char *desc; int use_int; char *use_str; unsigned int use_uint; int default_int; const char *default_str; unsigned int default_uint; int cli_set; int cli_int; char *cli_str; unsigned int cli_uint; int file_set; int file_int; char *file_str; unsigned int file_uint; int dynamic_set; int dynamic_int; char *dynamic_str; unsigned int dynamic_uint; }; EXTERN struct dlm_option dlm_options[dlm_options_max]; #define opt(x) dlm_options[x].use_int #define opts(x) dlm_options[x].use_str #define optu(x) dlm_options[x].use_uint /* DLM_LOCKSPACE_LEN: maximum lockspace name length, from linux/dlmconstants.h. Copied in libdlm.h so apps don't need to include the kernel header. The libcpg limit is larger at CPG_MAX_NAME_LENGTH 128. Our cpg name includes a "dlm:" prefix before the lockspace name. */ /* Maximum members of a ls, should match CPG_MEMBERS_MAX in corosync/cpg.h. There are no max defines in dlm-kernel for lockspace members. */ #define MAX_NODES 128 /* Maximum number of IP addresses per node, when using SCTP and multi-ring in corosync In dlm-kernel this is DLM_MAX_ADDR_COUNT, currently 3. 
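(The userspace value of 4 defined below is presumably kept at or above the kernel's 3 so that configfs node setup never needs to truncate an address list.)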
*/ #define MAX_NODE_ADDRESSES 4 #define PROTO_TCP 0 #define PROTO_SCTP 1 #define PROTO_DETECT 2 EXTERN int daemon_quit; EXTERN int cluster_down; EXTERN int poll_lockspaces; EXTERN unsigned int retry_fencing; EXTERN int daemon_fence_allow; EXTERN int poll_fs; EXTERN int poll_ignore_plock; EXTERN int poll_drop_plock; EXTERN int plock_fd; EXTERN int plock_ci; EXTERN struct list_head lockspaces; EXTERN int cluster_quorate; EXTERN int cluster_two_node; EXTERN uint64_t fence_delay_begin; EXTERN uint64_t cluster_quorate_monotime; EXTERN uint64_t cluster_joined_monotime; EXTERN uint64_t cluster_joined_walltime; EXTERN uint64_t cluster_ringid_seq; EXTERN char cluster_name[DLM_LOCKSPACE_LEN+1]; EXTERN int our_nodeid; EXTERN uint32_t control_minor; EXTERN uint32_t monitor_minor; EXTERN uint32_t plock_minor; EXTERN struct fence_device fence_all_device; EXTERN struct list_head run_ops; #define LOG_DUMP_SIZE DLMC_DUMP_SIZE #define LOG_PLOCK 0x00010000 #define LOG_NONE 0x00001111 __attribute__ (( format( printf, 3, 4 ) )) void log_level(const char *name_in, uint32_t level_in, const char *fmt, ...); #define log_error(fmt, args...) log_level(NULL, LOG_ERR, fmt, ##args) #define log_debug(fmt, args...) log_level(NULL, LOG_DEBUG, fmt, ##args) #define log_erros(ls, fmt, args...) log_level((ls)->name, LOG_ERR, fmt, ##args) #define log_group(ls, fmt, args...) log_level((ls)->name, LOG_DEBUG, fmt, ##args) #define log_plock(ls, fmt, args...) log_level((ls)->name, LOG_PLOCK|LOG_NONE, fmt, ##args) #define log_dlock(ls, fmt, args...) log_level((ls)->name, LOG_PLOCK|LOG_DEBUG, fmt, ##args) #define log_elock(ls, fmt, args...) log_level((ls)->name, LOG_PLOCK|LOG_ERR, fmt, ##args) /* dlm_header types */ enum { DLM_MSG_PROTOCOL = 1, DLM_MSG_START, DLM_MSG_PLOCK, DLM_MSG_PLOCK_OWN, DLM_MSG_PLOCK_DROP, DLM_MSG_PLOCK_SYNC_LOCK, DLM_MSG_PLOCK_SYNC_WAITER, DLM_MSG_PLOCKS_DONE, DLM_MSG_PLOCKS_DATA, DLM_MSG_DEADLK_CYCLE_START, DLM_MSG_DEADLK_CYCLE_END, DLM_MSG_DEADLK_CHECKPOINT_READY, DLM_MSG_DEADLK_CANCEL_LOCK, DLM_MSG_FENCE_RESULT, DLM_MSG_FENCE_CLEAR, DLM_MSG_RUN_REQUEST, DLM_MSG_RUN_REPLY, DLM_MSG_RUN_CANCEL, }; /* dlm_header flags */ #define DLM_MFLG_JOINING 1 /* accompanies start, we are joining */ #define DLM_MFLG_HAVEPLOCK 2 /* accompanies start, we have plock state */ #define DLM_MFLG_NACK 4 /* accompanies start, prevent wrong match when two outstanding changes are the same */ struct dlm_header { uint16_t version[3]; uint16_t type; /* DLM_MSG_ */ uint32_t nodeid; /* sender */ uint32_t to_nodeid; /* recipient, 0 for all */ uint32_t global_id; /* global unique id for this lockspace */ uint32_t flags; /* DLM_MFLG_ */ uint32_t msgdata; /* in-header payload depends on MSG type; lkid for deadlock, seq for lockspace membership */ uint32_t msgdata2; /* second MSG-specific data */ uint64_t pad; }; struct lockspace { struct list_head list; char name[DLM_LOCKSPACE_LEN+1]; uint32_t global_id; /* dlm.conf config */ int nodir; int master_count; int master_nodeid[MAX_NODES]; int master_weight[MAX_NODES]; /* lockspace membership stuff */ cpg_handle_t cpg_handle; struct cpg_ring_id cpg_ringid; int cpg_ringid_wait; int cpg_client; int cpg_fd; int joining; int leaving; int kernel_stopped; int fs_registered; int wait_debug; /* for status/debugging */ uint32_t wait_retry; /* for debug rate limiting */ uint32_t change_seq; uint32_t started_count; struct change *started_change; struct list_head changes; struct list_head node_history; /* plock stuff */ int plock_data_node; int need_plocks; int save_plocks; int disable_plock; uint32_t 
recv_plocks_data_count; struct list_head saved_messages; struct list_head plock_resources; struct rb_root plock_resources_root; time_t last_plock_time; struct timeval drop_resources_last; #if 0 /* deadlock stuff */ int deadlk_low_nodeid; struct list_head deadlk_nodes; uint64_t deadlk_ckpt_handle; int deadlk_confchg_init; struct list_head transactions; struct list_head resources; struct timeval cycle_start_time; struct timeval cycle_end_time; struct timeval last_send_cycle_start; int cycle_running; int all_checkpoints_ready; #endif }; /* run uuid len doesn't use the full lockspace len */ #define RUN_UUID_LEN DLM_LOCKSPACE_LEN #define RUN_COMMAND_LEN DLMC_RUN_COMMAND_LEN /* 1024 */ #define MAX_AV_COUNT 32 #define ONE_ARG_LEN 256 struct run_info { int dest_nodeid; int start_nodeid; int local_pid; int local_result; int need_replies; int reply_count; int fail_count; uint32_t flags; }; struct node_run_result { int nodeid; int result; int replied; }; struct run { struct list_head list; struct run_info info; char uuid[RUN_UUID_LEN]; char command[RUN_COMMAND_LEN]; struct node_run_result node_results[MAX_NODES]; int node_count; }; struct run_request { struct dlm_header header; struct run_info info; char uuid[RUN_UUID_LEN]; char command[RUN_COMMAND_LEN]; }; struct run_reply { struct dlm_header header; struct run_info info; char uuid[RUN_UUID_LEN]; }; /* action.c */ int set_sysfs_control(char *name, int val); int set_sysfs_event_done(char *name, int val); int set_sysfs_id(char *name, uint32_t id); int set_sysfs_nodir(char *name, int val); int set_configfs_members(struct lockspace *ls, char *name, int new_count, int *new_members, int renew_count, int *renew_members); int add_configfs_node(int nodeid, char *addr, int addrlen, int local, uint32_t mark); void del_configfs_node(int nodeid); void clear_configfs(void); int setup_configfs_options(void); int setup_configfs_members(void); int check_uncontrolled_lockspaces(void); int setup_misc_devices(void); int path_exists(const char *path); int set_configfs_opt(const char *name, char *str, int num); /* config.c */ void set_opt_file(int update); int get_weight(struct lockspace *ls, int nodeid); void setup_lockspace_config(struct lockspace *ls); void set_opt_online(char *cmd_str, int cmd_len); /* cpg.c */ void process_lockspace_changes(void); void process_fencing_changes(void); int dlm_join_lockspace(struct lockspace *ls); int dlm_leave_lockspace(struct lockspace *ls); void update_flow_control_status(void); int set_node_info(struct lockspace *ls, int nodeid, struct dlmc_node *node); int set_lockspace_info(struct lockspace *ls, struct dlmc_lockspace *lockspace); int set_lockspaces(int *count, struct dlmc_lockspace **lss_out); int set_lockspace_nodes(struct lockspace *ls, int option, int *node_count, struct dlmc_node **nodes_out); int set_fs_notified(struct lockspace *ls, int nodeid); void cpg_stop_kernel(struct lockspace *ls); /* daemon_cpg.c */ void init_daemon(void); void fence_ack_node(int nodeid); void add_startup_node(int nodeid); const char *reason_str(int reason); const char *msg_name(int type); void dlm_send_message(struct lockspace *ls, char *buf, int len); int dlm_send_message_daemon(char *buf, int len); void dlm_header_in(struct dlm_header *hd); int dlm_header_validate(struct dlm_header *hd, int nodeid); int fence_node_time(int nodeid, uint64_t *last_fenced); int fence_in_progress(int *in_progress); int setup_cpg_daemon(void); void close_cpg_daemon(void); void process_cpg_daemon(int ci); void set_protocol_stateful(void); int set_protocol(void); void 
send_state_daemon_nodes(int fd); void send_state_daemon(int fd); void send_state_startup_nodes(int fd); int receive_run_reply(struct dlm_header *hd, int len); int receive_run_request(struct dlm_header *hd, int len); int send_run_request(struct run *run, struct run_request *req); int send_run_reply(struct run *run, struct run_reply *rep); void log_config(const struct cpg_name *group_name, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries); void log_ringid(const char *name, struct cpg_ring_id *ringid, const uint32_t *member_list, size_t member_list_entries); /* deadlock.c */ void setup_deadlock(void); void send_cycle_start(struct lockspace *ls); void receive_checkpoint_ready(struct lockspace *ls, struct dlm_header *hd, int len); void receive_cycle_start(struct lockspace *ls, struct dlm_header *hd, int len); void receive_cycle_end(struct lockspace *ls, struct dlm_header *hd, int len); void receive_cancel_lock(struct lockspace *ls, struct dlm_header *hd, int len); void deadlk_confchg(struct lockspace *ls, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries); /* main.c */ int do_read(int fd, void *buf, size_t count); int do_write(int fd, void *buf, size_t count); uint64_t monotime(void); void client_dead(int ci); int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci)); int client_fd(int ci); void client_ignore(int ci, int fd); void client_back(int ci, int fd); struct lockspace *find_ls(const char *name); struct lockspace *find_ls_id(uint32_t id); const char *dlm_mode_str(int mode); void cluster_dead(int ci); struct dlm_option *get_dlm_option(char *name); int get_ind_name(char *s); struct run *find_run(char *uuid_str); void clear_run(struct run *run); void send_helper_run_request(struct run_request *req); /* member.c */ int setup_cluster(void); void close_cluster(void); void process_cluster(int ci); void update_cluster(void); uint64_t cluster_add_time(int nodeid); int is_cluster_member(uint32_t nodeid); int setup_cluster_cfg(void); void close_cluster_cfg(void); void process_cluster_cfg(int ci); void kick_node_from_cluster(int nodeid); int setup_node_config(void); /* fence.c */ int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, struct fence_config *fc, int reason, int *pid_out); int fence_result(int nodeid, int pid, int *result); int unfence_node(int nodeid); /* netlink.c */ int setup_netlink(void); void process_netlink(int ci); /* plock.c */ int setup_plocks(void); void close_plocks(void); void process_plocks(int ci); void drop_resources_all(void); int limit_plocks(void); void receive_plock(struct lockspace *ls, struct dlm_header *hd, int len); void receive_own(struct lockspace *ls, struct dlm_header *hd, int len); void receive_sync(struct lockspace *ls, struct dlm_header *hd, int len); void receive_drop(struct lockspace *ls, struct dlm_header *hd, int len); void process_saved_plocks(struct lockspace *ls); void purge_plocks(struct lockspace *ls, int nodeid, int unmount); int copy_plock_state(struct lockspace *ls, char *buf, int *len_out); void send_all_plocks_data(struct lockspace *ls, uint32_t seq, uint32_t *plocks_data); void receive_plocks_data(struct lockspace *ls, struct dlm_header *hd, int len); void clear_plocks_data(struct lockspace *ls); /* logging.c */ void 
init_logging(void); void close_logging(void); void copy_log_dump(char *buf, int *len); void copy_log_dump_plock(char *buf, int *len); void set_logfile_priority(void); /* crc.c */ uint32_t cpgname_to_crc(const char *data, int len); /* helper.c */ int run_helper(int in_fd, int out_fd, int log_stderr); #endif dlm-4.3.0/dlm_controld/fence.c000066400000000000000000000124041461150666200162060ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include "dlm_daemon.h" static int run_agent(char *agent, char *args, int *pid_out) { int pid, len; int pw_fd = -1; /* parent write file descriptor */ int cr_fd = -1; /* child read file descriptor */ int pfd[2]; len = strlen(args); if (pipe(pfd)) return -errno; cr_fd = pfd[0]; pw_fd = pfd[1]; pid = fork(); if (pid < 0) { close(cr_fd); close(pw_fd); return -errno; } if (pid) { /* parent */ int ret; do { ret = write(pw_fd, args, len); } while (ret < 0 && errno == EINTR); if (ret != len) goto fail; close(cr_fd); close(pw_fd); *pid_out = pid; return 0; } else { /* child */ int c_stdout, c_stderr; /* redirect agent stdout/stderr to /dev/null */ close(1); c_stdout = open("/dev/null", O_WRONLY); if (c_stdout < 0) _exit(EXIT_FAILURE); /* child must exit, not return into the daemon */ close(2); c_stderr = open("/dev/null", O_WRONLY); if (c_stderr < 0) _exit(EXIT_FAILURE); /* redirect agent stdin from parent */ close(0); if (dup(cr_fd) < 0) _exit(EXIT_FAILURE); close(cr_fd); close(pw_fd); execlp(agent, agent, NULL); exit(EXIT_FAILURE); } fail: close(cr_fd); close(pw_fd); return -1; } int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, struct fence_config *fc, int reason, int *pid_out) { struct fence_device *dev; char args[FENCE_CONFIG_ARGS_MAX]; char extra[FENCE_CONFIG_NAME_MAX]; int rv, pid = -1; memset(args, 0, sizeof(args)); memset(extra, 0, sizeof(extra)); snprintf(extra, sizeof(extra)-1, "fail_time=%llu\n", (unsigned long long)fail_walltime); dev = fc->dev[fc->pos]; if (!dev) { log_error("fence request %d no config pos %d", nodeid, fc->pos); return -1; } rv = fence_config_agent_args(fc, extra, args); if (rv < 0) { log_error("fence request %d config args error %d", nodeid, rv); return rv; } rv = run_agent(dev->agent, args, &pid); if (rv < 0) { log_error("fence request %d pid %d %s time %llu %s %s run error %d", nodeid, pid, reason_str(reason), (unsigned long long)fail_walltime, dev->name, dev->agent, rv); return rv; } log_error("fence request %d pid %d %s time %llu %s %s", nodeid, pid, reason_str(reason), (unsigned long long)fail_walltime, dev->name, dev->agent); *pid_out = pid; return 0; } /* * if pid has exited, return 0 * result is 0 for success, non-zero for fail * success if pid exited with exit status 0 * fail if pid exited with non-zero exit status, or was terminated by signal * * if pid is running, return -EAGAIN * * other error, return -EXXX */ int fence_result(int nodeid, int pid, int *result) { int status, rv; rv = waitpid(pid, &status, WNOHANG); if (rv < 0) { /* shouldn't happen */ log_error("fence result %d pid %d waitpid %d errno %d", nodeid, pid, rv, errno); return rv; } if (!rv) { /* pid still running, has not changed state */ return -EAGAIN; } if (rv == pid) { /* pid state has changed */ if (WIFEXITED(status)) { /* pid exited with an exit code */ *result = WEXITSTATUS(status); log_error("fence result %d pid %d result %d exit status", nodeid, pid, *result);
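/* an exit status of 0 means the agent reported success; the result is logged with log_error, apparently so fence outcomes always appear in the default log */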
return 0; } if (WIFSIGNALED(status)) { /* pid terminated due to a signal */ *result = -1; log_error("fence result %d pid %d result %d term signal %d", nodeid, pid, *result, WTERMSIG(status)); return 0; } /* pid state changed but still running */ return -EAGAIN; } /* shouldn't happen */ log_error("fence result %d pid %d waitpid rv %d", nodeid, pid, rv); return -1; } int unfence_node(int nodeid) { struct fence_config config; struct fence_device *dev; char args[FENCE_CONFIG_ARGS_MAX]; char action[FENCE_CONFIG_NAME_MAX]; int rv, i, pid, status; int error = 0; memset(&config, 0, sizeof(config)); rv = fence_config_init(&config, nodeid, (char *)CONF_FILE_PATH); if (rv == -ENOENT) { /* file doesn't exist or doesn't contain config for nodeid */ return 0; } if (rv < 0) { /* there's a problem with the config */ log_error("unfence %d fence_config_init error %d", nodeid, rv); return rv; } memset(action, 0, sizeof(action)); snprintf(action, FENCE_CONFIG_NAME_MAX-1, "action=on\n"); for (i = 0; i < FENCE_CONFIG_DEVS_MAX; i++) { dev = config.dev[i]; if (!dev) break; if (!dev->unfence) continue; config.pos = i; memset(args, 0, sizeof(args)); rv = fence_config_agent_args(&config, action, args); if (rv < 0) { log_error("unfence %d config args error %d", nodeid, rv); error = -1; break; } rv = run_agent(dev->agent, args, &pid); if (rv < 0) { log_error("unfence %d %s %s run error %d", nodeid, dev->name, dev->agent, rv); error = -1; break; } log_error("unfence %d pid %d %s %s", nodeid, pid, dev->name, dev->agent); rv = waitpid(pid, &status, 0); if (rv < 0) { log_error("unfence %d pid %d waitpid errno %d", nodeid, pid, errno); error = -1; break; } if (!WIFEXITED(status) || WEXITSTATUS(status)) { log_error("unfence %d pid %d error status %d", nodeid, pid, status); error = -1; break; } } fence_config_free(&config); return error; } dlm-4.3.0/dlm_controld/fence_config.c000066400000000000000000000216541461150666200175420ustar00rootroot00000000000000/* * Copyright 2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include "fence_config.h" #if 0 Empty new line separates the config for each fence device. - fence_all fence_foo key=val ... Special fence config format that applies to all nodes and allows no per node config parameters. Multiple fence devices (parallel or priority) cannot be used with fence_all. fence_all fence_foo ... unfence_all Apply unfencing to all nodes. - device connect node= General fence config format, allowing per node config parameters. device connect node= unfence Apply unfencing to all nodes connected to this device. - device foo fence_foo ipaddr=1.1.1.1 login=x password=y connect foo node=1 port=1 connect foo node=2 port=2 connect foo node=3 port=3 Simple example of nodes connected to switch ports. If fencing with the device fails, the next device listed for the node, if any, will be tried. - device foo:1 fence_foo ipaddr=1.1.1.1 login=x password=y connect foo:1 node=1 port=1 connect foo:1 node=2 port=2 connect foo:1 node=3 port=3 device foo:2 fence_foo ipaddr=2.2.2.2 login=x password=y connect foo:2 node=1 port=1 connect foo:2 node=2 port=2 connect foo:2 node=3 port=3 Associate two parallel path/power devices that must both succeed for fencing to succeed. Devices have same base name with :1 :2 suffix. 
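(How these examples are consumed, per the iterators later in this file: fence_config_next_parallel() treats consecutive devices sharing a base name, such as foo:1 and foo:2, as one parallel group that must all succeed, while fence_config_next_priority() skips ahead to the next device with a different base name.)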
- device foo fence_foo ipaddr=1.1.1.1 login=x password=y connect foo node=1 port=1 connect foo node=2 port=2 connect foo node=3 port=3 unfence foo Add unfence line to indicate nodes connected to the device should be unfenced. #endif #define MAX_LINE (FENCE_CONFIG_ARGS_MAX + (3 * FENCE_CONFIG_NAME_MAX)) static unsigned int con_args_nodeid(char *args) { char *k; unsigned int v; int rv; k = strstr(args, "node="); if (!k) return 0; /* avoid passing NULL to sscanf when no node= arg exists */ rv = sscanf(k, "node=%u", &v); if (rv != 1) return 0; return v; } static int read_config_section(unsigned int nodeid, FILE *file, char *dev_line, struct fence_device **dev_out, struct fence_connect **con_out) { struct fence_device *dev = NULL; struct fence_connect *con; char line[MAX_LINE]; char unused[FENCE_CONFIG_NAME_MAX]; char agent[FENCE_CONFIG_NAME_MAX]; char dev_name[FENCE_CONFIG_NAME_MAX]; char con_name[FENCE_CONFIG_NAME_MAX]; char dev_args[FENCE_CONFIG_ARGS_MAX]; char con_args[FENCE_CONFIG_ARGS_MAX]; int rv, unfence = 0; if (strlen(dev_line) > MAX_LINE) return -1; memset(dev_name, 0, sizeof(dev_name)); memset(agent, 0, sizeof(agent)); memset(dev_args, 0, sizeof(dev_args)); rv = sscanf(dev_line, "%s %s %s %[^\n]s\n", unused, dev_name, agent, dev_args); if (rv < 3) return -1; while (fgets(line, MAX_LINE, file)) { if (line[0] == '\n') break; if (line[0] == ' ') break; if (line[0] == '#') continue; if (!strncmp(line, "unfence", strlen("unfence"))) { if (!strstr(line, dev_name)) return -EINVAL; unfence = 1; continue; } /* invalid config */ if (strncmp(line, "connect", strlen("connect"))) return -EINVAL; /* once we've found the connect line we want, continue scanning lines until end of section so we pick up an unfence line at the end */ if (dev) continue; memset(con_name, 0, sizeof(con_name)); memset(con_args, 0, sizeof(con_args)); sscanf(line, "%s %s %[^\n]s", unused, con_name, con_args); /* invalid config */ if (strncmp(dev_name, con_name, FENCE_CONFIG_NAME_MAX)) return -EINVAL; /* skip connection for another node */ if (con_args_nodeid(con_args) != nodeid) continue; dev = malloc(sizeof(struct fence_device)); if (!dev) return -ENOMEM; con = malloc(sizeof(struct fence_connect)); if (!con) { free(dev); return -ENOMEM; } memset(dev, 0, sizeof(struct fence_device)); memset(con, 0, sizeof(struct fence_connect)); strncpy(dev->name, dev_name, FENCE_CONFIG_NAME_MAX); dev->name[FENCE_CONFIG_NAME_MAX - 1] = '\0'; strncpy(dev->agent, agent, FENCE_CONFIG_NAME_MAX); dev->agent[FENCE_CONFIG_NAME_MAX - 1] = '\0'; strncpy(dev->args, dev_args, FENCE_CONFIG_ARGS_MAX); dev->args[FENCE_CONFIG_ARGS_MAX - 1] = '\0'; strncpy(con->name, con_name, FENCE_CONFIG_NAME_MAX); con->name[FENCE_CONFIG_NAME_MAX - 1] = '\0'; strncpy(con->args, con_args, FENCE_CONFIG_ARGS_MAX); con->args[FENCE_CONFIG_ARGS_MAX - 1] = '\0'; dev->unfence = unfence; *dev_out = dev; *con_out = con; } if (dev && unfence) dev->unfence = 1; if (dev) return 0; else return -ENOENT; } void fence_config_free(struct fence_config *fc) { struct fence_device *dev; struct fence_connect *con; int i; for (i = 0; i < FENCE_CONFIG_DEVS_MAX; i++) { dev = fc->dev[i]; con = fc->con[i]; if (dev) free(dev); if (con) free(con); } memset(fc, 0, sizeof(struct fence_config)); } int fence_config_init(struct fence_config *fc, unsigned int nodeid, char *path) { char line[MAX_LINE]; struct fence_device *dev; struct fence_connect *con; FILE *file; int pos = 0; int rv; fc->nodeid = nodeid; file = fopen(path, "r"); if (!file) return -ENOENT; while (fgets(line, MAX_LINE, file)) { if (line[0] == '#') continue; if (line[0] == '\n') continue; if (!strncmp(line,
"fence_all", strlen("fence_all"))) { /* fence_all cannot be used with other fence devices */ if (pos) { rv = -EINVAL; goto out; } dev = malloc(sizeof(struct fence_device)); if (!dev) { rv = -ENOMEM; goto out; } memset(dev, 0, sizeof(struct fence_device)); rv = sscanf(line, "%s %s %[^\n]s\n", dev->name, dev->agent, dev->args); if (rv < 2) { rv = -EINVAL; goto out; } if (fgets(line, MAX_LINE, file) && !strncmp(line, "unfence_all", strlen("unfence_all"))) dev->unfence = 1; fc->dev[0] = dev; fc->pos = 0; rv = 0; goto out; } if (strncmp(line, "device", strlen("device"))) continue; dev = NULL; con = NULL; /* read connect and unfence lines following a device line */ rv = read_config_section(nodeid, file, line, &dev, &con); /* nodeid not listed in this section */ if (rv == -ENOENT) continue; /* an error parsing the section, may be config to free */ if (rv < 0) { if (dev) free(dev); if (con) free(con); goto out; } fc->dev[pos] = dev; fc->con[pos] = con; pos++; } if (!pos) rv = -ENOENT; else rv = 0; out: fclose(file); return rv; } static int same_base_name(struct fence_device *a, struct fence_device *b) { int len, i; len = strlen(a->name); if (len > strlen(b->name)) len = strlen(b->name); for (i = 0; i < len; i++) { if (a->name[i] == ':' && b->name[i] == ':') return 1; if (a->name[i] == b->name[i]) continue; return 0; } return 0; } /* * if next dev is in parallel with last one, * set d,c return 0, else -1 * * two consecutive devs with same basename are parallel */ int fence_config_next_parallel(struct fence_config *fc) { struct fence_device *prev, *next; int d = fc->pos; if (d >= FENCE_CONFIG_DEVS_MAX) return -1; prev = fc->dev[d]; next = fc->dev[d+1]; if (!next) return -1; if (same_base_name(prev, next)) { fc->pos = d+1; return 0; } return -1; } /* * if there's a dev with the next priority, * set d,c return 0, else -1 * * look for another dev with a non-matching basename */ int fence_config_next_priority(struct fence_config *fc) { struct fence_device *prev, *next; int d = fc->pos; int i; if (d >= FENCE_CONFIG_DEVS_MAX) return -1; prev = fc->dev[d]; for (i = d+1; i < FENCE_CONFIG_DEVS_MAX; i++) { next = fc->dev[i]; if (!next) return -1; if (same_base_name(prev, next)) continue; fc->pos = d+1; return 0; } return -1; } int fence_config_agent_args(struct fence_config *fc, char *extra, char *args) { struct fence_device *dev; struct fence_connect *con; char node[FENCE_CONFIG_NAME_MAX]; char *p; int n = 0; int i, len; dev = fc->dev[fc->pos]; con = fc->con[fc->pos]; memset(node, 0, sizeof(node)); snprintf(node, FENCE_CONFIG_NAME_MAX-1, "node=%u\n", fc->nodeid); len = strlen(node); if (dev) len += strlen(dev->args) + 1; /* +1 for \n */ if (con) len += strlen(con->args) + 1; if (extra) len += strlen(extra) + 1; if (len > FENCE_CONFIG_ARGS_MAX - 1) return -1; if (dev && dev->args[0]) { p = dev->args; for (i = 0; i < strlen(dev->args); i++) { if (*p == ' ') args[n++] = '\n'; else args[n++] = *p; p++; } args[n++] = '\n'; } if (con && con->args[0]) { p = con->args; for (i = 0; i < strlen(con->args); i++) { if (*p == ' ') args[n++] = '\n'; else args[n++] = *p; p++; } args[n++] = '\n'; } if (!strstr(args, "node=")) strcat(args, node); if (extra) strcat(args, extra); return 0; } dlm-4.3.0/dlm_controld/fence_config.h000066400000000000000000000036431461150666200175450ustar00rootroot00000000000000/* * Copyright 2012 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef _FENCE_CONFIG_H_ #define _FENCE_CONFIG_H_ #define FENCE_CONFIG_DEVS_MAX 4 /* max devs per node */ #define FENCE_CONFIG_NAME_MAX 256 /* including terminating \0 */ #define FENCE_CONFIG_ARGS_MAX 4096 /* including terminating \0 */ struct fence_device { char name[FENCE_CONFIG_NAME_MAX]; char agent[FENCE_CONFIG_NAME_MAX]; char args[FENCE_CONFIG_ARGS_MAX]; int unfence; }; struct fence_connect { char name[FENCE_CONFIG_NAME_MAX]; char args[FENCE_CONFIG_ARGS_MAX]; }; /* describes fence config for one node */ struct fence_config { struct fence_device *dev[FENCE_CONFIG_DEVS_MAX]; struct fence_connect *con[FENCE_CONFIG_DEVS_MAX]; unsigned int nodeid; int pos; }; /* * Returns -ENOENT if path does not exist or there is no * config for nodeid in the file. * * Returns -EXYZ if there's a problem with the config. * * Returns 0 if a config was found with no problems. */ int fence_config_init(struct fence_config *fc, unsigned int nodeid, char *path); void fence_config_free(struct fence_config *fc); /* * Iterate through fence_config, sets pos to indicate next to try. * Based on two rules: * * - next_parallel is the next device with the same base name * as the current device (base name is name preceding ":") * * - next_priority is the next device without the same base name * as the current device */ int fence_config_next_parallel(struct fence_config *fc); int fence_config_next_priority(struct fence_config *fc); /* * Combine dev->args and con->args, replacing ' ' with '\n'. * Also add "node=nodeid" if "node=" does not already exist. */ int fence_config_agent_args(struct fence_config *fc, char *extra, char *args); #endif dlm-4.3.0/dlm_controld/helper.c000066400000000000000000000224011461150666200164030ustar00rootroot00000000000000/* * Copyright 2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "dlm_daemon.h" static int _log_stderr; #define log_helper(fmt, args...) \ do { \ if (_log_stderr) \ fprintf(stderr, "%llu " fmt "\n", (unsigned long long)monotime(), ##args); \ } while (0) /* * Restrict the commands that can be run. */ #define CMD_ID_LVCHANGE_REFRESH 1 #define CMD_ID_LVS 2 static int _get_cmd_id(char **av, int av_count) { if ((av_count >= 3) && !strcmp(av[0], "lvm") && !strcmp(av[1], "lvchange") && !strcmp(av[2], "--refresh")) { return CMD_ID_LVCHANGE_REFRESH; } if ((av_count >= 2) && !strcmp(av[0], "lvm") && !strcmp(av[1], "lvs")) { return CMD_ID_LVS; } return 0; } /* * Keep track of running pids mainly because when the process * exits we get the pid, and need to look up the uuid from the * pid to return the uuid/pid/result back to the main daemon. 
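 *
 * (The waitid() loop in run_helper() only gets back a pid when a
 * child exits, so this table is the one place that maps that pid
 * back to the uuid of the run request that forked it.)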
*/ #define MAX_RUNNING 32 struct running { char uuid[RUN_UUID_LEN]; int pid; int cmd_id; }; static struct running running_cmds[MAX_RUNNING]; static int running_count; static int _save_running_cmd(char *uuid, int pid, int cmd_id) { int i; for (i = 0; i < MAX_RUNNING; i++) { if (!running_cmds[i].pid) { running_cmds[i].pid = pid; running_cmds[i].cmd_id = cmd_id; memcpy(running_cmds[i].uuid, uuid, RUN_UUID_LEN); running_count++; return 0; } } log_helper("too many running commands"); return -1; } static struct running *_get_running_cmd(int pid) { int i; for (i = 0; i < MAX_RUNNING; i++) { if (running_cmds[i].pid == pid) return &running_cmds[i]; } return NULL; } static struct running *_get_running_uuid(char *uuid) { int i; for (i = 0; i < MAX_RUNNING; i++) { if (!strcmp(running_cmds[i].uuid, uuid)) return &running_cmds[i]; } return NULL; } static void _clear_running_cmd(struct running *running) { running_count--; running->pid = 0; running->cmd_id = 0; memset(running->uuid, 0, RUN_UUID_LEN); } /* runs in child process that was forked by helper */ static void exec_command(char *cmd_str, int out_fd) { char cmd_buf[16]; char arg[ONE_ARG_LEN]; char *av[MAX_AV_COUNT + 1]; /* +1 for NULL */ int av_count = 0; int i, rv, arg_len, cmd_len, cmd_id; for (i = 0; i < MAX_AV_COUNT + 1; i++) av[i] = NULL; if (!cmd_str[0]) return; /* this should already be done, but make sure */ cmd_str[RUN_COMMAND_LEN - 1] = '\0'; memset(&arg, 0, sizeof(arg)); arg_len = 0; cmd_len = strlen(cmd_str); for (i = 0; i < cmd_len; i++) { if (!cmd_str[i]) break; if (av_count == MAX_AV_COUNT) break; if (cmd_str[i] == '\\') { if (i == (cmd_len - 1)) break; i++; if (cmd_str[i] == '\\') { arg[arg_len++] = cmd_str[i]; continue; } if (isspace(cmd_str[i])) { arg[arg_len++] = cmd_str[i]; continue; } else { break; } } if (isalnum(cmd_str[i]) || ispunct(cmd_str[i])) { arg[arg_len++] = cmd_str[i]; } else if (isspace(cmd_str[i])) { if (arg_len) av[av_count++] = strdup(arg); memset(arg, 0, sizeof(arg)); arg_len = 0; } else { break; } } if ((av_count < MAX_AV_COUNT) && arg_len) { av[av_count++] = strdup(arg); } /* for (i = 0; i < MAX_AV_COUNT + 1; i++) { if (!av[i]) break; log_helper("command av[%d] \"%s\"", i, av[i]); } */ cmd_id = _get_cmd_id(av, av_count); /* tell the parent the command we have identified to run */ memset(cmd_buf, 0, sizeof(cmd_buf)); snprintf(cmd_buf, sizeof(cmd_buf), "cmd_id %d", cmd_id); rv = write(out_fd, cmd_buf, sizeof(cmd_buf)); if (rv < 0) log_helper("write cmd_buf from child error %d", rv); close(out_fd); /* if we return before exec, the child does exit(1) (failure) */ if (!cmd_id) return; execvp(av[0], av); } static int read_request(int fd, struct run_request *req) { int rv; retry: rv = read(fd, req, sizeof(struct run_request)); if (rv == -1 && errno == EINTR) goto retry; if (rv != sizeof(struct run_request)) return -1; return 0; } static int send_status(int fd) { struct run_reply rep; int rv; memset(&rep, 0, sizeof(rep)); rv = write(fd, &rep, sizeof(rep)); if (rv == sizeof(rep)) return 0; return -1; } static int send_result(struct running *running, int fd, int pid, int result) { struct run_reply rep; int rv; memset(&rep, 0, sizeof(rep)); rep.header.type = DLM_MSG_RUN_REPLY; memcpy(rep.uuid, running->uuid, RUN_UUID_LEN); rep.info.local_pid = pid; rep.info.local_result = result; rv = write(fd, &rep, sizeof(rep)); if (rv == sizeof(rep)) return 0; return -1; } #define HELPER_STATUS_INTERVAL 30 #define STANDARD_TIMEOUT_MS (HELPER_STATUS_INTERVAL*1000) #define RECOVERY_TIMEOUT_MS 1000 /* run by the child helper process 
forked by dlm_controld in setup_helper */ int run_helper(int in_fd, int out_fd, int log_stderr) { struct pollfd pollfd; struct run_request req; struct running *running; struct dlm_header *hd = (struct dlm_header *)&req; char cmd_buf[16]; siginfo_t info; unsigned int fork_count = 0; unsigned int done_count = 0; time_t now, last_send, last_good = 0; int timeout = STANDARD_TIMEOUT_MS; int rv, pid, cmd_id; _log_stderr = log_stderr; rv = setgroups(0, NULL); if (rv < 0) log_helper("error clearing helper groups errno %i", errno); memset(&pollfd, 0, sizeof(pollfd)); pollfd.fd = in_fd; pollfd.events = POLLIN; now = monotime(); last_send = now; rv = send_status(out_fd); if (!rv) last_good = now; openlog("dlm_controld", LOG_CONS | LOG_PID, LOG_LOCAL4); while (1) { rv = poll(&pollfd, 1, timeout); if (rv == -1 && errno == EINTR) continue; if (rv < 0) exit(0); now = monotime(); if (now - last_good >= HELPER_STATUS_INTERVAL && now - last_send >= 2) { last_send = now; rv = send_status(out_fd); if (!rv) last_good = now; } memset(&req, 0, sizeof(req)); if (pollfd.revents & POLLIN) { rv = read_request(in_fd, &req); if (rv) continue; if (hd->type == DLM_MSG_RUN_REQUEST) { int cmd_pipe[2]; if (running_count >= MAX_RUNNING) { log_helper("too many running commands"); exit(1); } /* * Child writes cmd_buf to cmd_pipe, parent reads * cmd_buf from cmd_pipe. cmd_buf contains the * string "cmd_id " where is CMD_ID_NUM * identifying the command being run by the child. */ if (pipe(cmd_pipe)) exit(1); pid = fork(); if (!pid) { close(cmd_pipe[0]); exec_command(req.command, cmd_pipe[1]); exit(1); } close(cmd_pipe[1]); memset(cmd_buf, 0, sizeof(cmd_buf)); cmd_id = 0; rv = read(cmd_pipe[0], cmd_buf, sizeof(cmd_buf)); if (rv < 0) log_helper("helper read child cmd_id error %d", rv); close(cmd_pipe[0]); sscanf(cmd_buf, "cmd_id %d", &cmd_id); _save_running_cmd(req.uuid, pid, cmd_id); fork_count++; log_helper("helper run %s pid %d cmd_id %d running %d fork_count %d done_count %d %s", req.uuid, pid, cmd_id, running_count, fork_count, done_count, req.command); } else if (hd->type == DLM_MSG_RUN_CANCEL) { /* TODO: should we also send kill to the pid? 
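 *
 * As written, cancel only removes the bookkeeping entry; a child
 * that is still running is left alone, and when it later exits the
 * waitid() loop below finds no matching entry and just logs the
 * result as not found instead of sending a reply.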
*/ if (!(running = _get_running_uuid(req.uuid))) log_helper("no running cmd for cancel uuid"); else { log_helper("cancel running cmd uuid %s pid %d", running->uuid, running->pid); _clear_running_cmd(running); } } } if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) exit(0); /* collect child exits until no more children exist (ECHILD) or none are ready (WNOHANG) */ while (1) { memset(&info, 0, sizeof(info)); rv = waitid(P_ALL, 0, &info, WEXITED | WNOHANG); if ((rv < 0) && (errno == ECHILD)) { /* log_helper("helper no children exist fork_count %d done_count %d", fork_count, done_count); */ timeout = STANDARD_TIMEOUT_MS; } else if (!rv && !info.si_pid) { log_helper("helper no children ready fork_count %d done_count %d", fork_count, done_count); timeout = RECOVERY_TIMEOUT_MS; } else if (!rv && info.si_pid) { done_count++; if (!(running = _get_running_cmd(info.si_pid))) { log_helper("running cmd for pid %d result %d not found", info.si_pid, info.si_status); continue; } else { log_helper("running cmd for pid %d result %d done", info.si_pid, info.si_status); } if (info.si_status) { syslog(LOG_ERR, "%llu run error %s id %d pid %d status %d code %d", (unsigned long long)monotime(), running->uuid, running->cmd_id, running->pid, info.si_status, info.si_code); } send_result(running, out_fd, info.si_pid, info.si_status); _clear_running_cmd(running); continue; } else { log_helper("helper waitid rv %d errno %d fork_count %d done_count %d", rv, errno, fork_count, done_count); } break; } } closelog(); return 0; } dlm-4.3.0/dlm_controld/lib.c000066400000000000000000000442511461150666200157010ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include "dlm_controld.h" #include "libdlmcontrol.h" static int do_read(int fd, void *buf, size_t count) { int rv, off = 0; while (off < count) { rv = read(fd, (char *)buf + off, count - off); if (rv == 0) return -1; if (rv == -1 && errno == EINTR) continue; if (rv == -1) return -1; off += rv; } return 0; } static int do_write(int fd, void *buf, size_t count) { int rv, off = 0; retry: rv = write(fd, (char *)buf + off, count); if (rv == -1 && errno == EINTR) goto retry; if (rv < 0) { return rv; } if (rv != count) { count -= rv; off += rv; goto retry; } return 0; } static int do_connect(const char *sock_path) { struct sockaddr_un sun; socklen_t addrlen; int rv, fd; fd = socket(PF_UNIX, SOCK_STREAM, 0); if (fd < 0) goto out; memset(&sun, 0, sizeof(sun)); sun.sun_family = AF_UNIX; strcpy(&sun.sun_path[1], sock_path); addrlen = sizeof(sa_family_t) + strlen(sun.sun_path+1) + 1; rv = connect(fd, (struct sockaddr *) &sun, addrlen); if (rv < 0) { close(fd); fd = rv; } out: return fd; } static inline void init_header_name(struct dlmc_header *h, const char *name, size_t len) { #pragma GCC diagnostic push #if __GNUC__ >= 8 #pragma GCC diagnostic ignored "-Wstringop-truncation" #endif strncpy(h->name, name, len); #pragma GCC diagnostic pop } static void init_header(struct dlmc_header *h, int cmd, char *name, int extra_len) { memset(h, 0, sizeof(struct dlmc_header)); h->magic = DLMC_MAGIC; h->version = DLMC_VERSION; h->len = sizeof(struct dlmc_header) + extra_len; h->command = cmd; if (name) init_header_name(h, name, DLM_LOCKSPACE_LEN); } static char copy_buf[DLMC_DUMP_SIZE]; static int do_dump(int cmd, char *name, char *buf, int *data) { struct dlmc_header h; int fd, rv, len; memset(copy_buf, 0, DLMC_DUMP_SIZE); init_header(&h, cmd, name, 0); *data = 0; fd = do_connect(DLMC_QUERY_SOCK_PATH); if (fd < 0) { rv = fd; goto out; } rv = do_write(fd, &h, sizeof(h)); if (rv < 0) goto out_close; memset(&h, 0, sizeof(h)); rv = do_read(fd, &h, sizeof(h)); if (rv < 0) goto out_close; *data = h.data; len = h.len - sizeof(h); if (len <= 0 || len > DLMC_DUMP_SIZE) goto out_close; rv = do_read(fd, copy_buf, len); if (rv < 0) goto out_close; memcpy(buf, copy_buf, len); out_close: close(fd); out: return rv; } int dlmc_dump_debug(char *buf, int *data) { return do_dump(DLMC_CMD_DUMP_DEBUG, NULL, buf, data); } int dlmc_dump_config(char *buf, int *data) { return do_dump(DLMC_CMD_DUMP_CONFIG, NULL, buf, data); } int dlmc_dump_log_plock(char *buf, int *data) { return do_dump(DLMC_CMD_DUMP_LOG_PLOCK, NULL, buf, data); } int dlmc_dump_plocks(char *name, char *buf, int *data) { return do_dump(DLMC_CMD_DUMP_PLOCKS, name, buf, data); } int dlmc_dump_run(char *buf, int *data) { return do_dump(DLMC_CMD_DUMP_RUN, NULL, buf, data); } int dlmc_reload_config(void) { struct dlmc_header h; int fd, rv; init_header(&h, DLMC_CMD_RELOAD_CONFIG, NULL, 0); fd = do_connect(DLMC_SOCK_PATH); if (fd < 0) { rv = fd; goto out; } rv = do_write(fd, &h, sizeof(h)); close(fd); out: return rv; } int dlmc_set_config(char *command) { struct dlmc_header h; char *cmdbuf; int fd, rv; cmdbuf = malloc(DLMC_RUN_COMMAND_LEN); if (!cmdbuf) return -1; memset(cmdbuf, 0, DLMC_RUN_COMMAND_LEN); strncpy(cmdbuf, command, DLMC_RUN_COMMAND_LEN-1); init_header(&h, DLMC_CMD_SET_CONFIG, NULL, DLMC_RUN_COMMAND_LEN); fd = do_connect(DLMC_SOCK_PATH); if (fd < 0) { rv = fd; goto out; } rv = do_write(fd, &h, sizeof(h)); if (rv < 0) goto out_close; rv = do_write(fd, cmdbuf, 
DLMC_RUN_COMMAND_LEN); if (rv < 0) goto out_close; out_close: close(fd); out: free(cmdbuf); return rv; } static int nodeid_compare(const void *va, const void *vb) { const int *a = va; const int *b = vb; return *a - *b; } static void print_str(char *str, int len) { char *p; int i; p = &str[0]; for (i = 0; i < len-1; i++) { if (str[i] == ' ') { str[i] = '\0'; printf(" %s\n", p); p = &str[i+1]; } } if (p) printf(" %s\n", p); } static unsigned int kv(char *str, const char *k) { char valstr[64]; char *p; int i; p = strstr(str, k); if (!p) return 0; p = strstr(p, "="); if (!p) return 0; /* move pointer after '=' */ p++; memset(valstr, 0, 64); for (i = 0; i < 64; i++) { if (*p == ' ') break; if (*p == '\0') break; if (*p == '\n') break; valstr[i] = *p; p++; } return (unsigned int)strtoul(valstr, NULL, 0); } static char *ks(char *str, const char *k) { static char valstr[64]; char *p; int i; p = strstr(str, k); if (!p) return 0; p = strstr(p, "="); if (!p) return 0; /* move pointer after '=' */ p++; memset(valstr, 0, 64); for (i = 0; i < 64; i++) { if (*p == ' ') break; if (*p == '\0') break; if (*p == '\n') break; valstr[i] = *p; p++; } return valstr; } static void print_daemon(struct dlmc_state *st, char *str, char *bin, uint32_t flags) { unsigned int cluster_ringid, daemon_ringid; unsigned int fipu; if (flags & DLMC_STATUS_VERBOSE) { printf("our_nodeid %d\n", st->nodeid); print_str(str, st->str_len); return; } cluster_ringid = kv(str, "cluster_ringid"); daemon_ringid = kv(str, "daemon_ringid"); printf("cluster nodeid %d quorate %u ring seq %u %u\n", st->nodeid, kv(str, "quorate"), cluster_ringid, daemon_ringid); fipu = kv(str, "fence_in_progress_unknown"); printf("daemon now %u fence_pid %u %s\n", kv(str, "monotime"), kv(str, "fence_pid"), fipu ? "fence_init" : ""); } static void format_daemon_node(struct dlmc_state *st, char *str, char *bin, uint32_t flags, char *node_line, char *fence_line) { unsigned int delay_fencing, result_wait, killed; char letter; if (st->type == DLMC_STATE_STARTUP_NODE) letter = 'U'; else if (kv(str, "member")) letter = 'M'; else letter = 'X'; snprintf(node_line, DLMC_STATE_MAXSTR - 1, "node %d %c add %u rem %u fail %u fence %u at %u %u\n", st->nodeid, letter, kv(str, "add_time"), kv(str, "rem_time"), kv(str, "fail_monotime"), kv(str, "fence_monotime"), kv(str, "actor_done"), kv(str, "fence_walltime")); if (!kv(str, "need_fencing")) return; delay_fencing = kv(str, "delay_fencing"); result_wait = kv(str, "fence_result_wait"); killed = kv(str, "killed"); if (delay_fencing) snprintf(fence_line, DLMC_STATE_MAXSTR - 1, "fence %d %s delay actor %u fail %u fence %u now %u%s%s\n", st->nodeid, ks(str, "left_reason"), kv(str, "actor_last"), kv(str, "fail_walltime"), kv(str, "fence_walltime"), (unsigned int)time(NULL), result_wait ? " result_wait" : "", killed ? " killed" : ""); else snprintf(fence_line, DLMC_STATE_MAXSTR - 1, "fence %d %s pid %d actor %u fail %u fence %u now %u%s%s\n", st->nodeid, ks(str, "left_reason"), kv(str, "fence_pid"), kv(str, "actor_last"), kv(str, "fail_walltime"), kv(str, "fence_walltime"), (unsigned int)time(NULL), result_wait ? " result_wait" : "", killed ? 
" killed" : ""); } #define MAX_SORT 64 int dlmc_print_status(uint32_t flags) { struct dlmc_header h; struct dlmc_state state; struct dlmc_state *st; char maxstr[DLMC_STATE_MAXSTR]; char maxbin[DLMC_STATE_MAXBIN]; char *str; char *bin; int all_count, node_count, fence_count, startup_count; int all_ids[MAX_SORT]; int node_ids[MAX_SORT]; int fence_ids[MAX_SORT]; int startup_ids[MAX_SORT]; char *node_lines[MAX_SORT]; char *fence_lines[MAX_SORT]; char *node_line; char *fence_line; int found_node; int fd, rv; int i, j; init_header(&h, DLMC_CMD_DUMP_STATUS, NULL, 0); fd = do_connect(DLMC_QUERY_SOCK_PATH); if (fd < 0) { printf("cannot connect to dlm_controld\n"); rv = fd; goto out; } rv = do_write(fd, &h, sizeof(h)); if (rv < 0) { printf("cannot send to dlm_controld\n"); goto out_close; } st = &state; str = maxstr; bin = maxbin; all_count = 0; node_count = 0; fence_count = 0; startup_count = 0; memset(&all_ids, 0, sizeof(all_ids)); memset(&node_ids, 0, sizeof(node_ids)); memset(&fence_ids, 0, sizeof(fence_ids)); memset(&startup_ids, 0, sizeof(startup_ids)); memset(node_lines, 0, sizeof(node_lines)); memset(fence_lines, 0, sizeof(fence_lines)); while (1) { memset(&state, 0, sizeof(state)); memset(maxstr, 0, sizeof(maxstr)); memset(maxbin, 0, sizeof(maxbin)); rv = recv(fd, st, sizeof(struct dlmc_state), MSG_WAITALL); if (!rv) break; if (rv != sizeof(struct dlmc_state)) break; if (st->str_len) { rv = recv(fd, str, st->str_len, MSG_WAITALL); if (rv != st->str_len) break; } if (st->bin_len) { rv = recv(fd, bin, st->bin_len, MSG_WAITALL); if (rv != st->bin_len) break; } switch (st->type) { case DLMC_STATE_DAEMON: print_daemon(st, str, bin, flags); break; case DLMC_STATE_STARTUP_NODE: startup_ids[startup_count++] = st->nodeid; break; case DLMC_STATE_DAEMON_NODE: if (flags & DLMC_STATUS_VERBOSE) { printf("nodeid %d\n", st->nodeid); print_str(str, st->str_len); } else { node_line = malloc(DLMC_STATE_MAXSTR); if (!node_line) break; fence_line = malloc(DLMC_STATE_MAXSTR); if (!fence_line) { free(node_line); break; } memset(node_line, 0, DLMC_STATE_MAXSTR); memset(fence_line, 0, DLMC_STATE_MAXSTR); format_daemon_node(st, str, bin, flags, node_line, fence_line); all_ids[all_count++] = st->nodeid; node_ids[node_count] = st->nodeid; node_lines[node_count] = node_line; node_count++; if (!fence_line[0]) { free(fence_line); } else { fence_ids[fence_count] = st->nodeid; fence_lines[fence_count] = fence_line; fence_count++; } } break; default: break; } if (rv < 0) break; } if (all_count) qsort(all_ids, all_count, sizeof(int), nodeid_compare); /* don't free any node_lines in this startup loop because we are just borrowing them; they are needed in the real node loop below. 
*/ if (startup_count) { for (i = 0; i < startup_count; i++) { found_node = 0; for (j = 0; j < node_count; j++) { if (startup_ids[i] != node_ids[j]) continue; found_node = 1; if (!node_lines[j]) printf("startup node %d\n", st->nodeid); else printf("startup %s", node_lines[j]); break; } if (!found_node) printf("startup node %d\n", st->nodeid); } } if (all_count && fence_count) { for (i = 0; i < all_count; i++) { for (j = 0; j < fence_count; j++) { if (all_ids[i] != fence_ids[j]) continue; if (!fence_lines[j]) { printf("fence %d no data\n", fence_ids[j]); } else { printf("%s", fence_lines[j]); free(fence_lines[j]); fence_lines[j] = NULL; } break; } } } if (all_count && node_count) { for (i = 0; i < all_count; i++) { for (j = 0; j < node_count; j++) { if (all_ids[i] != node_ids[j]) continue; if (!node_lines[j]) { printf("node %d no data\n", node_ids[j]); } else { printf("%s", node_lines[j]); free(node_lines[j]); node_lines[j] = NULL; } break; } } } out_close: close(fd); out: return rv; } int dlmc_node_info(char *name, int nodeid, struct dlmc_node *node) { struct dlmc_header h, *rh; char reply[sizeof(struct dlmc_header) + sizeof(struct dlmc_node)]; int fd, rv; init_header(&h, DLMC_CMD_NODE_INFO, name, 0); h.data = nodeid; memset(reply, 0, sizeof(reply)); fd = do_connect(DLMC_QUERY_SOCK_PATH); if (fd < 0) { rv = fd; goto out; } rv = do_write(fd, &h, sizeof(h)); if (rv < 0) goto out_close; rv = do_read(fd, reply, sizeof(reply)); if (rv < 0) goto out_close; rh = (struct dlmc_header *)reply; rv = rh->data; if (rv < 0) goto out_close; memcpy(node, (char *)reply + sizeof(struct dlmc_header), sizeof(struct dlmc_node)); out_close: close(fd); out: return rv; } int dlmc_lockspace_info(char *name, struct dlmc_lockspace *lockspace) { struct dlmc_header h, *rh; char reply[sizeof(struct dlmc_header) + sizeof(struct dlmc_lockspace)]; int fd, rv; init_header(&h, DLMC_CMD_LOCKSPACE_INFO, name, 0); memset(reply, 0, sizeof(reply)); fd = do_connect(DLMC_QUERY_SOCK_PATH); if (fd < 0) { rv = fd; goto out; } rv = do_write(fd, &h, sizeof(h)); if (rv < 0) goto out_close; rv = do_read(fd, reply, sizeof(reply)); if (rv < 0) goto out_close; rh = (struct dlmc_header *)reply; rv = rh->data; if (rv < 0) goto out_close; memcpy(lockspace, (char *)reply + sizeof(struct dlmc_header), sizeof(struct dlmc_lockspace)); out_close: close(fd); out: return rv; } int dlmc_lockspaces(int *count, struct dlmc_lockspace **lss) { struct dlmc_header h; int reply_len; int fd, rv, result; fd = do_connect(DLMC_QUERY_SOCK_PATH); if (fd < 0) { rv = fd; goto out; } init_header(&h, DLMC_CMD_LOCKSPACES, NULL, 0); rv = do_write(fd, &h, sizeof(h)); if (rv < 0) goto out_close; rv = do_read(fd, &h, sizeof(h)); if (rv <0) goto out_close; result = h.data; if (result < 0) { rv = result; goto out_close; } *count = result; reply_len = h.len - sizeof(struct dlmc_header); *lss = malloc(reply_len); if (!*lss) { rv = -1; goto out; } memset(*lss, 0, reply_len); rv = do_read(fd, *lss, reply_len); if (rv < 0) { free(*lss); goto out; } rv = 0; out_close: close(fd); out: return rv; } int dlmc_lockspace_nodes(char *name, int type, int max, int *count, struct dlmc_node *nodes) { struct dlmc_header h, *rh; char *reply; int reply_len; int fd, rv, result, node_count; init_header(&h, DLMC_CMD_LOCKSPACE_NODES, name, 0); h.option = type; h.data = max; reply_len = sizeof(struct dlmc_header) + (max * sizeof(struct dlmc_node)); reply = malloc(reply_len); if (!reply) { rv = -1; goto out; } memset(reply, 0, reply_len); fd = do_connect(DLMC_QUERY_SOCK_PATH); if (fd < 0) { rv = fd; 
goto out; } rv = do_write(fd, &h, sizeof(h)); if (rv < 0) goto out_close; /* won't usually get back the full reply_len */ do_read(fd, reply, reply_len); rh = (struct dlmc_header *)reply; result = rh->data; if (result < 0 && result != -E2BIG) { rv = result; goto out_close; } if (result == -E2BIG) { *count = -E2BIG; node_count = max; } else { *count = result; node_count = result; } rv = 0; memcpy(nodes, (char *)reply + sizeof(struct dlmc_header), node_count * sizeof(struct dlmc_node)); out_close: close(fd); out: return rv; } int dlmc_fs_connect(void) { return do_connect(DLMC_SOCK_PATH); } void dlmc_fs_disconnect(int fd) { close(fd); } int dlmc_fs_register(int fd, char *name) { struct dlmc_header h; init_header(&h, DLMC_CMD_FS_REGISTER, name, 0); return do_write(fd, &h, sizeof(h)); } int dlmc_fs_unregister(int fd, char *name) { struct dlmc_header h; init_header(&h, DLMC_CMD_FS_UNREGISTER, name, 0); return do_write(fd, &h, sizeof(h)); } int dlmc_fs_notified(int fd, char *name, int nodeid) { struct dlmc_header h; init_header(&h, DLMC_CMD_FS_NOTIFIED, name, 0); h.data = nodeid; return do_write(fd, &h, sizeof(h)); } int dlmc_fs_result(int fd, char *name, int *type, int *nodeid, int *result) { struct dlmc_header h; int rv; rv = do_read(fd, &h, sizeof(h)); if (rv < 0) goto out; strncpy(name, h.name, DLM_LOCKSPACE_LEN); *nodeid = h.option; *result = h.data; switch (h.command) { case DLMC_CMD_FS_REGISTER: *type = DLMC_RESULT_REGISTER; break; case DLMC_CMD_FS_NOTIFIED: *type = DLMC_RESULT_NOTIFIED; break; default: *type = 0; } out: return rv; } int dlmc_deadlock_check(char *name) { struct dlmc_header h; int fd, rv; init_header(&h, DLMC_CMD_DEADLOCK_CHECK, name, 0); fd = do_connect(DLMC_SOCK_PATH); if (fd < 0) { rv = fd; goto out; } rv = do_write(fd, &h, sizeof(h)); close(fd); out: return rv; } int dlmc_fence_ack(char *name) { struct dlmc_header h; int fd, rv; init_header(&h, DLMC_CMD_FENCE_ACK, name, 0); fd = do_connect(DLMC_SOCK_PATH); if (fd < 0) { rv = fd; goto out; } rv = do_write(fd, &h, sizeof(h)); close(fd); out: return rv; } int dlmc_run_start(char *run_command, int len, int nodeid, uint32_t flags, char *run_uuid) { struct dlmc_header h; struct dlmc_header rh; char *cmdbuf; int fd, rv; cmdbuf = malloc(DLMC_RUN_COMMAND_LEN); if (!cmdbuf) return -1; memset(cmdbuf, 0, DLMC_RUN_COMMAND_LEN); strncpy(cmdbuf, run_command, DLMC_RUN_COMMAND_LEN-1); init_header(&h, DLMC_CMD_RUN_START, NULL, DLMC_RUN_COMMAND_LEN); h.data = nodeid; h.flags = flags; memset(&rh, 0, sizeof(rh)); fd = do_connect(DLMC_SOCK_PATH); if (fd < 0) { rv = fd; goto out; } rv = do_write(fd, &h, sizeof(h)); if (rv < 0) goto out_close; rv = do_write(fd, cmdbuf, DLMC_RUN_COMMAND_LEN); if (rv < 0) goto out_close; rv = do_read(fd, &rh, sizeof(rh)); if (rv < 0) goto out_close; rv = rh.data; if (rv < 0) goto out_close; memcpy(run_uuid, rh.name, DLMC_RUN_UUID_LEN); rv = 0; out_close: close(fd); out: free(cmdbuf); return rv; } int dlmc_run_check(char *run_uuid, int len, int wait_sec, uint32_t flags, uint32_t *check_status) { struct dlmc_header h; struct dlmc_header rh; struct dlmc_run_check_state state; uint64_t wait_start = 0; int fd, rv; init_header(&h, DLMC_CMD_RUN_CHECK, NULL, 0); h.flags = flags; init_header_name(&h, run_uuid, DLMC_RUN_UUID_LEN); memset(&rh, 0, sizeof(rh)); fd = do_connect(DLMC_SOCK_PATH); if (fd < 0) { rv = fd; goto out; } retry: rv = do_write(fd, &h, sizeof(h)); if (rv < 0) { goto out_close; } rv = do_read(fd, &rh, sizeof(rh)); if (rv < 0) { goto out_close; } rv = do_read(fd, &state, sizeof(state)); if (rv < 0) { goto 
out_close; } if ((state.check_status & DLMC_RUN_STATUS_WAITING) && wait_sec) { if (!wait_start) { wait_start = time(NULL); sleep(1); goto retry; } if (time(NULL) - wait_start < wait_sec) { sleep(1); goto retry; } } *check_status = state.check_status; rv = 0; out_close: close(fd); out: return rv; } dlm-4.3.0/dlm_controld/libdlmcontrol.h000066400000000000000000000130311461150666200177740ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef _LIBDLMCONTROL_H_ #define _LIBDLMCONTROL_H_ #define DLMC_DUMP_SIZE (1024 * 1024) #define DLMC_NF_MEMBER 0x00000001 /* node is member in cg */ #define DLMC_NF_START 0x00000002 /* start message recvd for cg */ #define DLMC_NF_DISALLOWED 0x00000004 /* node disallowed in cg */ #define DLMC_NF_NEED_FENCING 0x00000008 #define DLMC_NF_CHECK_FS 0x00000010 struct dlmc_node { int nodeid; uint32_t flags; uint32_t added_seq; uint32_t removed_seq; int fail_reason; uint64_t fail_walltime; uint64_t fail_monotime; }; #define DLMC_LS_WAIT_RINGID 1 #define DLMC_LS_WAIT_QUORUM 2 #define DLMC_LS_WAIT_FENCING 3 #define DLMC_LS_WAIT_FSDONE 4 struct dlmc_change { int member_count; int joined_count; int remove_count; int failed_count; int wait_condition; /* DLMC_LS_WAIT or needed message count */ int wait_messages; /* 0 no, 1 yes */ uint32_t seq; uint32_t combined_seq; }; #define DLMC_LF_JOINING 0x00000001 #define DLMC_LF_LEAVING 0x00000002 #define DLMC_LF_KERNEL_STOPPED 0x00000004 #define DLMC_LF_FS_REGISTERED 0x00000008 #define DLMC_LF_NEED_PLOCKS 0x00000010 #define DLMC_LF_SAVE_PLOCKS 0x00000020 struct dlmc_lockspace { struct dlmc_change cg_prev; /* completed change (started_change) */ struct dlmc_change cg_next; /* in-progress change (changes list) */ uint32_t flags; uint32_t global_id; char name[DLM_LOCKSPACE_LEN+1]; }; /* dlmc_lockspace_nodes() types MEMBERS: members in completed (prev) change, zero if there's no completed (prev) change NEXT: members in in-progress (next) change, zero if there's no in-progress (next) change ALL: NEXT + nonmembers if there's an in-progress (next) change, MEMBERS + nonmembers if there's no in-progress (next) change, but there is a completed (prev) change nonmembers if there's no in-progress (next) or completed (prev) change (possible?) dlmc_node_info() returns info for in-progress (next) change, if one exists, otherwise it returns info for completed (prev) change. 
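
   Illustrative example (the lockspace name and sizes are made up):

     struct dlmc_node nodes[16];
     int count = 0;
     int rv;

     rv = dlmc_lockspace_nodes("example_ls", DLMC_NODES_MEMBERS, 16,
                               &count, nodes);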
*/ #define DLMC_NODES_ALL 1 #define DLMC_NODES_MEMBERS 2 #define DLMC_NODES_NEXT 3 #define DLMC_STATUS_VERBOSE 0x00000001 int dlmc_dump_debug(char *buf, int *data); int dlmc_dump_config(char *buf, int *data); int dlmc_dump_run(char *buf, int *data); int dlmc_dump_log_plock(char *buf, int *data); int dlmc_dump_plocks(char *name, char *buf, int *data); int dlmc_lockspace_info(char *lsname, struct dlmc_lockspace *ls); int dlmc_node_info(char *lsname, int nodeid, struct dlmc_node *node); /* caller need to free *lss */ int dlmc_lockspaces(int *count, struct dlmc_lockspace **lss); int dlmc_lockspace_nodes(char *lsname, int type, int max, int *count, struct dlmc_node *nodes); int dlmc_print_status(uint32_t flags); int dlmc_reload_config(void); int dlmc_set_config(char *command); #define DLMC_RESULT_REGISTER 1 #define DLMC_RESULT_NOTIFIED 2 int dlmc_fs_connect(void); void dlmc_fs_disconnect(int fd); int dlmc_fs_register(int fd, char *name); int dlmc_fs_unregister(int fd, char *name); int dlmc_fs_notified(int fd, char *name, int nodeid); int dlmc_fs_result(int fd, char *name, int *type, int *nodeid, int *result); int dlmc_deadlock_check(char *name); int dlmc_fence_ack(char *name); #define DLMC_RUN_UUID_LEN DLM_LOCKSPACE_LEN #define DLMC_RUN_COMMAND_LEN 1024 /* * Run a command on all nodes running dlm_controld. * * The node where dlmc_run_start() is called will * send a corosync message to all nodes running * dlm_controld, telling them to run the specified * command. * * On all the nodes, a dlm_controld helper process will * fork/exec the specified command, and will send a * corosync message with the result of the command. * * (A flag specifies whether the starting node itself * runs the command. A nodeid arg can specify one node * to run the command.) * * The starting node will collect the results from * the replies. * * The node where dlmc_run_start() was called can * run dlmc_run_check() to check the cummulative * result of the command from all the nodes. */ /* * dlmc_run_start() flags * * NODE_NONE: do not run the command on the starting node * (where dlmc_run_start is called) * NODE_FIRST: run the command on the starting node right * when dlmc_run_start is called. * NODE_RECV: run the command on the starting node when * it receives its own run request message. */ #define DLMC_FLAG_RUN_START_NODE_NONE 0x00000001 #define DLMC_FLAG_RUN_START_NODE_FIRST 0x00000002 #define DLMC_FLAG_RUN_START_NODE_RECV 0x00000004 /* * dlmc_run_check() flags * * CLEAR: clear/free the run record when check sees it is done. * * CANCEL: clear/free a local run record even if it's not done. 
*/ #define DLMC_FLAG_RUN_CHECK_CLEAR 0x00000001 #define DLMC_FLAG_RUN_CHECK_CANCEL 0x00000002 /* * dlmc_run_check() result/status flags * * WAITING: have not received all expected replies * DONE: have received all expected replies * FAILED: have seen one or more replies with failed result */ #define DLMC_RUN_STATUS_WAITING 0x00000001 #define DLMC_RUN_STATUS_DONE 0x00000002 #define DLMC_RUN_STATUS_FAILED 0x00000004 int dlmc_run_start(char *run_command, int len, int nodeid, uint32_t flags, char *run_uuid); int dlmc_run_check(char *run_uuid, int len, int wait_sec, uint32_t flags, uint32_t *check_status); #endif dlm-4.3.0/dlm_controld/libdlmcontrol.pc.in000066400000000000000000000003221461150666200205530ustar00rootroot00000000000000prefix=@PREFIX@ exec_prefix=${prefix} includedir=${prefix}/include libdir=@LIBDIR@ Name: libdlmcontrol Description: The dlmcontrol library Version: 4.0.0 Cflags: -I${includedir} Libs: -L${libdir} -ldlmcontrol dlm-4.3.0/dlm_controld/linux_endian.h000066400000000000000000000033031461150666200176060ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __LINUX_ENDIAN_DOT_H__ #define __LINUX_ENDIAN_DOT_H__ #include #include /* I'm not sure which versions of alpha glibc/gcc are broken, so fix all of them. */ #ifdef __alpha__ #undef bswap_64 static __inline__ unsigned long bswap_64(unsigned long x) { unsigned int h = x >> 32; unsigned int l = x; h = bswap_32(h); l = bswap_32(l); return ((unsigned long)l << 32) | h; } #endif /* __alpha__ */ #if __BYTE_ORDER == __BIG_ENDIAN #define be16_to_cpu(x) (x) #define be32_to_cpu(x) (x) #define be64_to_cpu(x) (x) #define cpu_to_be16(x) (x) #define cpu_to_be32(x) (x) #define cpu_to_be64(x) (x) #define le16_to_cpu(x) (bswap_16((x))) #define le32_to_cpu(x) (bswap_32((x))) #define le64_to_cpu(x) (bswap_64((x))) #define cpu_to_le16(x) (bswap_16((x))) #define cpu_to_le32(x) (bswap_32((x))) #define cpu_to_le64(x) (bswap_64((x))) #endif /* __BYTE_ORDER == __BIG_ENDIAN */ #if __BYTE_ORDER == __LITTLE_ENDIAN #define be16_to_cpu(x) (bswap_16((x))) #define be32_to_cpu(x) (bswap_32((x))) #define be64_to_cpu(x) (bswap_64((x))) #define cpu_to_be16(x) (bswap_16((x))) #define cpu_to_be32(x) (bswap_32((x))) #define cpu_to_be64(x) (bswap_64((x))) #define le16_to_cpu(x) (x) #define le32_to_cpu(x) (x) #define le64_to_cpu(x) (x) #define cpu_to_le16(x) (x) #define cpu_to_le32(x) (x) #define cpu_to_le64(x) (x) #endif /* __BYTE_ORDER == __LITTLE_ENDIAN */ #endif /* __LINUX_ENDIAN_DOT_H__ */ dlm-4.3.0/dlm_controld/linux_helpers.h000066400000000000000000000035431461150666200200200ustar00rootroot00000000000000/* Copied from linux kernel */ #ifndef __DLM_LINUX_HELPERS__ #define __DLM_LINUX_HELPERS__ /* * static_assert - check integer constant expression at build time * * static_assert() is a wrapper for the C11 _Static_assert, with a * little macro magic to make the message optional (defaulting to the * stringification of the tested expression). * * Contrary to BUILD_BUG_ON(), static_assert() can be used at global * scope, but requires the expression to be an integer constant * expression (i.e., it is not enough that __builtin_constant_p() is * true for expr). * * Also note that BUILD_BUG_ON() fails the build if the condition is * true, while static_assert() fails the build if the expression is * false. 
*/ #define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr) #define __static_assert(expr, msg, ...) _Static_assert(expr, msg) #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) #define POISON_POINTER_DELTA 0xdeadbeef #define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) #define LIST_POISON2 ((void *) 0x122 + POISON_POINTER_DELTA) /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. * @type: the type of the container struct this is embedded in. * @member: the name of the member within the struct. * * WARNING: any const qualifier of @ptr is lost. */ #define container_of(ptr, type, member) ({ \ void *__mptr = (void *)(ptr); \ static_assert(__same_type(*(ptr), ((type *)0)->member) || \ __same_type(*(ptr), void), \ "pointer type mismatch in container_of()"); \ ((type *)(__mptr - offsetof(type, member))); }) #define READ_ONCE(x) (*(const volatile typeof(x) *)&(x)) #define WRITE_ONCE(x, val) \ do { \ *(volatile typeof(x) *)&(x) = (val); \ } while (0) #endif /* __DLM_LINUX_HELPERS__ */ dlm-4.3.0/dlm_controld/list.h000066400000000000000000000561301461150666200161120ustar00rootroot00000000000000/* Copied from include/linux/list.h */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H #include #include "linux_helpers.h" struct list_head { struct list_head *next, *prev; }; /* * Circular doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) /** * INIT_LIST_HEAD - Initialize a list_head structure * @list: list_head structure to be initialized. * * Initializes the list_head to point to itself. If it is a list header, * the result is an empty list. */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } #ifdef CONFIG_DEBUG_LIST extern bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next); extern bool __list_del_entry_valid(struct list_head *entry); #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. 
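 *
 * Example (illustrative struct and variable names): enqueue at the
 * tail, then take the oldest entry from the head:
 *
 *   struct item { int v; struct list_head list; };
 *   LIST_HEAD(queue);
 *
 *   list_add_tail(&it->list, &queue);
 *   it = list_first_entry(&queue, struct item, list);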
*/ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. * * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). */ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. 
* All three entries must belong to the same linked list. */ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_is_head - tests whether @list is the list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. 
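 *
 * For example, given head -> a -> b -> c, list_cut_position(list,
 * head, b) leaves head -> c and moves a and b onto list.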
* */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. 
* @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_next_entry_circular - get the next element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the last element (return the first element). * Note, that list is expected to be not empty. */ #define list_next_entry_circular(pos, head, member) \ (list_is_last(&(pos)->member, head) ? \ list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_prev_entry_circular - get the prev element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the first element (return the last element). * Note, that list is expected to be not empty. */ #define list_prev_entry_circular(pos, head, member) \ (list_is_first(&(pos)->member, head) ? \ list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; \ !list_is_head(pos, (head)); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. 
* @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** * list_count_nodes - count nodes in the list * @head: the head for your list. */ static inline size_t list_count_nodes(struct list_head *head) { struct list_head *pos; size_t count = 0; list_for_each(pos, head) count++; return count; } /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ (&pos->member == (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. 
* @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) #endif dlm-4.3.0/dlm_controld/logging.c000066400000000000000000000107131461150666200165550ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include "dlm_daemon.h" static int syslog_facility; static int syslog_priority; static int logfile_priority; static char logfile[PATH_MAX]; static FILE *logfile_fp; /* logfile_priority is the only one of these options that can be controlled from command line, environment variable and dynamic setting. */ void set_logfile_priority(void) { if (opt(debug_logfile_ind)) logfile_priority = LOG_DEBUG; else logfile_priority = DEFAULT_LOGFILE_PRIORITY; } void init_logging(void) { mode_t old_umask; int rv; syslog_facility = DEFAULT_SYSLOG_FACILITY; syslog_priority = DEFAULT_SYSLOG_PRIORITY; logfile_priority = DEFAULT_LOGFILE_PRIORITY; strcpy(logfile, DEFAULT_LOGFILE); set_logfile_priority(); old_umask = umask(0077); rv = mkdir(SYS_VARDIR, 0700); if (rv < 0 && errno != EEXIST) { umask(old_umask); goto skip_logfile; } rv = mkdir(SYS_LOGDIR, 0700); if (rv < 0 && errno != EEXIST) { umask(old_umask); goto skip_logfile; } rv = mkdir(LOGDIR, 0700); if (rv < 0 && errno != EEXIST) { umask(old_umask); goto skip_logfile; } umask(old_umask); if (logfile[0]) { logfile_fp = fopen(logfile, "a+"); if (logfile_fp != NULL) { int fd = fileno(logfile_fp); fcntl(fd, F_SETFD, fcntl(fd, F_GETFD, 0) | FD_CLOEXEC); } } skip_logfile: openlog(DAEMON_NAME, LOG_CONS | LOG_PID, syslog_facility); } void close_logging(void) { closelog(); if (logfile_fp) fclose(logfile_fp); } #define NAME_ID_SIZE 32 #define LOG_STR_LEN 512 static char log_str[LOG_STR_LEN]; static char log_dump[LOG_DUMP_SIZE]; static unsigned int log_point; static unsigned int log_wrap; static char log_dump_plock[LOG_DUMP_SIZE]; static unsigned int log_point_plock; static unsigned int log_wrap_plock; static void log_copy(char *buf, int *len, char *log_buf, unsigned int *point, unsigned int *wrap) { unsigned int p = *point; unsigned int w = *wrap; int tail_len; if (!w && !p) { *len = 0; } else if (*wrap) { tail_len = LOG_DUMP_SIZE - p; memcpy(buf, log_buf + p, tail_len); if (p) memcpy(buf+tail_len, log_buf, p); *len = LOG_DUMP_SIZE; } else { memcpy(buf, log_buf, p-1); *len = p-1; } } void copy_log_dump(char *buf, int *len) { log_copy(buf, len, log_dump, &log_point, &log_wrap); } void copy_log_dump_plock(char *buf, int *len) { log_copy(buf, len, log_dump_plock, &log_point_plock, &log_wrap_plock); } static void log_save_str(int len, char *log_buf, unsigned int *point, unsigned int *wrap) { unsigned int p = *point; unsigned int w = *wrap; int i; if (len < LOG_DUMP_SIZE - p) { memcpy(log_buf + p, log_str, len); p += len; if (p == LOG_DUMP_SIZE) { p = 0; w = 1; } goto out; } for (i = 0; i < len; i++) { log_buf[p++] = log_str[i]; if (p == LOG_DUMP_SIZE) { p = 0; w = 1; } } out: *point = p; *wrap = w; } void log_level(const char *name_in, uint32_t level_in, const char *fmt, ...) 
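/*
 * log_level() fans one formatted message out to up to four sinks, each
 * with its own priority gate: the in-memory debug ring (log_dump), the
 * plock ring (log_dump_plock), syslog, and the logfile.
 *
 * Worked example of the ring logic in log_save_str()/log_copy() above,
 * using a hypothetical LOG_DUMP_SIZE of 8 (the real buffer is much
 * larger; this only illustrates the wrap):
 *
 *	save "abc":    buffer "abc....."  point=3 wrap=0
 *	save "defghi": 'd'..'h' fill offsets 3..7, point wraps to 0 and
 *	               wrap=1, then 'i' overwrites offset 0:
 *	               buffer "ibcdefgh"  point=1 wrap=1
 *	copy:          tail first (offsets 1..7), then head (offset 0):
 *	               "bcdefghi" - the oldest byte 'a' has been lost
 */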
{ va_list ap; char name[NAME_ID_SIZE + 2]; uint32_t level = level_in & 0x0000FFFF; uint32_t extra = level_in & 0xFFFF0000; int ret, pos = 0; int len = LOG_STR_LEN - 2; int namelen = 0; int plock = extra & LOG_PLOCK; memset(name, 0, sizeof(name)); if (name_in) { namelen = snprintf(name, NAME_ID_SIZE + 1, "%s", name_in); if (namelen > NAME_ID_SIZE) namelen = NAME_ID_SIZE; name[namelen] = ' '; name[namelen+1] = '\0'; } ret = snprintf(log_str + pos, len - pos, "%llu %s", (unsigned long long)monotime(), name); pos += ret; va_start(ap, fmt); ret = vsnprintf(log_str + pos, len - pos, fmt, ap); va_end(ap); if (ret >= len - pos) pos = len - 1; else pos += ret; log_str[pos++] = '\n'; log_str[pos++] = '\0'; if (level < LOG_NONE) log_save_str(pos - 1, log_dump, &log_point, &log_wrap); if (plock) log_save_str(pos - 1, log_dump_plock, &log_point_plock, &log_wrap_plock); if (level <= syslog_priority) syslog(level, "%s", log_str); if (level <= logfile_priority && logfile_fp) { time_t logtime = time(NULL); char tbuf[64]; strftime(tbuf, sizeof(tbuf), "%b %d %T", localtime(&logtime)); fprintf(logfile_fp, "%s %s", tbuf, log_str); fflush(logfile_fp); } if (!dlm_options[daemon_debug_ind].use_int) return; if ((level < LOG_NONE) || (plock && opt(plock_debug_ind))) fprintf(stderr, "%s", log_str); } dlm-4.3.0/dlm_controld/main.c000066400000000000000000001362651461150666200160660ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #define EXTERN #include "dlm_daemon.h" #include #include #include #include #include #ifdef USE_SD_NOTIFY #include #endif #include "copyright.cf" #include "version.cf" #define CLIENT_NALLOC 32 static int client_maxi; static int client_size = 0; static struct client *client = NULL; static struct pollfd *pollfd = NULL; static pthread_t query_thread; static pthread_mutex_t query_mutex; static struct list_head fs_register_list; static int kernel_monitor_fd; int helper_ci; int helper_pid; int helper_req_fd; int helper_status_fd; uint64_t helper_last_status; uint32_t helper_full_count; struct client { int fd; void *workfn; void *deadfn; struct lockspace *ls; }; enum { Env_ACTION = 0, Env_DEVPATH, Env_SUBSYSTEM, Env_LOCKSPACE, Env_Last, /* Flag for end of vars */ }; static const char *uevent_vars[] = { [Env_ACTION] = "ACTION=", [Env_DEVPATH] = "DEVPATH=", [Env_SUBSYSTEM] = "SUBSYSTEM=", [Env_LOCKSPACE] = "LOCKSPACE=", }; static void decode_uevent(const char *buf, unsigned len, const char *vars[], unsigned nvars, const char *vals[]) { const char *ptr; unsigned int i; int slen, vlen; memset(vals, 0, sizeof(const char *) * nvars); while (len > 0) { ptr = buf; slen = strlen(ptr); buf += slen; len -= slen; buf++; len--; for (i = 0; i < nvars; i++) { vlen = strlen(vars[i]); if (vlen > slen) continue; if (memcmp(vars[i], ptr, vlen) != 0) continue; vals[i] = ptr + vlen; break; } } } int do_read(int fd, void *buf, size_t count) { int rv, off = 0; while (off < count) { rv = read(fd, (char *)buf + off, count - off); if (rv == 0) return -1; if (rv == -1 && errno == EINTR) continue; if (rv == -1) return -1; off += rv; } return 0; } int do_write(int fd, void *buf, size_t count) { int rv, off = 0; retry: rv = write(fd, (char *)buf + off, count); if (rv == -1 && errno == EINTR) goto retry; if (rv < 0) { log_error("write errno %d", errno); return rv; } if (rv != count) { count 
-= rv; off += rv; goto retry; } return 0; } uint64_t monotime(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec; } static void client_alloc(void) { int i; if (!client) { client = malloc(CLIENT_NALLOC * sizeof(struct client)); pollfd = malloc(CLIENT_NALLOC * sizeof(struct pollfd)); } else { client = realloc(client, (client_size + CLIENT_NALLOC) * sizeof(struct client)); pollfd = realloc(pollfd, (client_size + CLIENT_NALLOC) * sizeof(struct pollfd)); if (!pollfd) log_error("can't alloc for pollfd"); } if (!client || !pollfd) log_error("can't alloc for client array"); for (i = client_size; i < client_size + CLIENT_NALLOC; i++) { client[i].workfn = NULL; client[i].deadfn = NULL; client[i].fd = -1; pollfd[i].fd = -1; pollfd[i].revents = 0; } client_size += CLIENT_NALLOC; } void client_dead(int ci) { close(client[ci].fd); client[ci].workfn = NULL; client[ci].fd = -1; pollfd[ci].fd = -1; } int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci)) { int i; if (!client) client_alloc(); again: for (i = 0; i < client_size; i++) { if (client[i].fd == -1) { client[i].workfn = workfn; if (deadfn) client[i].deadfn = deadfn; else client[i].deadfn = client_dead; client[i].fd = fd; pollfd[i].fd = fd; pollfd[i].events = POLLIN; if (i > client_maxi) client_maxi = i; return i; } } client_alloc(); goto again; } int client_fd(int ci) { return client[ci].fd; } void client_ignore(int ci, int fd) { pollfd[ci].fd = -1; pollfd[ci].events = 0; } void client_back(int ci, int fd) { pollfd[ci].fd = fd; pollfd[ci].events = POLLIN; } static void sigterm_handler(int sig) { daemon_quit = 1; } static void sigchld_handler(int sig) { } struct run *find_run(char *uuid_str) { struct run *run; list_for_each_entry(run, &run_ops, list) { if (!strcmp(run->uuid, uuid_str)) return run; } return NULL; } static void close_helper(void) { close(helper_req_fd); close(helper_status_fd); helper_req_fd = -1; helper_status_fd = -1; pollfd[helper_ci].fd = -1; pollfd[helper_ci].events = 0; helper_ci = -1; /* don't set helper_pid = -1 until we've tried waitpid */ } /* * We cannot block the main thread on this write, so the pipe is NONBLOCK, and * write fails with EAGAIN when the pipe is full. With around 1.5K request * size and 64k default pipe size, the pipe will be full if we quickly send * around 40 requests to the helper. We retry the message once a second, so * we'll retry the write again in a second. * * By setting the pipe size to 1MB in setup_helper, we could quickly send many * more requests before getting EAGAIN. 
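 *
 * A sketch of that resize (F_SETPIPE_SZ is Linux-specific, available
 * since kernel 2.6.35; the call is left commented out in setup_helper
 * below):
 *
 *	if (fcntl(pfd[1], F_SETPIPE_SZ, 1024 * 1024) < 0)
 *		log_error("helper pipe resize errno %d", errno);
 *
 * With a 1MB pipe and ~1.5K requests, several hundred requests would fit
 * before write() starts returning EAGAIN, instead of around 40.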
*/ void send_helper_run_request(struct run_request *req) { int rv; if (helper_req_fd == -1) { log_error("send_helper_run_request no fd"); return; } retry: rv = write(helper_req_fd, req, sizeof(struct run_request)); if (rv == -1 && errno == EINTR) goto retry; /* pipe is full, we'll try again in a second */ if (rv == -1 && errno == EAGAIN) { helper_full_count++; log_debug("send_helper_run_request full_count %u", helper_full_count); return; } /* helper exited or closed fd, quit using helper */ if (rv == -1 && errno == EPIPE) { log_error("send_helper_run_request EPIPE"); close_helper(); return; } if (rv != sizeof(struct run_request)) { /* this shouldn't happen */ log_error("send_helper_run_request %d %d", rv, errno); close_helper(); return; } } static void send_helper_run_cancel(struct run *run) { struct run_request req; int rv; if (helper_req_fd == -1) { log_error("send_helper_run_cancel no fd"); return; } memset(&req, 0, sizeof(req)); memcpy(req.uuid, run->uuid, RUN_UUID_LEN); rv = write(helper_req_fd, &req, sizeof(struct run_request)); if (rv < 0) log_error("send_helper_run_cancel write error"); } /* * first pipe for daemon to send requests to helper; they are not acknowledged * and the daemon does not get any result back for the requests. * * second pipe for helper to send general status/heartbeat back to the daemon * every so often to confirm it's not dead/hung. If the helper gets stuck or * killed, the daemon will not get the status and won't bother sending requests * to the helper, and use SIGTERM instead */ static int setup_helper(void) { int pid; int pw_fd = -1; /* parent write */ int cr_fd = -1; /* child read */ int pr_fd = -1; /* parent read */ int cw_fd = -1; /* child write */ int pfd[2]; /* we can't allow the main daemon thread to block */ if (pipe2(pfd, O_NONBLOCK | O_CLOEXEC)) return -errno; /* uncomment for rhel7 where this should be available */ /* fcntl(pfd[1], F_SETPIPE_SZ, 1024*1024); */ cr_fd = pfd[0]; pw_fd = pfd[1]; if (pipe2(pfd, O_NONBLOCK | O_CLOEXEC)) { close(cr_fd); close(pw_fd); return -errno; } pr_fd = pfd[0]; cw_fd = pfd[1]; pid = fork(); if (pid < 0) { close(cr_fd); close(pw_fd); close(pr_fd); close(cw_fd); return -errno; } if (pid) { close(cr_fd); close(cw_fd); helper_req_fd = pw_fd; helper_status_fd = pr_fd; helper_pid = pid; return 0; } else { close(pr_fd); close(pw_fd); run_helper(cr_fd, cw_fd, opt(daemon_debug_ind)); exit(0); } } static void process_helper(int ci) { struct run_reply reply; struct run_reply send_reply; struct run *run; int rv; rv = read(client[ci].fd, &reply, sizeof(reply)); if (!rv || rv == -EAGAIN) return; if (rv < 0) { log_error("process_helper rv %d errno %d", rv, errno); goto fail; } if (rv != sizeof(reply)) { log_error("process_helper recv size %d", rv); goto fail; } if (!reply.header.type) { /* log_debug("helper status"); */ helper_last_status = monotime(); return; } if (reply.header.type == DLM_MSG_RUN_REPLY) { run = find_run(reply.uuid); if (!run) { log_error("helper reply no run uuid %s", reply.uuid); return; } memset(&send_reply, 0, sizeof(send_reply)); memcpy(&send_reply.info, &run->info, sizeof(struct run_info)); memcpy(send_reply.uuid, run->uuid, RUN_UUID_LEN); send_reply.header.type = DLM_MSG_RUN_REPLY; send_reply.info.local_pid = reply.info.local_pid; send_reply.info.local_result = reply.info.local_result; log_debug("helper reply %s pid %d result %d", send_reply.uuid, send_reply.info.local_pid, send_reply.info.local_result); send_run_reply(run, &send_reply); return; } return; fail: close_helper(); } static void 
helper_dead(int ci) { int pid = helper_pid; int rv, status; close_helper(); helper_pid = -1; rv = waitpid(pid, &status, WNOHANG); if (rv != pid) { /* should not happen */ log_error("helper pid %d dead wait %d", pid, rv); return; } if (WIFEXITED(status)) { log_error("helper pid %d exit status %d", pid, WEXITSTATUS(status)); return; } if (WIFSIGNALED(status)) { log_error("helper pid %d term signal %d", pid, WTERMSIG(status)); return; } /* should not happen */ log_error("helper pid %d state change", pid); } static int start_run_operation(char *data, int datalen, int dest_nodeid, uint32_t flags, char *uuid_out) { struct run *run; struct run_request req; uuid_t uu; int rv; if (!opt(enable_helper_ind)) { log_debug("ignore start_run helper not enabled"); return -1; } if (datalen > RUN_COMMAND_LEN) return -1; if (!(run = malloc(sizeof(struct run)))) return -1; memset(run, 0, sizeof(struct run)); uuid_generate(uu); uuid_unparse_lower(uu, run->uuid); strncpy(run->command, data, datalen); run->info.start_nodeid = our_nodeid; run->info.dest_nodeid = dest_nodeid; run->info.flags = flags; memset(&req, 0, sizeof(req)); req.header.type = DLM_MSG_RUN_REQUEST; memcpy(&req.info, &run->info, sizeof(struct run_info)); memcpy(req.uuid, run->uuid, RUN_UUID_LEN); strncpy(req.command, data, datalen); log_error("run start %s %.128s", run->uuid, run->command); rv = send_run_request(run, &req); memcpy(uuid_out, run->uuid, RUN_UUID_LEN); list_add(&run->list, &run_ops); /* * This flag means the starting node should run the command itself * at the time of the request and not wait to receive its own request. */ if (flags & DLMC_FLAG_RUN_START_NODE_FIRST) send_helper_run_request(&req); return rv; } void clear_run(struct run *run) { log_debug("clear run %s", run->uuid); list_del(&run->list); free(run); } static int check_run_operation(char *uuid_str, uint32_t flags, struct dlmc_run_check_state *state) { char nodes_buf[128]; struct run *run; int pos, len, ret, i; if (!opt(enable_helper_ind)) { log_debug("ignore check_run helper not enabled"); return -1; } run = find_run(uuid_str); if (!run) { log_debug("check_run no uuid %s", uuid_str); return -1; } if (flags & DLMC_FLAG_RUN_CHECK_CANCEL) { log_debug("cancel_run %s", run->uuid); send_helper_run_cancel(run); clear_run(run); return 0; } log_debug("check_run %s reply_count %d need_replies %d fail_count %d", uuid_str, run->info.reply_count, run->info.need_replies, run->info.fail_count); if (run->info.need_replies) state->check_status |= DLMC_RUN_STATUS_WAITING; else state->check_status |= DLMC_RUN_STATUS_DONE; if (run->info.fail_count) state->check_status |= DLMC_RUN_STATUS_FAILED; if (!run->info.need_replies) { if (run->info.fail_count) { /* create a printable list of nodeids where the command failed */ pos = 0; len = sizeof(nodes_buf); memset(nodes_buf, 0, len); for (i = 0; i < run->node_count; i++) { if (!run->node_results[i].result) continue; ret = snprintf(nodes_buf + pos, len - pos, "%d ", run->node_results[i].nodeid); if (ret >= len - pos) break; pos += ret; } nodes_buf[len-1] = '\0'; log_error("run ended %s error from %d remote nodes with ids: %s", run->uuid, run->info.fail_count, nodes_buf); } else { log_error("run ended %s success from %d remote nodes", run->uuid, run->info.reply_count); } } if (!run->info.need_replies && (flags & DLMC_FLAG_RUN_CHECK_CLEAR)) clear_run(run); return 0; } static struct lockspace *create_ls(const char *name) { struct lockspace *ls; ls = malloc(sizeof(*ls)); if (!ls) goto out; memset(ls, 0, sizeof(struct lockspace)); strncpy(ls->name, 
name, DLM_LOCKSPACE_LEN);
	INIT_LIST_HEAD(&ls->changes);
	INIT_LIST_HEAD(&ls->node_history);
	INIT_LIST_HEAD(&ls->saved_messages);
	INIT_LIST_HEAD(&ls->plock_resources);
	ls->plock_resources_root = RB_ROOT;
#if 0
	INIT_LIST_HEAD(&ls->deadlk_nodes);
	INIT_LIST_HEAD(&ls->transactions);
	INIT_LIST_HEAD(&ls->resources);
#endif
	setup_lockspace_config(ls);
 out:
	return ls;
}

struct lockspace *find_ls(const char *name)
{
	struct lockspace *ls;

	list_for_each_entry(ls, &lockspaces, list) {
		if ((strlen(ls->name) == strlen(name)) &&
		    !strncmp(ls->name, name, strlen(name)))
			return ls;
	}
	return NULL;
}

struct lockspace *find_ls_id(uint32_t id)
{
	struct lockspace *ls;

	list_for_each_entry(ls, &lockspaces, list) {
		if (ls->global_id == id)
			return ls;
	}
	return NULL;
}

struct fs_reg {
	struct list_head list;
	char name[DLM_LOCKSPACE_LEN+1];
};

static int fs_register_check(char *name)
{
	struct fs_reg *fs;

	list_for_each_entry(fs, &fs_register_list, list) {
		if (!strcmp(name, fs->name))
			return 1;
	}
	return 0;
}

static int fs_register_add(char *name)
{
	struct fs_reg *fs;

	if (fs_register_check(name))
		return -EALREADY;

	fs = malloc(sizeof(struct fs_reg));
	if (!fs)
		return -ENOMEM;
	strncpy(fs->name, name, DLM_LOCKSPACE_LEN);
	list_add(&fs->list, &fs_register_list);
	return 0;
}

static void fs_register_del(char *name)
{
	struct fs_reg *fs;

	list_for_each_entry(fs, &fs_register_list, list) {
		if (!strcmp(name, fs->name)) {
			list_del(&fs->list);
			free(fs);
			return;
		}
	}
}

const char *dlm_mode_str(int mode)
{
	switch (mode) {
	case DLM_LOCK_IV:
		return "IV";
	case DLM_LOCK_NL:
		return "NL";
	case DLM_LOCK_CR:
		return "CR";
	case DLM_LOCK_CW:
		return "CW";
	case DLM_LOCK_PR:
		return "PR";
	case DLM_LOCK_PW:
		return "PW";
	case DLM_LOCK_EX:
		return "EX";
	}
	return "??";
}

/* recv "online" (join) and "offline" (leave) messages from dlm via uevents */

#define MAX_LINE_UEVENT 4096

static void process_uevent(int ci)
{
	const char *uevent_vals[Env_Last];
	struct lockspace *ls;
	char buf[MAX_LINE_UEVENT];
	int rv;

	memset(buf, 0, sizeof(buf));

 retry_recv:
	rv = recv(client[ci].fd, &buf, sizeof(buf), 0);
	if (rv < 0) {
		if (errno == EINTR)
			goto retry_recv;
		if (errno != EAGAIN)
			log_error("uevent recv error %d errno %d", rv, errno);
		return;
	}

	decode_uevent(buf, rv, uevent_vars, Env_Last, uevent_vals);

	if (!uevent_vals[Env_ACTION] || !uevent_vals[Env_DEVPATH] ||
	    !uevent_vals[Env_SUBSYSTEM] || !uevent_vals[Env_LOCKSPACE]) {
		log_debug("failed to validate uevent, action: %p, devpath: %p, subsystem: %p, lockspace: %p",
			  uevent_vals[Env_ACTION], uevent_vals[Env_DEVPATH],
			  uevent_vals[Env_SUBSYSTEM], uevent_vals[Env_LOCKSPACE]);
		return;
	}

	if (strcmp(uevent_vals[Env_SUBSYSTEM], "dlm")) {
		log_debug("uevent looks like dlm but did not come from the dlm subsystem");
		return;
	}

	log_debug("uevent action: %s, subsystem: %s, devpath: %s, lockspace: %s",
		  uevent_vals[Env_ACTION], uevent_vals[Env_SUBSYSTEM],
		  uevent_vals[Env_DEVPATH], uevent_vals[Env_LOCKSPACE]);

	rv = 0;

	if (!strcmp(uevent_vals[Env_ACTION], "online")) {
		ls = find_ls(uevent_vals[Env_LOCKSPACE]);
		if (ls) {
			rv = -EEXIST;
			goto out;
		}

		ls = create_ls(uevent_vals[Env_LOCKSPACE]);
		if (!ls) {
			rv = -ENOMEM;
			goto out;
		}

		if (fs_register_check(ls->name))
			ls->fs_registered = 1;

		rv = dlm_join_lockspace(ls);
		if (rv) {
			/* ls already freed */
			goto out;
		}
	} else if (!strcmp(uevent_vals[Env_ACTION], "offline")) {
		ls = find_ls(uevent_vals[Env_LOCKSPACE]);
		if (!ls) {
			rv = -ENOENT;
			goto out;
		}

		dlm_leave_lockspace(ls);
	}
 out:
	if (rv < 0)
		log_error("%s action: %s, subsystem: %s, devpath: %s, lockspace: %s - error %d errno %d",
			  __func__, uevent_vals[Env_ACTION],
			  uevent_vals[Env_SUBSYSTEM], uevent_vals[Env_DEVPATH],
			  uevent_vals[Env_LOCKSPACE], rv, errno);
}

static int setup_uevent(void)
{
	struct sockaddr_nl snl;
	int s, rv, val;

	s = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
	if (s < 0) {
		log_error("uevent netlink socket");
		return s;
	}

	/* man 7 netlink:
	 *
	 * However, reliable transmissions from kernel to user are impossible in
	 * any case. The kernel can't send a netlink message if the socket buffer
	 * is full: the message will be dropped and the kernel and the user-space
	 * process will no longer have the same view of kernel state. It is up to
	 * the application to detect when this happens (via the ENOBUFS error
	 * returned by recvmsg(2)) and resynchronize.
	 *
	 * To avoid ENOBUFS errors we set the netlink socket to reliable
	 * transmission mode, which can be turned on with the NETLINK_NO_ENOBUFS
	 * option. This option is available since kernel 2.6.30. If this setting
	 * fails we fall back to increasing the netlink socket receive buffer.
	 */
	val = 1;
	rv = setsockopt(s, SOL_NETLINK, NETLINK_NO_ENOBUFS, &val, sizeof(val));
	if (rv == -1) {
		/* Fallback handling if NETLINK_NO_ENOBUFS fails to set.
		 *
		 * To prevent ENOBUFS errors we just set the receive buffer to
		 * two megabytes as other applications do. This does not
		 * guarantee that we never receive ENOBUFS, but it makes that
		 * much less likely.
		 */
		val = DEFAULT_NETLINK_RCVBUF;

		log_error("uevent netlink NETLINK_NO_ENOBUFS errno %d, will set rcvbuf to %d bytes",
			  errno, val);

		rv = setsockopt(s, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
		if (rv == -1)
			log_error("uevent netlink SO_RCVBUF errno %d", errno);

		rv = setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, &val, sizeof(val));
		if (rv == -1)
			log_error("uevent netlink SO_RCVBUFFORCE errno %d", errno);
	}

	memset(&snl, 0, sizeof(snl));
	snl.nl_family = AF_NETLINK;
	snl.nl_pid = getpid();
	snl.nl_groups = 1;

	rv = bind(s, (struct sockaddr *) &snl, sizeof(snl));
	if (rv < 0) {
		log_error("uevent bind error %d errno %d", rv, errno);
		close(s);
		return rv;
	}

	return s;
}

static inline void init_header_name(struct dlmc_header *h,
				    const char *name, size_t len)
{
#pragma GCC diagnostic push
#if __GNUC__ >= 8
#pragma GCC diagnostic ignored "-Wstringop-truncation"
#endif
	strncpy(h->name, name, len);
#pragma GCC diagnostic pop
}

static void init_header(struct dlmc_header *h, int cmd, char *name, int result,
			int extra_len)
{
	memset(h, 0, sizeof(struct dlmc_header));

	h->magic = DLMC_MAGIC;
	h->version = DLMC_VERSION;
	h->len = sizeof(struct dlmc_header) + extra_len;
	h->command = cmd;
	h->data = result;

	if (name)
		init_header_name(h, name, DLM_LOCKSPACE_LEN);
}

static char copy_buf[LOG_DUMP_SIZE];

static void copy_run_list(char *buf, int *len)
{
	char tmp[1024];
	struct run *run;
	int ret, pos = 0;

	list_for_each_entry(run, &run_ops, list) {
		memset(tmp, 0, sizeof(tmp));
		snprintf(tmp, 1024, "run_uuid %s start_nodeid %d local_pid %d local_result %d need_replies %d reply_count %d fail_count %d flags %x\n",
			 run->uuid, run->info.start_nodeid, run->info.local_pid,
			 run->info.local_result, run->info.need_replies,
			 run->info.reply_count, run->info.fail_count,
			 run->info.flags);

		if (pos + strlen(tmp) >= LOG_DUMP_SIZE)
			break;

		ret = sprintf(buf + pos, "%s", tmp);
		pos += ret;

		memset(tmp, 0, sizeof(tmp));
		snprintf(tmp, 1024, "run_command %.1000s\n", run->command);

		if (pos + strlen(tmp) >= LOG_DUMP_SIZE)
			break;

		ret = sprintf(buf + pos, "%s", tmp);
		pos += ret;

		/* TODO: dump node results */
	}

	*len = pos;
}

static void query_dump_run(int fd)
{
	struct dlmc_header h;
	int len = 0;

	copy_run_list(copy_buf, &len);

	init_header(&h, DLMC_CMD_DUMP_RUN,
NULL, 0, len); send(fd, &h, sizeof(h), MSG_NOSIGNAL); if (len) send(fd, copy_buf, len, MSG_NOSIGNAL); } static void query_dump_debug(int fd) { struct dlmc_header h; int len = 0; copy_log_dump(copy_buf, &len); init_header(&h, DLMC_CMD_DUMP_DEBUG, NULL, 0, len); send(fd, &h, sizeof(h), MSG_NOSIGNAL); if (len) send(fd, copy_buf, len, MSG_NOSIGNAL); } static void copy_options(char *buf, int *len) { struct dlm_option *o; char tmp[256]; int i, ret, pos = 0, l = 0; for (i = 0; i < dlm_options_max; i++) { o = &dlm_options[i]; memset(tmp, 0, sizeof(tmp)); if (o->req_arg == req_arg_str) l = snprintf(tmp, 240, "%s=%s", o->name, o->use_str); else if (o->req_arg == req_arg_uint) l = snprintf(tmp, 240, "%s=%u", o->name, o->use_uint); else l = snprintf(tmp, 240, "%s=%d", o->name, o->use_int); if (o->dynamic_set) snprintf(tmp + l, 15, " (set_config)\n"); else if (o->cli_set) snprintf(tmp + l, 15, " (cli option)\n"); else if (o->file_set) snprintf(tmp + l, 15, " (dlm.conf)\n"); else snprintf(tmp + l, 15, "\n"); if (pos + strlen(tmp) >= LOG_DUMP_SIZE) break; ret = sprintf(buf + pos, "%s", tmp); pos += ret; } *len = pos; } static void query_dump_config(int fd) { struct dlmc_header h; int len = 0; copy_options(copy_buf, &len); init_header(&h, DLMC_CMD_DUMP_CONFIG, NULL, 0, len); send(fd, &h, sizeof(h), MSG_NOSIGNAL); if (len) send(fd, copy_buf, len, MSG_NOSIGNAL); } static void query_dump_log_plock(int fd) { struct dlmc_header h; int len = 0; copy_log_dump_plock(copy_buf, &len); init_header(&h, DLMC_CMD_DUMP_DEBUG, NULL, 0, len); send(fd, &h, sizeof(h), MSG_NOSIGNAL); if (len) send(fd, copy_buf, len, MSG_NOSIGNAL); } static void query_dump_plocks(int fd, char *name) { struct lockspace *ls; struct dlmc_header h; int len = 0; int rv; ls = find_ls(name); if (!ls) { rv = -ENOENT; goto out; } rv = copy_plock_state(ls, copy_buf, &len); out: init_header(&h, DLMC_CMD_DUMP_PLOCKS, name, rv, len); send(fd, &h, sizeof(h), MSG_NOSIGNAL); if (len) send(fd, copy_buf, len, MSG_NOSIGNAL); } /* combines a header and the data and sends it back to the client in a single do_write() call */ static void do_reply(int fd, int cmd, char *name, int result, int option, char *buf, int buflen) { struct dlmc_header *h; char *reply; int reply_len; reply_len = sizeof(struct dlmc_header) + buflen; reply = malloc(reply_len); if (!reply) return; memset(reply, 0, reply_len); h = (struct dlmc_header *)reply; init_header(h, cmd, name, result, buflen); h->option = option; if (buf && buflen) memcpy(reply + sizeof(struct dlmc_header), buf, buflen); do_write(fd, reply, reply_len); free(reply); } static void query_lockspace_info(int fd, char *name) { struct lockspace *ls; struct dlmc_lockspace lockspace; int rv; ls = find_ls(name); if (!ls) { rv = -ENOENT; goto out; } memset(&lockspace, 0, sizeof(lockspace)); rv = set_lockspace_info(ls, &lockspace); out: do_reply(fd, DLMC_CMD_LOCKSPACE_INFO, name, rv, 0, (char *)&lockspace, sizeof(lockspace)); } static void query_node_info(int fd, char *name, int nodeid) { struct lockspace *ls; struct dlmc_node node; int rv; ls = find_ls(name); if (!ls) { rv = -ENOENT; goto out; } memset(&node, 0, sizeof(node)); rv = set_node_info(ls, nodeid, &node); out: do_reply(fd, DLMC_CMD_NODE_INFO, name, rv, 0, (char *)&node, sizeof(node)); } static void query_lockspaces(int fd) { int ls_count = 0; struct dlmc_lockspace *lss = NULL; int rv, result; rv = set_lockspaces(&ls_count, &lss); if (rv < 0) { result = rv; ls_count = 0; goto out; } result = ls_count; out: do_reply(fd, DLMC_CMD_LOCKSPACES, NULL, result, 0, (char *)lss, 
ls_count * sizeof(struct dlmc_lockspace)); if (lss) free(lss); } static void query_lockspace_nodes(int fd, char *name, int option, int max) { struct lockspace *ls; int node_count = 0; struct dlmc_node *nodes = NULL; int rv, result; ls = find_ls(name); if (!ls) { result = -ENOENT; node_count = 0; goto out; } rv = set_lockspace_nodes(ls, option, &node_count, &nodes); if (rv < 0) { result = rv; node_count = 0; goto out; } /* node_count is the number of structs copied/returned; the caller's max may be less than that, in which case we copy as many as they asked for and return -E2BIG */ if (node_count > max) { result = -E2BIG; node_count = max; } else { result = node_count; } out: do_reply(fd, DLMC_CMD_LOCKSPACE_NODES, name, result, 0, (char *)nodes, node_count * sizeof(struct dlmc_node)); if (nodes) free(nodes); } static void process_connection(int ci) { struct dlmc_header h; char uuid_str[RUN_UUID_LEN]; char *extra = NULL; int rv, extra_len = 0; struct lockspace *ls; struct dlmc_run_check_state state; memset(uuid_str, 0, sizeof(uuid_str)); rv = do_read(client[ci].fd, &h, sizeof(h)); if (rv < 0) { log_debug("connection %d read error %d", ci, rv); goto out; } if (h.magic != DLMC_MAGIC) { log_debug("connection %d magic error %x", ci, h.magic); goto out; } if ((h.version & 0xFFFF0000) != (DLMC_VERSION & 0xFFFF0000)) { log_debug("connection %d version error %x", ci, h.version); goto out; } if (h.len > sizeof(h)) { extra_len = h.len - sizeof(h); extra = malloc(extra_len); if (!extra) { log_error("process_connection no mem %d", extra_len); goto out; } memset(extra, 0, extra_len); rv = do_read(client[ci].fd, extra, extra_len); if (rv < 0) { log_debug("connection %d extra read error %d", ci, rv); goto out; } } switch (h.command) { case DLMC_CMD_FENCE_ACK: fence_ack_node(atoi(h.name)); break; case DLMC_CMD_FS_REGISTER: if (opt(enable_fscontrol_ind)) { rv = fs_register_add(h.name); ls = find_ls(h.name); if (ls) ls->fs_registered = 1; } else { rv = -EOPNOTSUPP; } do_reply(client[ci].fd, DLMC_CMD_FS_REGISTER, h.name, rv, 0, NULL, 0); break; case DLMC_CMD_FS_UNREGISTER: fs_register_del(h.name); ls = find_ls(h.name); if (ls) ls->fs_registered = 0; break; case DLMC_CMD_FS_NOTIFIED: ls = find_ls(h.name); if (ls) rv = set_fs_notified(ls, h.data); else rv = -ENOENT; /* pass back the nodeid provided by caller in option field */ do_reply(client[ci].fd, DLMC_CMD_FS_NOTIFIED, h.name, rv, h.data, NULL, 0); break; case DLMC_CMD_RUN_START: if (!extra_len) rv = -EINVAL; else rv = start_run_operation(extra, extra_len, h.data, h.flags, uuid_str); do_reply(client[ci].fd, DLMC_CMD_RUN_START, uuid_str, rv, 0, NULL, 0); client_dead(ci); break; case DLMC_CMD_RUN_CHECK: memset(&state, 0, sizeof(state)); rv = check_run_operation(h.name, h.flags, &state); do_reply(client[ci].fd, DLMC_CMD_RUN_CHECK, NULL, rv, 0, (char *)&state, sizeof(state)); /* dlmc_run_check may retry checks on the same connection */ break; #if 0 case DLMC_CMD_DEADLOCK_CHECK: ls = find_ls(h.name); if (ls) send_cycle_start(ls); client_dead(ci); break; #endif case DLMC_CMD_RELOAD_CONFIG: set_opt_file(1); break; case DLMC_CMD_SET_CONFIG: if (extra_len) set_opt_online(extra, extra_len); break; default: log_error("process_connection %d unknown command %d", ci, h.command); } out: if (extra) free(extra); } static void process_listener(int ci) { int fd, i; fd = accept(client[ci].fd, NULL, NULL); if (fd < 0) { log_error("process_listener: accept error %d %d", fd, errno); return; } i = client_add(fd, process_connection, NULL); log_debug("client connection %d fd %d", i, 
fd);
}

static int setup_listener(const char *sock_path)
{
	struct sockaddr_un addr;
	socklen_t addrlen;
	int rv, s;

	/* we listen for new client connections on socket s */

	s = socket(AF_LOCAL, SOCK_STREAM, 0);
	if (s < 0) {
		log_error("socket error %d %d", s, errno);
		return s;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sun_family = AF_LOCAL;
	/* abstract socket: sun_path[0] stays '\0', the name starts at [1] */
	strcpy(&addr.sun_path[1], sock_path);
	addrlen = sizeof(sa_family_t) + strlen(addr.sun_path+1) + 1;

	rv = bind(s, (struct sockaddr *) &addr, addrlen);
	if (rv < 0) {
		log_error("bind error %d %d", rv, errno);
		close(s);
		return rv;
	}

	rv = listen(s, 5);
	if (rv < 0) {
		log_error("listen error %d %d", rv, errno);
		close(s);
		return rv;
	}
	return s;
}

static void query_lock(void)
{
	pthread_mutex_lock(&query_mutex);
}

static void query_unlock(void)
{
	pthread_mutex_unlock(&query_mutex);
}

/* This runs in a separate thread, so we have to be careful: don't call
   log_ functions from here. We need a thread to process queries because
   the main thread may block for long periods when writing to sysfs to
   stop dlm-kernel (and maybe other places). */

static void *process_queries(void *arg)
{
	struct dlmc_header h;
	int s, f, rv;

	rv = setup_listener(DLMC_QUERY_SOCK_PATH);
	if (rv < 0)
		return NULL;

	s = rv;

	for (;;) {
		f = accept(s, NULL, NULL);
		if (f < 0)
			return NULL;

		rv = do_read(f, &h, sizeof(h));
		if (rv < 0) {
			goto out;
		}

		if (h.magic != DLMC_MAGIC) {
			goto out;
		}

		if ((h.version & 0xFFFF0000) != (DLMC_VERSION & 0xFFFF0000)) {
			goto out;
		}

		query_lock();

		switch (h.command) {
		case DLMC_CMD_DUMP_DEBUG:
			query_dump_debug(f);
			break;
		case DLMC_CMD_DUMP_CONFIG:
			query_dump_config(f);
			break;
		case DLMC_CMD_DUMP_LOG_PLOCK:
			query_dump_log_plock(f);
			break;
		case DLMC_CMD_DUMP_PLOCKS:
			query_dump_plocks(f, h.name);
			break;
		case DLMC_CMD_LOCKSPACE_INFO:
			query_lockspace_info(f, h.name);
			break;
		case DLMC_CMD_NODE_INFO:
			query_node_info(f, h.name, h.data);
			break;
		case DLMC_CMD_LOCKSPACES:
			query_lockspaces(f);
			break;
		case DLMC_CMD_LOCKSPACE_NODES:
			query_lockspace_nodes(f, h.name, h.option, h.data);
			break;
		case DLMC_CMD_DUMP_STATUS:
			send_state_daemon(f);
			send_state_daemon_nodes(f);
			send_state_startup_nodes(f);
			break;
		case DLMC_CMD_DUMP_RUN:
			query_dump_run(f);
			break;
		default:
			break;
		}
		query_unlock();
 out:
		close(f);
	}
}

static int setup_queries(void)
{
	int rv;

	pthread_mutex_init(&query_mutex, NULL);

	rv = pthread_create(&query_thread, NULL, process_queries, NULL);
	if (rv) {
		/* pthread_create returns a positive error number, not -1 */
		log_error("can't create query thread");
		return rv;
	}
	return 0;
}

/* The dlm in kernels before 2.6.28 does not have the monitor device. We
   keep this fd open as long as we're running. If we exit/terminate while
   lockspaces exist in the kernel, the kernel will detect a close on this
   fd and stop the lockspaces.
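
   In other words, holding /dev/misc/dlm-monitor open for the lifetime of
   the daemon is what lets the kernel tell "dlm_controld exited or crashed"
   (final close, so the lockspaces must be stopped rather than left running
   uncontrolled) apart from the daemon merely being busy.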
*/ static void setup_monitor(void) { if (!monitor_minor) return; kernel_monitor_fd = open("/dev/misc/dlm-monitor", O_RDONLY); log_debug("/dev/misc/dlm-monitor fd %d", kernel_monitor_fd); } void cluster_dead(int ci) { if (!cluster_down) log_error("cluster is down, exiting"); daemon_quit = 1; cluster_down = 1; } static int loop(void) { struct lockspace *ls; int poll_timeout = -1; int rv, i; void (*workfn) (int ci); void (*deadfn) (int ci); rv = setup_queries(); if (rv < 0) goto out; rv = setup_listener(DLMC_SOCK_PATH); if (rv < 0) goto out; client_add(rv, process_listener, NULL); rv = setup_cluster_cfg(); if (rv < 0) goto out; if (rv > 0) client_add(rv, process_cluster_cfg, cluster_dead); rv = check_uncontrolled_lockspaces(); if (rv < 0) goto out; /* * unfence needs to happen after checking for uncontrolled dlm kernel * state (for which we are probably currently fenced, the state must * be cleared by a reboot). unfence needs to happen before joining * the daemon cpg, after which it needs to be possible for someone to * fence us. */ rv = unfence_node(our_nodeid); if (rv < 0) goto out; rv = setup_node_config(); if (rv < 0) goto out; rv = setup_cluster(); if (rv < 0) goto out; client_add(rv, process_cluster, cluster_dead); rv = setup_misc_devices(); if (rv < 0) goto out; rv = setup_configfs_options(); if (rv < 0) goto out; setup_monitor(); rv = setup_configfs_members(); /* calls update_cluster() */ if (rv < 0) goto out; rv = setup_uevent(); if (rv < 0) goto out; client_add(rv, process_uevent, NULL); rv = setup_cpg_daemon(); if (rv < 0) goto out; client_add(rv, process_cpg_daemon, cluster_dead); rv = set_protocol(); if (rv < 0) goto out; #if 0 if (opt(enable_deadlk_ind)) { rv = setup_netlink(); if (rv < 0) goto out; client_add(rv, process_netlink, NULL); setup_deadlock(); } #endif rv = setup_plocks(); if (rv < 0) goto out; plock_fd = rv; plock_ci = client_add(rv, process_plocks, NULL); if (opt(enable_helper_ind)) helper_ci = client_add(helper_status_fd, process_helper, helper_dead); #ifdef USE_SD_NOTIFY sd_notify(0, "READY=1"); #endif /* We want to wait for our protocol to be set before we start to process fencing. 
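   (set_protocol() above is what establishes that protocol agreement with
   the other daemons, so fence requests are deferred until it completes.)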
*/ daemon_fence_allow = 1; for (;;) { rv = poll(pollfd, client_maxi + 1, poll_timeout); if (rv == -1 && errno == EINTR) { if (daemon_quit && list_empty(&lockspaces)) { rv = 0; goto out; } if (daemon_quit) { log_error("shutdown ignored, active lockspaces"); daemon_quit = 0; } continue; } if (rv < 0) { log_error("poll errno %d", errno); goto out; } query_lock(); for (i = 0; i <= client_maxi; i++) { if (client[i].fd < 0) continue; if (pollfd[i].revents & POLLIN) { workfn = client[i].workfn; workfn(i); } if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) { deadfn = client[i].deadfn; deadfn(i); } } query_unlock(); if (daemon_quit) break; query_lock(); poll_timeout = -1; if (retry_fencing) { process_fencing_changes(); poll_timeout = 1000; } if (poll_lockspaces || poll_fs) { process_lockspace_changes(); poll_timeout = 1000; } if (poll_ignore_plock) { if (!limit_plocks()) { poll_ignore_plock = 0; client_back(plock_ci, plock_fd); } poll_timeout = 1000; } if (poll_drop_plock) { drop_resources_all(); if (poll_drop_plock) poll_timeout = 1000; } query_unlock(); } out: log_debug("shutdown"); close_plocks(); close_cpg_daemon(); clear_configfs(); close_cluster(); close_cluster_cfg(); list_for_each_entry(ls, &lockspaces, list) log_error("abandoned lockspace %s", ls->name); /* must be end */ close_logging(); return rv; } static int lockfile(const char *name) { char path[PATH_MAX]; char buf[16]; struct flock lock; mode_t old_umask; int fd, rv; old_umask = umask(0022); rv = mkdir(SYS_VARDIR, 0775); if (rv < 0 && errno != EEXIST) { umask(old_umask); return rv; } rv = mkdir(SYS_RUNDIR, 0775); if (rv < 0 && errno != EEXIST) { umask(old_umask); return rv; } rv = mkdir(RUNDIR, 0775); if (rv < 0 && errno != EEXIST) { umask(old_umask); return rv; } umask(old_umask); snprintf(path, PATH_MAX, "%s/%s", RUNDIR, name); fd = open(path, O_CREAT|O_WRONLY|O_CLOEXEC, 0644); if (fd < 0) { log_error("lockfile open error %s: %s", path, strerror(errno)); return -1; } lock.l_type = F_WRLCK; lock.l_start = 0; lock.l_whence = SEEK_SET; lock.l_len = 0; rv = fcntl(fd, F_SETLK, &lock); if (rv < 0) { log_error("lockfile setlk error %s: %s", path, strerror(errno)); goto fail; } rv = ftruncate(fd, 0); if (rv < 0) { log_error("lockfile truncate error %s: %s", path, strerror(errno)); goto fail; } memset(buf, 0, sizeof(buf)); snprintf(buf, sizeof(buf), "%d\n", getpid()); rv = write(fd, buf, strlen(buf)); if (rv <= 0) { log_error("lockfile write error %s: %s", path, strerror(errno)); goto fail; } return fd; fail: close(fd); return -1; } static void unlink_lockfile(int fd, const char *dir, const char *name) { char path[PATH_MAX]; snprintf(path, PATH_MAX, "%s/%s", dir, name); unlink(path); close(fd); } static const char *req_arg_s(int a) { switch (a) { case no_arg: return ""; case req_arg_bool: return "0|1"; case req_arg_int: return ""; case req_arg_str: return ""; default: return ""; } } static void print_usage(void) { struct dlm_option *o; int i; printf("Usage:\n"); printf("\n"); printf("dlm_controld [options]\n"); printf("\n"); printf("Option [arg]\n"); printf("Description [default]\n"); printf("\n"); for (i = 0; i < dlm_options_max; i++) { o = &dlm_options[i]; /* don't advertise options with no description */ if (!strlen(o->desc)) continue; printf(" --%s", o->name); if (o->letter) { printf(" | -%c", o->letter); if (o->req_arg) printf(" %s", req_arg_s(o->req_arg)); } else { if (o->req_arg) printf(" %s", req_arg_s(o->req_arg)); } printf("\n"); printf(" %s", o->desc); if (o->req_arg == req_arg_str) printf(" [%s]\n", o->default_str ? 
o->default_str : ""); else if (o->req_arg == req_arg_int) printf(" [%d]\n", o->default_int); else if (o->req_arg == req_arg_bool) printf(" [%d]\n", o->default_int); else if (o->req_arg == req_arg_uint) printf(" [%u]\n", o->default_uint); else if (o->req_arg == no_arg && !o->default_int) printf(" [0]\n"); else printf("\n"); printf("\n"); } } static void set_opt_default(int ind, const char *name, char letter, int arg_type, int default_int, const char *default_str, unsigned int default_uint, char reload, const char *desc) { dlm_options[ind].name = name; dlm_options[ind].letter = letter; dlm_options[ind].req_arg = arg_type; dlm_options[ind].desc = desc; dlm_options[ind].reload = reload; dlm_options[ind].default_int = default_int; dlm_options[ind].default_str = default_str; dlm_options[ind].default_uint = default_uint; dlm_options[ind].use_int = default_int; dlm_options[ind].use_str = (char *)default_str; dlm_options[ind].use_uint = default_uint; } static void set_opt_defaults(void) { set_opt_default(daemon_debug_ind, "daemon_debug", 'D', req_arg_bool, 0, NULL, 0, 1, "enable debugging to stderr and don't fork"); set_opt_default(foreground_ind, "foreground", '\0', req_arg_bool, 0, NULL, 0, 0, "don't fork"); set_opt_default(log_debug_ind, "log_debug", 'K', req_arg_bool, 0, NULL, 0, 1, "enable kernel dlm debugging messages"); set_opt_default(protocol_ind, "protocol", 'r', req_arg_str, -1, "detect", 0, 0, "dlm kernel lowcomms protocol: tcp, sctp, detect"); set_opt_default(port_ind, "port", 'R', req_arg_uint, -1, NULL, 21064, 0, "dlm kernel lowcomms protocol port"); set_opt_default(mark_ind, "mark", '\0', req_arg_uint, 0, NULL, 0, 0, "set mark value for DLM if not explicit by nodeid specified"); set_opt_default(debug_logfile_ind, "debug_logfile", 'L', req_arg_bool, 0, NULL, 0, 1, "write debugging to log file"); set_opt_default(enable_fscontrol_ind, "enable_fscontrol", '\0', req_arg_bool, 0, NULL, 0, 0, ""); /* do not advertise */ set_opt_default(enable_plock_ind, "enable_plock", 'p', req_arg_bool, 1, NULL, 0, 0, "enable/disable posix lock support for cluster fs"); set_opt_default(plock_debug_ind, "plock_debug", 'P', req_arg_bool, 0, NULL, 0, 1, "enable plock debugging"); set_opt_default(plock_rate_limit_ind, "plock_rate_limit", 'l', req_arg_int, 0, NULL, 0, 1, "limit rate of plock operations (0 for none)"); set_opt_default(plock_ownership_ind, "plock_ownership", 'o', req_arg_bool, 0, NULL, 0, 0, "enable/disable plock ownership"); set_opt_default(drop_resources_time_ind, "drop_resources_time", 't', req_arg_int, 10000, NULL, 0, 1, "plock ownership drop resources time (milliseconds)"); set_opt_default(drop_resources_count_ind, "drop_resources_count", 'c', req_arg_int, 10, NULL, 0, 1, "plock ownership drop resources count"); set_opt_default(drop_resources_age_ind, "drop_resources_age", 'a', req_arg_int, 10000, NULL, 0, 1, "plock ownership drop resources age (milliseconds)"); set_opt_default(post_join_delay_ind, "post_join_delay", 'j', req_arg_int, 30, NULL, 0, 1, "seconds to delay fencing after cluster join"); set_opt_default(enable_fencing_ind, "enable_fencing", 'f', req_arg_bool, 1, NULL, 0, 0, "enable/disable fencing"); set_opt_default(enable_concurrent_fencing_ind, "enable_concurrent_fencing", '\0', req_arg_bool, 0, NULL, 0, 0, "enable/disable concurrent fencing"); set_opt_default(enable_startup_fencing_ind, "enable_startup_fencing", 's', req_arg_bool, 1, NULL, 0, 0, "enable/disable startup fencing"); set_opt_default(repeat_failed_fencing_ind, "repeat_failed_fencing", '\0', req_arg_bool, 1, NULL, 0, 
1, "enable/disable retrying after fencing fails"); set_opt_default(enable_quorum_fencing_ind, "enable_quorum_fencing", 'q', req_arg_bool, 1, NULL, 0, 1, "enable/disable quorum requirement for fencing"); set_opt_default(enable_quorum_lockspace_ind, "enable_quorum_lockspace", '\0', req_arg_bool, 1, NULL, 0, 1, "enable/disable quorum requirement for lockspace operations"); set_opt_default(enable_helper_ind, "enable_helper", '\0', req_arg_bool, 1, NULL, 0, 0, "enable/disable helper process for running commands"); set_opt_default(help_ind, "help", 'h', no_arg, -1, NULL, 0, 0, "print this help, then exit"); set_opt_default(version_ind, "version", 'V', no_arg, -1, NULL, 0, 0, "Print program version information, then exit"); } int get_ind_name(char *s) { char name[PATH_MAX]; char *p = s; int i; memset(name, 0, sizeof(name)); for (i = 0; i < strlen(s); i++) { if (*p == '=') break; if (*p == ' ') break; name[i] = *p; p++; } for (i = 0; i < dlm_options_max; i++) { if (!strcmp(dlm_options[i].name, name)) return i; } return -1; } static int get_ind_letter(char c) { int i; for (i = 0; i < dlm_options_max; i++) { if (dlm_options[i].letter == c) return i; } return -1; } struct dlm_option *get_dlm_option(char *name) { int i; i = get_ind_name(name); if (i < 0) return NULL; return &dlm_options[i]; } static void set_opt_cli(int argc, char **argv) { struct dlm_option *o; char *arg1, *p, *arg_str, *endptr; char bool_str[] = "1"; char bundled_letters[8]; int b, blc = 0, blc_max = 8; int debug_options = 0; int i, ind, bundled; if (argc < 2) return; arg1 = argv[1]; if (!strcmp(arg1, "help") || !strcmp(arg1, "--help") || !strcmp(arg1, "-h")) { print_usage(); exit(EXIT_SUCCESS); } if (!strcmp(arg1, "version") || !strcmp(arg1, "--version") || !strcmp(arg1, "-V")) { printf("dlm_controld %s (built %s %s)\n", RELEASE_VERSION, __DATE__, __TIME__); printf("%s\n", REDHAT_COPYRIGHT); exit(EXIT_SUCCESS); } for (i = 1; i < argc; ) { p = argv[i++]; if (!strcmp(p, "--debug_options")) { debug_options = 1; continue; } if (p[0] == '-' && p[1] == '-') ind = get_ind_name(p + 2); else if (p[0] == '-') ind = get_ind_letter(p[1]); else { fprintf(stderr, "unknown option arg %s\n", p); exit(EXIT_FAILURE); } if (ind < 0) { fprintf(stderr, "unknown option %s\n", p); exit(EXIT_FAILURE); } o = &dlm_options[ind]; o->cli_set++; if (!o->req_arg || (o->req_arg == req_arg_bool)) { bundled = 0; /* current for no_arg type, there is not possible to have bundled options. * for req_arg_bool, bundled options, e.g. -DKP. all treat as "true". 
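 * For example, a hypothetical invocation
 *
 *	dlm_controld -DKP
 *
 * is handled exactly like "-D 1 -K 1 -P 1": each letter is saved here
 * and its use_int forced to 1 after the main argv loop.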
* below code save bundled, arg-less, single letters */ if ((p[0] == '-') && isalpha(p[1]) && (strlen(p) > 2)) { for (b = 2; b < strlen(p) && blc < blc_max; b++) { if (!isalpha(p[b])) break; bundled_letters[blc++] = p[b]; bundled = 1; } } if (bundled) { /* "-x" has same effect as "-x 1" */ o->cli_int = 1; o->use_int = 1; continue; } } arg_str = NULL; if (strstr(p, "=")) { /* arg starts after = for name or letter */ arg_str = strstr(p, "=") + 1; } else if (strlen(p) > 2 && isalpha(p[1]) && isdigit(p[2])) { /* arg with no space between letter and digits */ arg_str = p + 2; } else { /* space separates arg from name or letter */ if (o->req_arg == req_arg_bool) { /* bool type treat empty arg as true */ if (i >= argc || argv[i][0] == '-') arg_str = bool_str; else arg_str = argv[i++]; } else { if (i >= argc) { fprintf(stderr, "option %s no arg\n", p); exit(EXIT_FAILURE); } arg_str = argv[i++]; } } if (!arg_str || arg_str[0] == '-' || arg_str[0] == '\0') { fprintf(stderr, "option %s requires arg\n", p); exit(EXIT_FAILURE); } if ((o->req_arg != req_arg_str) && !strtol(arg_str, &endptr, 10)) { if (endptr == arg_str) { fprintf(stderr, "option %s requires digit number\n", p); exit(EXIT_FAILURE); } } if (o->req_arg == req_arg_str) { o->cli_str = strdup(arg_str); o->use_str = o->cli_str; } else if (o->req_arg == req_arg_int) { o->cli_int = atoi(arg_str); o->use_int = o->cli_int; } else if (o->req_arg == req_arg_bool) { o->cli_int = atoi(arg_str) ? 1 : 0; o->use_int = o->cli_int; } else if (o->req_arg == req_arg_uint) { o->cli_uint = strtoul(arg_str, NULL, 0); o->use_uint = o->cli_uint; } } /* process bundled letters saved above */ for (i = 0; i < blc; i++) { ind = get_ind_letter(bundled_letters[i]); if (ind < 0) { fprintf(stderr, "unknown option char %c\n", bundled_letters[i]); exit(EXIT_FAILURE); } /* bundled letter must be bool type, treat it with "true" value */ o = &dlm_options[ind]; o->cli_set++; o->cli_int = 1; o->use_int = 1; } if (debug_options && opt(daemon_debug_ind)) { for (i = 0; i < dlm_options_max; i++) { o = &dlm_options[i]; printf("%-25s cli_set %d cli_int %d cli_str %s use_int %d use_str %s\n", o->name, o->cli_set, o->cli_int, o->cli_str, o->use_int, o->use_str); } } if (getenv("DLM_CONTROLD_DEBUG")) { dlm_options[daemon_debug_ind].use_int = 1; } } #if 0 /* When this is used, the systemd service file needs ControlGroup=cpu:/ */ static void set_scheduler(void) { struct sched_param sched_param; int rv; rv = sched_get_priority_max(SCHED_RR); if (rv != -1) { sched_param.sched_priority = rv; rv = sched_setscheduler(0, SCHED_RR, &sched_param); if (rv == -1) log_error("could not set SCHED_RR priority %d err %d", sched_param.sched_priority, errno); } else { log_error("could not get maximum scheduler priority err %d", errno); } } #endif int main(int argc, char **argv) { struct sigaction act; int fd, rv; /* * config priority: cli, config file, default * - explicit cli setting will override default, * - explicit file setting will override default * - explicit file setting will not override explicit cli setting * * "dlm reload_config" will trigger to reload config file, and * reload action also follows the rule: not override explicit * cli setting */ set_opt_defaults(); set_opt_cli(argc, argv); set_opt_file(0); rv = node_config_init(CONF_FILE_PATH); if (rv) return 1; strcpy(fence_all_device.name, "fence_all"); strcpy(fence_all_device.agent, "dlm_stonith"); fence_all_device.unfence = 0; INIT_LIST_HEAD(&lockspaces); INIT_LIST_HEAD(&fs_register_list); INIT_LIST_HEAD(&run_ops); init_daemon(); if 
(!opt(daemon_debug_ind) && !opt(foreground_ind)) { if (daemon(0, 0) < 0) { perror("daemon error"); exit(EXIT_FAILURE); } } init_logging(); fd = lockfile(RUN_FILE_NAME); if (fd < 0) return 1; log_level(NULL, LOG_INFO, "dlm_controld %s started", RELEASE_VERSION); if (opt(enable_helper_ind)) setup_helper(); memset(&act, 0, sizeof(act)); act.sa_handler = sigterm_handler; rv = sigaction(SIGTERM, &act, NULL); if (rv < 0) goto out; rv = sigaction(SIGINT, &act, NULL); if (rv < 0) goto out; memset(&act, 0, sizeof(act)); act.sa_handler = SIG_IGN; rv = sigaction(SIGHUP, &act, NULL); if (rv < 0) goto out; memset(&act, 0, sizeof(act)); act.sa_handler = sigchld_handler; act.sa_flags = SA_NOCLDSTOP; rv = sigaction(SIGCHLD, &act, NULL); if (rv < 0) goto out; /* set_scheduler(); */ rv = loop(); out: unlink_lockfile(fd, RUNDIR, RUN_FILE_NAME); return rv < 0 ? 1 : 0; } dlm-4.3.0/dlm_controld/member.c000066400000000000000000000247371461150666200164110ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include "dlm_daemon.h" #include #include #include #include #include static corosync_cfg_handle_t ch; static quorum_handle_t qh; static uint32_t old_nodes[MAX_NODES]; static int old_node_count; static uint32_t quorum_nodes[MAX_NODES]; static int quorum_node_count; static struct list_head cluster_nodes; static uint32_t leavejoin_nodes[MAX_NODES]; static int leavejoin_count; struct node_cluster { struct list_head list; int nodeid; uint64_t cluster_add_time; uint64_t cluster_rem_time; }; static struct node_cluster *get_cluster_node(int nodeid, int create) { struct node_cluster *node; list_for_each_entry(node, &cluster_nodes, list) { if (node->nodeid == nodeid) return node; } if (!create) return NULL; node = malloc(sizeof(struct node_cluster)); if (!node) return NULL; memset(node, 0, sizeof(struct node_cluster)); node->nodeid = nodeid; list_add(&node->list, &cluster_nodes); return node; } static void add_cluster_node(int nodeid, uint64_t now) { struct node_cluster *node; node = get_cluster_node(nodeid, 1); if (!node) return; node->cluster_add_time = now; } static void rem_cluster_node(int nodeid, uint64_t now) { struct node_cluster *node; node = get_cluster_node(nodeid, 0); if (!node) return; node->cluster_rem_time = now; } uint64_t cluster_add_time(int nodeid) { struct node_cluster *node; node = get_cluster_node(nodeid, 0); if (!node) return 0; return node->cluster_add_time; } static int is_member(uint32_t *node_list, int count, uint32_t nodeid) { int i; for (i = 0; i < count; i++) { if (node_list[i] == nodeid) return 1; } return 0; } static int is_old_member(uint32_t nodeid) { return is_member(old_nodes, old_node_count, nodeid); } int is_cluster_member(uint32_t nodeid) { return is_member(quorum_nodes, quorum_node_count, nodeid); } static int is_leavejoin_node(uint32_t nodeid) { return is_member(leavejoin_nodes, leavejoin_count, nodeid); } static void quorum_nodelist_callback(quorum_handle_t cbhandle, struct quorum_ring_id ring_id, uint32_t member_list_entries, const uint32_t *member_list, uint32_t joined_list_entries, const uint32_t *joined_list, uint32_t left_list_entries, const uint32_t *left_list) { uint64_t ring_seq = ring_id.seq; int i, j; for (i = 0; i < left_list_entries; i++) { log_debug("cluster left_list %u seq %llu", left_list[i], (unsigned long long)ring_seq); } 
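
	/*
	 * A nodeid that shows up in both left_list and joined_list left and
	 * rejoined the cluster within a single ring transition. The nested
	 * loop further below records it in leavejoin_nodes so that
	 * quorum_callback() can tear down and re-create its configfs node
	 * (rem_cluster_node/del_configfs_node followed by
	 * add_cluster_node/add_configfs_node).
	 */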
for (j = 0; j < joined_list_entries; j++) { log_debug("cluster joined_list %u seq %llu", joined_list[j], (unsigned long long)ring_seq); } for (i = 0; i < left_list_entries; i++) { for (j = 0; j < joined_list_entries; j++) { if (joined_list[j] == left_list[i]) { log_debug("cluster node %d left and joined", joined_list[j]); if (!is_leavejoin_node(joined_list[j])) leavejoin_nodes[leavejoin_count++] = joined_list[j]; } } } } static void quorum_callback(quorum_handle_t cbhandle, uint32_t quorate, struct quorum_ring_id ring_id, uint32_t node_list_entries, const uint32_t *node_list) { corosync_cfg_node_address_t addrs[MAX_NODE_ADDRESSES]; corosync_cfg_node_address_t *addrptr = addrs; const struct node_config *nc; cs_error_t err; int i, j, num_addrs; uint32_t nodeid; uint64_t ring_seq = ring_id.seq; uint64_t now = monotime(); if (!cluster_joined_monotime) { cluster_joined_monotime = now; cluster_joined_walltime = time(NULL); } if (!cluster_quorate && quorate) cluster_quorate_monotime = now; cluster_quorate = quorate; cluster_ringid_seq = ring_seq; log_debug("cluster quorum %u seq %llu nodes %u", cluster_quorate, (unsigned long long)cluster_ringid_seq, node_list_entries); old_node_count = quorum_node_count; memcpy(&old_nodes, &quorum_nodes, sizeof(old_nodes)); quorum_node_count = 0; memset(&quorum_nodes, 0, sizeof(quorum_nodes)); memset(&addrs, 0, sizeof(addrs)); for (i = 0; i < node_list_entries; i++) quorum_nodes[quorum_node_count++] = node_list[i]; for (i = 0; i < old_node_count; i++) { if (!is_cluster_member(old_nodes[i])) { log_debug("cluster node %u removed seq %llu", old_nodes[i], (unsigned long long)cluster_ringid_seq); rem_cluster_node(old_nodes[i], now); del_configfs_node(old_nodes[i]); } } for (i = 0; i < leavejoin_count; i++) { nodeid = leavejoin_nodes[i]; log_debug("cluster node %u leavejoin seq %llu", nodeid, (unsigned long long)cluster_ringid_seq); /* remove */ rem_cluster_node(nodeid, now); del_configfs_node(nodeid); /* add */ add_cluster_node(nodeid, now); fence_delay_begin = now; err = corosync_cfg_get_node_addrs(ch, nodeid, MAX_NODE_ADDRESSES, &num_addrs, addrs); if (err != CS_OK) { log_error("corosync_cfg_get_node_addrs failed nodeid %u", nodeid); continue; } nc = node_config_get(nodeid); for (j = 0; j < num_addrs; j++) { add_configfs_node(nodeid, addrptr[j].address, addrptr[j].address_length, (nodeid == our_nodeid), nc->mark); } } for (i = 0; i < quorum_node_count; i++) { if (is_leavejoin_node(quorum_nodes[i])) continue; if (!is_old_member(quorum_nodes[i])) { log_debug("cluster node %u added seq %llu", quorum_nodes[i], (unsigned long long)cluster_ringid_seq); add_cluster_node(quorum_nodes[i], now); fence_delay_begin = now; err = corosync_cfg_get_node_addrs(ch, quorum_nodes[i], MAX_NODE_ADDRESSES, &num_addrs, addrs); if (err != CS_OK) { log_error("corosync_cfg_get_node_addrs failed " "nodeid %u", quorum_nodes[i]); continue; } nc = node_config_get(quorum_nodes[i]); for (j = 0; j < num_addrs; j++) { add_configfs_node(quorum_nodes[i], addrptr[j].address, addrptr[j].address_length, (quorum_nodes[i] == our_nodeid), nc->mark); } } } memset(leavejoin_nodes, 0, sizeof(leavejoin_nodes)); leavejoin_count = 0; } void process_cluster(int ci) { cs_error_t err; err = quorum_dispatch(qh, CS_DISPATCH_ALL); if (err != CS_OK) { log_error("process_cluster quorum_dispatch %d", err); cluster_dead(0); } } /* Force re-read of quorum nodes */ void update_cluster(void) { cs_error_t err; err = quorum_dispatch(qh, CS_DISPATCH_ONE); if (err != CS_OK) { log_error("update_cluster quorum_dispatch %d", err); 
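		/* treat any failed dispatch as corosync going away:
		   cluster_dead() sets cluster_down and makes the daemon quit */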
cluster_dead(0); } } int setup_cluster(void) { quorum_model_v1_data_t model_data; cs_error_t err; int fd; uint32_t quorum_type = 0; INIT_LIST_HEAD(&cluster_nodes); memset(&model_data, 0, sizeof(model_data)); model_data.quorum_notify_fn = quorum_callback; model_data.nodelist_notify_fn = quorum_nodelist_callback; err = quorum_model_initialize(&qh, QUORUM_MODEL_V1, (quorum_model_data_t *)&model_data, &quorum_type, NULL); if (err != CS_OK) { log_error("quorum init error %d", err); return -1; } err = quorum_fd_get(qh, &fd); if (err != CS_OK) { log_error("quorum fd_get error %d", err); goto fail; } err = quorum_trackstart(qh, CS_TRACK_CHANGES); if (err != CS_OK) { log_error("quorum trackstart error %d", err); goto fail; } old_node_count = 0; memset(&old_nodes, 0, sizeof(old_nodes)); quorum_node_count = 0; memset(&quorum_nodes, 0, sizeof(quorum_nodes)); return fd; fail: quorum_finalize(qh); return -1; } void close_cluster(void) { quorum_trackstop(qh); quorum_finalize(qh); } void kick_node_from_cluster(int nodeid) { if (!nodeid) { log_error("telling corosync to shut down cluster locally"); corosync_cfg_try_shutdown(ch, COROSYNC_CFG_SHUTDOWN_FLAG_IMMEDIATE); } else { log_error("tell corosync to remove nodeid %d from cluster", nodeid); corosync_cfg_kill_node(ch, nodeid, "dlm_controld"); } } static void shutdown_callback(corosync_cfg_handle_t h, corosync_cfg_shutdown_flags_t flags) { if (flags == COROSYNC_CFG_SHUTDOWN_FLAG_REQUEST) { if (list_empty(&lockspaces)) { log_debug("shutdown request yes"); corosync_cfg_replyto_shutdown(ch, COROSYNC_CFG_SHUTDOWN_FLAG_YES); } else { log_debug("shutdown request no"); corosync_cfg_replyto_shutdown(ch, COROSYNC_CFG_SHUTDOWN_FLAG_NO); } } } static corosync_cfg_callbacks_t cfg_callbacks = { .corosync_cfg_shutdown_callback = shutdown_callback, }; void process_cluster_cfg(int ci) { cs_error_t err; err = corosync_cfg_dispatch(ch, CS_DISPATCH_ALL); if (err != CS_OK) { log_error("process_cluster_cfg cfg_dispatch %d", err); cluster_dead(0); } } int setup_cluster_cfg(void) { cs_error_t err; unsigned int nodeid; int fd; int retry = 1; try_again: err = corosync_cfg_initialize(&ch, &cfg_callbacks); if (err != CS_OK) { if ((err == CS_ERR_TRY_AGAIN) && (retry <= 10)) { log_error("corosync has not completed initialization.. 
retry %d", retry); sleep(1); retry++; goto try_again; } log_error("corosync cfg init error %d", err); return -1; } err = corosync_cfg_fd_get(ch, &fd); if (err != CS_OK) { log_error("corosync cfg fd_get error %d", err); corosync_cfg_finalize(ch); return -1; } err = corosync_cfg_local_get(ch, &nodeid); if (err != CS_OK) { log_error("corosync cfg local_get error %d", err); corosync_cfg_finalize(ch); return -1; } our_nodeid = nodeid; log_debug("our_nodeid %d", our_nodeid); if (our_nodeid < 0) { log_error("negative nodeid, set corosync totem.clear_node_high_bit"); corosync_cfg_finalize(ch); return -1; } return fd; } void close_cluster_cfg(void) { corosync_cfg_finalize(ch); } int setup_node_config(void) { char key[CMAP_KEYNAME_MAXLEN+1]; cmap_handle_t h; cs_error_t err; uint32_t nodeid; uint8_t two_node = 0; int node_count = 0; int i; err = cmap_initialize(&h); if (err != CS_OK) { log_error("corosync cmap init error %d", err); return -1; } for (i = 0; i < MAX_NODES; i++) { snprintf(key, CMAP_KEYNAME_MAXLEN, "nodelist.node.%d.nodeid", i); err = cmap_get_uint32(h, key, &nodeid); if (err != CS_OK) break; log_debug("node_config %d", nodeid); node_count++; if (opt(enable_fencing_ind) && opt(enable_startup_fencing_ind)) add_startup_node(nodeid); } memset(key, 0, sizeof(key)); snprintf(key, CMAP_KEYNAME_MAXLEN, "quorum.two_node"); err = cmap_get_uint8(h, key, &two_node); if (err != CS_OK) goto out; if (node_count == 2 && two_node) cluster_two_node = 1; out: cmap_finalize(h); return 0; } dlm-4.3.0/dlm_controld/node_config.c000066400000000000000000000034341461150666200174030ustar00rootroot00000000000000/* * Copyright 2020 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include "dlm_daemon.h" #define MAX_LINE 4096 static struct node_config nc[MAX_NODES]; static const struct node_config nc_default = { .mark = 0, }; int node_config_init(const char *path) { char line[MAX_LINE], tmp[MAX_LINE]; unsigned long mark; FILE *file; int nodeid; int rv; /* if no config file is given we assume default node configuration */ file = fopen(path, "r"); if (!file) { log_debug("No config file %s, we assume default node configuration: mark %" PRIu32, path, nc_default.mark); return 0; } while (fgets(line, MAX_LINE, file)) { if (line[0] == '#') continue; if (line[0] == '\n') continue; if (!strncmp(line, "node", strlen("node"))) { rv = sscanf(line, "node id=%d mark=%s", &nodeid, tmp); if (rv < 2) { log_error("Invalid configuration line: %s", line); rv = -EINVAL; goto out; } /* skip invalid nodeid's */ if (nodeid <= 0 || nodeid > MAX_NODES - 1) continue; mark = strtoul(tmp, NULL, 0); if (mark == ULONG_MAX) { log_error("Failed to pars mark value %s will use %" PRIu32, tmp, nc_default.mark); mark = nc_default.mark; } nc[nodeid].mark = mark; log_debug("parsed node config id=%d mark=%llu", nodeid, (unsigned long long)mark); } } fclose(file); return 0; out: fclose(file); return rv; } const struct node_config *node_config_get(int nodeid) { if (nodeid <= 0 || nodeid > MAX_NODES - 1) { log_debug("node config requested for id=%d returning defaults", nodeid); return &nc_default; } return &nc[nodeid]; } dlm-4.3.0/dlm_controld/node_config.h000066400000000000000000000012671461150666200174120ustar00rootroot00000000000000/* * Copyright 2020 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef _NODE_CONFIG_H_ #define _NODE_CONFIG_H_ #include struct node_config { uint32_t mark; }; /* * Returns -ENOENT if path does not exist or there is no * config for nodeid in the file. * * Returns -EXYZ if there's a problem with the config. * * Returns 0 if a config was found with no problems. */ int node_config_init(const char *path); const struct node_config *node_config_get(int nodeid); #endif dlm-4.3.0/dlm_controld/plock.c000066400000000000000000001444541461150666200162510ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include "dlm_daemon.h" #include #ifndef DLM_PLOCK_OP_CANCEL #define DLM_PLOCK_OP_CANCEL 4 #endif static uint32_t plock_read_count; static uint32_t plock_recv_count; static uint32_t plock_rate_delays; static struct timeval plock_read_time; static struct timeval plock_recv_time; static struct timeval plock_rate_last; static int plock_device_fd = -1; #define RD_CONTINUE 0x00000001 struct resource_data { uint64_t number; int owner; uint32_t lock_count; uint32_t flags; uint32_t pad; }; struct plock_data { uint64_t start; uint64_t end; uint64_t owner; uint32_t pid; uint32_t nodeid; uint8_t ex; uint8_t waiter; uint16_t pad1; uint32_t pad; }; #define R_GOT_UNOWN 0x00000001 /* have received owner=0 message */ #define R_SEND_UNOWN 0x00000002 /* have sent owner=0 message */ #define R_SEND_OWN 0x00000004 /* have sent owner=our_nodeid message */ #define R_PURGE_UNOWN 0x00000008 /* set owner=0 in purge */ #define R_SEND_DROP 0x00000010 struct resource { struct list_head list; /* list of resources */ uint64_t number; int owner; /* nodeid or 0 for unowned */ uint32_t flags; struct timeval last_access; struct list_head locks; /* one lock for each range */ struct list_head waiters; struct list_head pending; /* discovering r owner */ struct rb_node rb_node; }; #define P_SYNCING 0x00000001 /* plock has been sent as part of sync but not yet received */ struct posix_lock { struct list_head list; /* resource locks or waiters list */ uint32_t pid; uint64_t owner; uint64_t start; uint64_t end; int ex; int nodeid; uint32_t flags; }; struct lock_waiter { struct list_head list; uint32_t flags; struct dlm_plock_info info; }; struct save_msg { struct list_head list; int nodeid; int len; int type; char buf[0]; }; static void send_own(struct lockspace *ls, struct resource *r, int owner); static void save_pending_plock(struct lockspace *ls, struct resource *r, struct dlm_plock_info *in); static int got_unown(struct resource *r) { return !!(r->flags & R_GOT_UNOWN); } static void info_bswap_out(struct dlm_plock_info *i) { i->version[0] = cpu_to_le32(i->version[0]); i->version[1] = cpu_to_le32(i->version[1]); i->version[2] = cpu_to_le32(i->version[2]); i->pid = cpu_to_le32(i->pid); i->nodeid = cpu_to_le32(i->nodeid); i->rv = cpu_to_le32(i->rv); i->fsid = cpu_to_le32(i->fsid); i->number = cpu_to_le64(i->number); i->start = cpu_to_le64(i->start); i->end = cpu_to_le64(i->end); i->owner = cpu_to_le64(i->owner); } static void info_bswap_in(struct dlm_plock_info *i) { i->version[0] = le32_to_cpu(i->version[0]); i->version[1] = 
le32_to_cpu(i->version[1]); i->version[2] = le32_to_cpu(i->version[2]); i->pid = le32_to_cpu(i->pid); i->nodeid = le32_to_cpu(i->nodeid); i->rv = le32_to_cpu(i->rv); i->fsid = le32_to_cpu(i->fsid); i->number = le64_to_cpu(i->number); i->start = le64_to_cpu(i->start); i->end = le64_to_cpu(i->end); i->owner = le64_to_cpu(i->owner); } static const char *op_str(int optype) { switch (optype) { case DLM_PLOCK_OP_LOCK: return "LK"; case DLM_PLOCK_OP_CANCEL: return "CL"; case DLM_PLOCK_OP_UNLOCK: return "UN"; case DLM_PLOCK_OP_GET: return "GET"; default: return "??"; } } static const char *ex_str(int optype, int ex) { if (optype == DLM_PLOCK_OP_UNLOCK || optype == DLM_PLOCK_OP_GET) return "-"; if (ex) return "WR"; else return "RD"; } int setup_plocks(void) { plock_read_count = 0; plock_recv_count = 0; plock_rate_delays = 0; gettimeofday(&plock_read_time, NULL); gettimeofday(&plock_recv_time, NULL); gettimeofday(&plock_rate_last, NULL); if (plock_minor) { plock_device_fd = open("/dev/misc/dlm_plock", O_RDWR); } if (plock_device_fd < 0) { log_error("Failure to open plock device: %s", strerror(errno)); return -1; } log_debug("plocks %d", plock_device_fd); return plock_device_fd; } void close_plocks(void) { if (plock_device_fd > 0) close(plock_device_fd); } /* FIXME: unify these two */ static unsigned long time_diff_ms(struct timeval *begin, struct timeval *end) { struct timeval result; timersub(end, begin, &result); return (result.tv_sec * 1000) + (result.tv_usec / 1000); } static uint64_t dt_usec(const struct timeval *start, const struct timeval *stop) { uint64_t dt; dt = stop->tv_sec - start->tv_sec; dt *= 1000000; dt += stop->tv_usec - start->tv_usec; return dt; } static struct resource * rb_search_plock_resource(struct lockspace *ls, uint64_t number) { struct rb_node *n = ls->plock_resources_root.rb_node; struct resource *r; while (n) { r = rb_entry(n, struct resource, rb_node); if (number < r->number) n = n->rb_left; else if (number > r->number) n = n->rb_right; else return r; } return NULL; } static void rb_insert_plock_resource(struct lockspace *ls, struct resource *r) { struct resource *entry; struct rb_node **p; struct rb_node *parent = NULL; p = &ls->plock_resources_root.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct resource, rb_node); if (r->number < entry->number) p = &parent->rb_left; else if (r->number > entry->number) p = &parent->rb_right; else return; } rb_link_node(&r->rb_node, parent, p); rb_insert_color(&r->rb_node, &ls->plock_resources_root); } static void rb_del_plock_resource(struct lockspace *ls, struct resource *r) { if (!RB_EMPTY_NODE(&r->rb_node)) { rb_erase(&r->rb_node, &ls->plock_resources_root); RB_CLEAR_NODE(&r->rb_node); } } static struct resource *search_resource(struct lockspace *ls, uint64_t number) { struct resource *r; list_for_each_entry(r, &ls->plock_resources, list) { if (r->number == number) return r; } return NULL; } static int find_resource(struct lockspace *ls, uint64_t number, int create, struct resource **r_out) { struct resource *r = NULL; int rv = 0; r = rb_search_plock_resource(ls, number); if (r) goto out; if (create == 0) { rv = -ENOENT; goto out; } r = malloc(sizeof(struct resource)); if (!r) { log_elock(ls, "find_resource no memory %d", errno); rv = -ENOMEM; goto out; } memset(r, 0, sizeof(struct resource)); r->number = number; INIT_LIST_HEAD(&r->locks); INIT_LIST_HEAD(&r->waiters); INIT_LIST_HEAD(&r->pending); if (opt(plock_ownership_ind)) r->owner = -1; else r->owner = 0; list_add_tail(&r->list, &ls->plock_resources); 
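	/* a new r is tracked on both the lockspace's plock_resources list
	   and the rbtree used by rb_search_plock_resource() for lookup by
	   number */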
rb_insert_plock_resource(ls, r); out: if (r) gettimeofday(&r->last_access, NULL); *r_out = r; return rv; } static void put_resource(struct lockspace *ls, struct resource *r) { /* with ownership, resources are only freed via drop messages */ if (opt(plock_ownership_ind)) return; if (list_empty(&r->locks) && list_empty(&r->waiters)) { rb_del_plock_resource(ls, r); list_del(&r->list); free(r); } } static inline int ranges_overlap(uint64_t start1, uint64_t end1, uint64_t start2, uint64_t end2) { if (end1 < start2 || start1 > end2) return 0; return 1; } /** * overlap_type - returns a value based on the type of overlap * @s1 - start of new lock range * @e1 - end of new lock range * @s2 - start of existing lock range * @e2 - end of existing lock range * */ static int overlap_type(uint64_t s1, uint64_t e1, uint64_t s2, uint64_t e2) { int ret; /* * ---r1--- * ---r2--- */ if (s1 == s2 && e1 == e2) ret = 0; /* * --r1-- * ---r2--- */ else if (s1 == s2 && e1 < e2) ret = 1; /* * --r1-- * ---r2--- */ else if (s1 > s2 && e1 == e2) ret = 1; /* * --r1-- * ---r2--- */ else if (s1 > s2 && e1 < e2) ret = 2; /* * ---r1--- or ---r1--- or ---r1--- * --r2-- --r2-- --r2-- */ else if (s1 <= s2 && e1 >= e2) ret = 3; /* * ---r1--- * ---r2--- */ else if (s1 > s2 && e1 > e2) ret = 4; /* * ---r1--- * ---r2--- */ else if (s1 < s2 && e1 < e2) ret = 4; else ret = -1; return ret; } /* shrink the range start2:end2 by the partially overlapping start:end */ static int shrink_range2(uint64_t *start2, uint64_t *end2, uint64_t start, uint64_t end) { int error = 0; if (*start2 < start) *end2 = start - 1; else if (*end2 > end) *start2 = end + 1; else error = -1; return error; } static int shrink_range(struct posix_lock *po, uint64_t start, uint64_t end) { return shrink_range2(&po->start, &po->end, start, end); } static int is_conflict(struct resource *r, struct dlm_plock_info *in, int get) { struct posix_lock *po; list_for_each_entry(po, &r->locks, list) { if (po->nodeid == in->nodeid && po->owner == in->owner) continue; if (!ranges_overlap(po->start, po->end, in->start, in->end)) continue; if (in->ex || po->ex) { if (get) { in->ex = po->ex; in->pid = po->pid; in->start = po->start; in->end = po->end; } return 1; } } return 0; } static int add_lock(struct resource *r, uint32_t nodeid, uint64_t owner, uint32_t pid, int ex, uint64_t start, uint64_t end) { struct posix_lock *po; po = malloc(sizeof(struct posix_lock)); if (!po) return -ENOMEM; po->start = start; po->end = end; po->nodeid = nodeid; po->owner = owner; po->pid = pid; po->ex = ex; po->flags = 0; list_add_tail(&po->list, &r->locks); return 0; } /* RN within RE (and starts or ends on RE boundary) 1. add new lock for non-overlap area of RE, orig mode 2. convert RE to RN range and mode */ static int lock_case1(struct posix_lock *po, struct resource *r, struct dlm_plock_info *in) { uint64_t start2, end2; int rv; /* non-overlapping area start2:end2 */ start2 = po->start; end2 = po->end; rv = shrink_range2(&start2, &end2, in->start, in->end); if (rv) goto out; po->start = in->start; po->end = in->end; po->ex = in->ex; rv = add_lock(r, in->nodeid, in->owner, in->pid, !in->ex, start2, end2); out: return rv; } /* RN within RE (RE overlaps RN on both sides) 1. add new lock for front fragment, orig mode 2. add new lock for back fragment, orig mode 3. 
convert RE to RN range and mode */ static int lock_case2(struct posix_lock *po, struct resource *r, struct dlm_plock_info *in) { int rv; rv = add_lock(r, in->nodeid, in->owner, in->pid, !in->ex, po->start, in->start - 1); if (rv) goto out; rv = add_lock(r, in->nodeid, in->owner, in->pid, !in->ex, in->end + 1, po->end); if (rv) goto out; po->start = in->start; po->end = in->end; po->ex = in->ex; out: return rv; } static int lock_internal(struct lockspace *ls, struct resource *r, struct dlm_plock_info *in) { struct posix_lock *po, *safe; int rv = 0; list_for_each_entry_safe(po, safe, &r->locks, list) { if (po->nodeid != in->nodeid || po->owner != in->owner) continue; if (!ranges_overlap(po->start, po->end, in->start, in->end)) continue; /* existing range (RE) overlaps new range (RN) */ switch(overlap_type(in->start, in->end, po->start, po->end)) { case 0: if (po->ex == in->ex) goto out; /* ranges the same - just update the existing lock */ po->ex = in->ex; goto out; case 1: if (po->ex == in->ex) goto out; rv = lock_case1(po, r, in); goto out; case 2: if (po->ex == in->ex) goto out; rv = lock_case2(po, r, in); goto out; case 3: list_del(&po->list); free(po); break; case 4: if (po->start < in->start) po->end = in->start - 1; else po->start = in->end + 1; break; default: rv = -1; goto out; } } rv = add_lock(r, in->nodeid, in->owner, in->pid, in->ex, in->start, in->end); out: return rv; } static int unlock_internal(struct lockspace *ls, struct resource *r, struct dlm_plock_info *in) { struct posix_lock *po, *safe; int rv = 0; list_for_each_entry_safe(po, safe, &r->locks, list) { if (po->nodeid != in->nodeid || po->owner != in->owner) continue; if (!ranges_overlap(po->start, po->end, in->start, in->end)) continue; /* existing range (RE) overlaps new range (RN) */ switch (overlap_type(in->start, in->end, po->start, po->end)) { case 0: /* ranges the same - just remove the existing lock */ list_del(&po->list); free(po); goto out; case 1: /* RN within RE and starts or ends on RE boundary - * shrink and update RE */ rv = shrink_range(po, in->start, in->end); goto out; case 2: /* RN within RE - shrink and update RE to be front * fragment, and add a new lock for back fragment */ rv = add_lock(r, in->nodeid, in->owner, in->pid, po->ex, in->end + 1, po->end); po->end = in->start - 1; goto out; case 3: /* RE within RN - remove RE, then continue checking * because RN could cover other locks */ list_del(&po->list); free(po); continue; case 4: /* front of RE in RN, or end of RE in RN - shrink and * update RE, then continue because RN could cover * other locks */ rv = shrink_range(po, in->start, in->end); continue; default: rv = -1; goto out; } } out: return rv; } static void clear_waiters(struct lockspace *ls, struct resource *r, struct dlm_plock_info *in) { struct lock_waiter *w, *safe; list_for_each_entry_safe(w, safe, &r->waiters, list) { if (w->info.nodeid != in->nodeid || w->info.owner != in->owner) continue; list_del(&w->list); log_dlock(ls, "clear waiter %llx %llx-%llx %d/%u/%llx", (unsigned long long)in->number, (unsigned long long)in->start, (unsigned long long)in->end, in->nodeid, in->pid, (unsigned long long)in->owner); free(w); } } static int add_waiter(struct lockspace *ls, struct resource *r, struct dlm_plock_info *in) { struct lock_waiter *w; w = malloc(sizeof(struct lock_waiter)); if (!w) return -ENOMEM; memcpy(&w->info, in, sizeof(struct dlm_plock_info)); w->flags = 0; list_add_tail(&w->list, &r->waiters); return 0; } static void write_result(struct dlm_plock_info *in, int rv) { int write_rv; 
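	/* pass the result back to the kernel by writing the op, with rv
	   filled in, back to the dlm_plock misc device it was read from */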
in->rv = rv; write_rv = write(plock_device_fd, in, sizeof(struct dlm_plock_info)); if (write_rv < 0) log_debug("write_result: write error %d fd %d\n", errno, plock_device_fd); } static void do_waiters(struct lockspace *ls, struct resource *r) { struct lock_waiter *w, *safe; struct dlm_plock_info *in; int rv; list_for_each_entry_safe(w, safe, &r->waiters, list) { in = &w->info; if (is_conflict(r, in, 0)) continue; list_del(&w->list); /* log_group(ls, "take waiter %llx %llx-%llx %d/%u/%llx", in->number, in->start, in->end, in->nodeid, in->pid, in->owner); */ rv = lock_internal(ls, r, in); if (in->nodeid == our_nodeid) write_result(in, rv); free(w); } } static void do_lock(struct lockspace *ls, struct dlm_plock_info *in, struct resource *r) { int rv; if (is_conflict(r, in, 0)) { if (!in->wait) rv = -EAGAIN; else { rv = add_waiter(ls, r, in); if (rv) goto out; rv = -EINPROGRESS; } } else rv = lock_internal(ls, r, in); out: if (in->nodeid == our_nodeid && rv != -EINPROGRESS) write_result(in, rv); do_waiters(ls, r); put_resource(ls, r); } static int remove_waiter(const struct resource *r, const struct dlm_plock_info *in) { struct lock_waiter *w; list_for_each_entry(w, &r->waiters, list) { if (w->info.nodeid == in->nodeid && w->info.fsid == in->fsid && w->info.number == in->number && w->info.owner == in->owner && w->info.pid == in->pid && w->info.start == in->start && w->info.end == in->end && w->info.ex == in->ex) { list_del(&w->list); free(w); return 0; } } return -ENOENT; } static void do_cancel(struct lockspace *ls, struct dlm_plock_info *in, struct resource *r) { int rv; rv = remove_waiter(r, in); if (in->nodeid == our_nodeid) write_result(in, rv); put_resource(ls, r); } static void do_unlock(struct lockspace *ls, struct dlm_plock_info *in, struct resource *r) { int rv; rv = unlock_internal(ls, r, in); if (in->flags & DLM_PLOCK_FL_CLOSE) { clear_waiters(ls, r, in); /* no replies for unlock-close ops */ goto skip_result; } if (in->nodeid == our_nodeid) write_result(in, rv); skip_result: do_waiters(ls, r); put_resource(ls, r); } /* we don't even get to this function if the getlk isn't from us */ static void do_get(struct lockspace *ls, struct dlm_plock_info *in, struct resource *r) { int rv; if (is_conflict(r, in, 1)) rv = 1; else rv = 0; write_result(in, rv); put_resource(ls, r); } static void save_message(struct lockspace *ls, struct dlm_header *hd, int len, int from, int type) { struct save_msg *sm; sm = malloc(sizeof(struct save_msg) + len); if (!sm) return; memset(sm, 0, sizeof(struct save_msg) + len); memcpy(&sm->buf, hd, len); sm->type = type; sm->len = len; sm->nodeid = from; log_plock(ls, "save %s from %d len %d", msg_name(type), from, len); list_add_tail(&sm->list, &ls->saved_messages); } static void __receive_plock(struct lockspace *ls, struct dlm_plock_info *in, int from, struct resource *r) { switch (in->optype) { case DLM_PLOCK_OP_LOCK: ls->last_plock_time = monotime(); do_lock(ls, in, r); break; case DLM_PLOCK_OP_CANCEL: ls->last_plock_time = monotime(); do_cancel(ls, in, r); break; case DLM_PLOCK_OP_UNLOCK: ls->last_plock_time = monotime(); do_unlock(ls, in, r); break; case DLM_PLOCK_OP_GET: do_get(ls, in, r); break; default: log_elock(ls, "receive_plock error from %d optype %d", from, in->optype); if (from == our_nodeid) write_result(in, -EINVAL); } } /* When ls members receive our options message (for our mount), one of them saves all plock state received to that point in a checkpoint and then sends us our journals message. 
We know to retrieve the plock state from the checkpoint when we receive our journals message. Any plock messages that arrive between seeing our options message and our journals message need to be saved and processed after we synchronize our plock state from the checkpoint. Any plock message received while we're mounting but before we set save_plocks (when we see our options message) can be ignored because it should be reflected in the checkpointed state. */ static void _receive_plock(struct lockspace *ls, struct dlm_header *hd, int len) { struct dlm_plock_info info; struct resource *r = NULL; struct timeval now; uint64_t usec; int from = hd->nodeid; int rv, create; memcpy(&info, (char *)hd + sizeof(struct dlm_header), sizeof(info)); info_bswap_in(&info); log_plock(ls, "receive plock %llx %s %s %llx-%llx %d/%u/%llx w %d", (unsigned long long)info.number, op_str(info.optype), ex_str(info.optype, info.ex), (unsigned long long)info.start, (unsigned long long)info.end, info.nodeid, info.pid, (unsigned long long)info.owner, info.wait); plock_recv_count++; if (!(plock_recv_count % 1000)) { gettimeofday(&now, NULL); usec = dt_usec(&plock_recv_time, &now); log_plock(ls, "plock_recv_count %u time %.3f s", plock_recv_count, usec * 1.e-6); plock_recv_time = now; } if (info.optype == DLM_PLOCK_OP_GET && from != our_nodeid) return; if (from != hd->nodeid || from != info.nodeid) { log_elock(ls, "receive_plock error from %d header %d info %d", from, hd->nodeid, info.nodeid); return; } create = !opt(plock_ownership_ind); rv = find_resource(ls, info.number, create, &r); if (rv && opt(plock_ownership_ind)) { /* There must have been a race with a drop, so we need to ignore this plock op which will be resent. If we're the one who sent the plock, we need to send_own() and put it on the pending list to resend once the owner is established. */ log_plock(ls, "receive_plock from %d no r %llx", from, (unsigned long long)info.number); if (from != our_nodeid) return; rv = find_resource(ls, info.number, 1, &r); if (rv) return; send_own(ls, r, our_nodeid); save_pending_plock(ls, r, &info); return; } if (rv) { /* r not found, rv is -ENOENT, this shouldn't happen because process_plocks() creates a resource for every op */ log_elock(ls, "receive_plock error from %d no r %llx %d", from, (unsigned long long)info.number, rv); return; } /* The owner should almost always be 0 here, but other owners may be possible given odd combinations of races with drop. Odd races to worry about (some seem pretty improbable): - A sends drop, B sends plock, receive drop, receive plock. This is addressed above. - A sends drop, B sends plock, receive drop, B reads plock and sends own, receive plock, on B we find owner of -1. - A sends drop, B sends two plocks, receive drop, receive plocks. Receiving the first plock is the previous case, receiving the second plock will find r with owner of -1. - A sends drop, B sends two plocks, receive drop, C sends own, receive plock, B sends own, receive own (C), receive plock, receive own (B). Haven't tried to cook up a scenario that would lead to the last case below; receiving a plock from ourself and finding we're the owner of r.
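
   To summarize the checks below: owner 0 means the plock state is
   replicated on all nodes, so the op is processed; owner -1 or another
   nodeid means ownership is unresolved or held elsewhere, so an op we
   sent ourselves is saved on the pending list until ownership settles;
   owner equal to our_nodeid is processed locally if we sent the op.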
*/ if (!r->owner) { __receive_plock(ls, &info, from, r); } else if (r->owner == -1) { log_plock(ls, "receive_plock from %d r %llx owner %d", from, (unsigned long long)info.number, r->owner); if (from == our_nodeid) save_pending_plock(ls, r, &info); } else if (r->owner != our_nodeid) { log_plock(ls, "receive_plock from %d r %llx owner %d", from, (unsigned long long)info.number, r->owner); if (from == our_nodeid) save_pending_plock(ls, r, &info); } else if (r->owner == our_nodeid) { log_plock(ls, "receive_plock from %d r %llx owner %d", from, (unsigned long long)info.number, r->owner); if (from == our_nodeid) __receive_plock(ls, &info, from, r); } } void receive_plock(struct lockspace *ls, struct dlm_header *hd, int len) { if (ls->save_plocks) { save_message(ls, hd, len, hd->nodeid, DLM_MSG_PLOCK); return; } _receive_plock(ls, hd, len); } static int send_struct_info(struct lockspace *ls, struct dlm_plock_info *in, int msg_type) { struct dlm_header *hd; int rv = 0, len; char *buf; len = sizeof(struct dlm_header) + sizeof(struct dlm_plock_info); buf = malloc(len); if (!buf) { rv = -ENOMEM; goto out; } memset(buf, 0, len); info_bswap_out(in); hd = (struct dlm_header *)buf; hd->type = msg_type; memcpy(buf + sizeof(struct dlm_header), in, sizeof(*in)); dlm_send_message(ls, buf, len); free(buf); out: if (rv) log_elock(ls, "send_struct_info error %d", rv); return rv; } static void send_plock(struct lockspace *ls, struct resource *r, struct dlm_plock_info *in) { send_struct_info(ls, in, DLM_MSG_PLOCK); } static void send_own(struct lockspace *ls, struct resource *r, int owner) { struct dlm_plock_info info; /* if we've already sent an own message for this resource, (pending list is not empty), then we shouldn't send another */ if (!list_empty(&r->pending)) { log_plock(ls, "send_own %llx already pending", (unsigned long long)r->number); return; } if (!owner) r->flags |= R_SEND_UNOWN; else r->flags |= R_SEND_OWN; memset(&info, 0, sizeof(info)); info.number = r->number; info.nodeid = owner; send_struct_info(ls, &info, DLM_MSG_PLOCK_OWN); } static void send_syncs(struct lockspace *ls, struct resource *r) { struct dlm_plock_info info; struct posix_lock *po; struct lock_waiter *w; int rv; list_for_each_entry(po, &r->locks, list) { memset(&info, 0, sizeof(info)); info.number = r->number; info.start = po->start; info.end = po->end; info.nodeid = po->nodeid; info.owner = po->owner; info.pid = po->pid; info.ex = po->ex; rv = send_struct_info(ls, &info, DLM_MSG_PLOCK_SYNC_LOCK); if (rv) goto out; po->flags |= P_SYNCING; } list_for_each_entry(w, &r->waiters, list) { memcpy(&info, &w->info, sizeof(info)); rv = send_struct_info(ls, &info, DLM_MSG_PLOCK_SYNC_WAITER); if (rv) goto out; w->flags |= P_SYNCING; } out: return; } static void send_drop(struct lockspace *ls, struct resource *r) { struct dlm_plock_info info; memset(&info, 0, sizeof(info)); info.number = r->number; r->flags |= R_SEND_DROP; send_struct_info(ls, &info, DLM_MSG_PLOCK_DROP); } /* plock op can't be handled until we know the owner value of the resource, so the op is saved on the pending list until the r owner is established */ static void save_pending_plock(struct lockspace *ls, struct resource *r, struct dlm_plock_info *in) { struct lock_waiter *w; w = malloc(sizeof(struct lock_waiter)); if (!w) { log_elock(ls, "save_pending_plock no mem"); return; } memcpy(&w->info, in, sizeof(struct dlm_plock_info)); w->flags = 0; list_add_tail(&w->list, &r->pending); } /* plock ops are on pending list waiting for ownership to be established. 
owner has now become us, so add these plocks to r */ static void add_pending_plocks(struct lockspace *ls, struct resource *r) { struct lock_waiter *w, *safe; list_for_each_entry_safe(w, safe, &r->pending, list) { __receive_plock(ls, &w->info, our_nodeid, r); list_del(&w->list); free(w); } } /* plock ops are on pending list waiting for ownership to be established. owner has now become 0, so send these plocks to everyone */ static void send_pending_plocks(struct lockspace *ls, struct resource *r) { struct lock_waiter *w, *safe; list_for_each_entry_safe(w, safe, &r->pending, list) { send_plock(ls, r, &w->info); list_del(&w->list); free(w); } } static void _receive_own(struct lockspace *ls, struct dlm_header *hd, int len) { struct dlm_plock_info info; struct resource *r; int should_not_happen = 0; int from = hd->nodeid; int rv; ls->last_plock_time = monotime(); memcpy(&info, (char *)hd + sizeof(struct dlm_header), sizeof(info)); info_bswap_in(&info); log_plock(ls, "receive_own %llx from %u owner %u", (unsigned long long)info.number, hd->nodeid, info.nodeid); rv = find_resource(ls, info.number, 1, &r); if (rv) return; if (from == our_nodeid) { /* * received our own own message */ if (info.nodeid == 0) { /* we are setting owner to 0 */ if (r->owner == our_nodeid) { /* we set owner to 0 when we relinquish ownership */ should_not_happen = 1; } else if (r->owner == 0) { /* this happens when we relinquish ownership */ r->flags |= R_GOT_UNOWN; } else { should_not_happen = 1; } } else if (info.nodeid == our_nodeid) { /* we are setting owner to ourself */ if (r->owner == -1) { /* we have gained ownership */ r->owner = our_nodeid; add_pending_plocks(ls, r); } else if (r->owner == our_nodeid) { should_not_happen = 1; } else if (r->owner == 0) { send_pending_plocks(ls, r); } else { /* resource is owned by other node; they should set owner to 0 shortly */ } } else { /* we should only ever set owner to 0 or ourself */ should_not_happen = 1; } } else { /* * received own message from another node */ if (info.nodeid == 0) { /* other node is setting owner to 0 */ if (r->owner == -1) { /* we should have a record of the owner before it relinquishes */ should_not_happen = 1; } else if (r->owner == our_nodeid) { /* only the owner should relinquish */ should_not_happen = 1; } else if (r->owner == 0) { should_not_happen = 1; } else { r->owner = 0; r->flags |= R_GOT_UNOWN; send_pending_plocks(ls, r); } } else if (info.nodeid == from) { /* other node is setting owner to itself */ if (r->owner == -1) { /* normal path for a node becoming owner */ r->owner = from; } else if (r->owner == our_nodeid) { /* we relinquish our ownership: sync our local plocks to everyone, then set owner to 0 */ send_syncs(ls, r); send_own(ls, r, 0); /* we need to set owner to 0 here because local ops may arrive before we receive our send_own message and can't be added locally */ r->owner = 0; } else if (r->owner == 0) { /* can happen because we set owner to 0 before we receive our send_own sent just above */ } else { /* do nothing, current owner should be relinquishing its ownership */ } } else if (info.nodeid == our_nodeid) { /* no one else should try to set the owner to us */ should_not_happen = 1; } else { /* a node should only ever set owner to 0 or itself */ should_not_happen = 1; } } if (should_not_happen) { log_elock(ls, "receive_own error from %u %llx " "info nodeid %d r owner %d", from, (unsigned long long)r->number, info.nodeid, r->owner); } } void receive_own(struct lockspace *ls, struct dlm_header *hd, int len) { if 
(ls->save_plocks) { save_message(ls, hd, len, hd->nodeid, DLM_MSG_PLOCK_OWN); return; } _receive_own(ls, hd, len); } static void clear_syncing_flag(struct lockspace *ls, struct resource *r, struct dlm_plock_info *in) { struct posix_lock *po; struct lock_waiter *w; list_for_each_entry(po, &r->locks, list) { if ((po->flags & P_SYNCING) && in->start == po->start && in->end == po->end && in->nodeid == po->nodeid && in->owner == po->owner && in->pid == po->pid && in->ex == po->ex) { po->flags &= ~P_SYNCING; return; } } list_for_each_entry(w, &r->waiters, list) { if ((w->flags & P_SYNCING) && in->start == w->info.start && in->end == w->info.end && in->nodeid == w->info.nodeid && in->owner == w->info.owner && in->pid == w->info.pid && in->ex == w->info.ex) { w->flags &= ~P_SYNCING; return; } } log_elock(ls, "clear_syncing error %llx no match %s %llx-%llx %d/%u/%llx", (unsigned long long)r->number, in->ex ? "WR" : "RD", (unsigned long long)in->start, (unsigned long long)in->end, in->nodeid, in->pid, (unsigned long long)in->owner); } static void _receive_sync(struct lockspace *ls, struct dlm_header *hd, int len) { struct dlm_plock_info info; struct resource *r; int from = hd->nodeid; int rv; ls->last_plock_time = monotime(); memcpy(&info, (char *)hd + sizeof(struct dlm_header), sizeof(info)); info_bswap_in(&info); log_plock(ls, "receive sync %llx from %u %s %llx-%llx %d/%u/%llx", (unsigned long long)info.number, from, info.ex ? "WR" : "RD", (unsigned long long)info.start, (unsigned long long)info.end, info.nodeid, info.pid, (unsigned long long)info.owner); rv = find_resource(ls, info.number, 0, &r); if (rv) { log_elock(ls, "receive_sync error no r %llx from %d", info.number, from); return; } if (from == our_nodeid) { /* this plock now in sync on all nodes */ clear_syncing_flag(ls, r, &info); return; } if (hd->type == DLM_MSG_PLOCK_SYNC_LOCK) add_lock(r, info.nodeid, info.owner, info.pid, info.ex, info.start, info.end); else if (hd->type == DLM_MSG_PLOCK_SYNC_WAITER) add_waiter(ls, r, &info); } void receive_sync(struct lockspace *ls, struct dlm_header *hd, int len) { if (ls->save_plocks) { save_message(ls, hd, len, hd->nodeid, hd->type); return; } _receive_sync(ls, hd, len); } static void _receive_drop(struct lockspace *ls, struct dlm_header *hd, int len) { struct dlm_plock_info info; struct resource *r; int from = hd->nodeid; int rv; ls->last_plock_time = monotime(); memcpy(&info, (char *)hd + sizeof(struct dlm_header), sizeof(info)); info_bswap_in(&info); log_plock(ls, "receive_drop %llx from %u", (unsigned long long)info.number, from); rv = find_resource(ls, info.number, 0, &r); if (rv) { /* we'll find no r if two nodes sent drop at once */ log_plock(ls, "receive_drop from %d no r %llx", from, (unsigned long long)info.number); return; } if (r->owner != 0) { /* - A sent drop, B sent drop, receive drop A, C sent own, receive drop B (this warning on C, owner -1) - A sent drop, B sent drop, receive drop A, A sent own, receive own A, receive drop B (this warning on all, owner A) */ log_plock(ls, "receive_drop from %d r %llx owner %d", from, (unsigned long long)r->number, r->owner); return; } if (!list_empty(&r->pending)) { /* shouldn't happen */ log_elock(ls, "receive_drop error from %d r %llx pending op", from, (unsigned long long)r->number); return; } /* the decision to drop or not must be based on things that are guaranteed to be the same on all nodes */ if (list_empty(&r->locks) && list_empty(&r->waiters)) { rb_del_plock_resource(ls, r); list_del(&r->list); free(r); } else { /* A sent drop, B sent 
a plock, receive plock, receive drop */ log_plock(ls, "receive_drop from %d r %llx in use", from, (unsigned long long)r->number); } } void receive_drop(struct lockspace *ls, struct dlm_header *hd, int len) { if (ls->save_plocks) { save_message(ls, hd, len, hd->nodeid, DLM_MSG_PLOCK_DROP); return; } _receive_drop(ls, hd, len); } /* We only drop resources from the unowned state to simplify things. If we want to drop a resource we own, we unown/relinquish it first. */ /* FIXME: in the transition from owner = us, to owner = 0, to drop; we want the second period to be shorter than the first */ static int drop_resources(struct lockspace *ls) { struct resource *r; struct timeval now; int count = 0; if (!opt(plock_ownership_ind)) return 0; if (list_empty(&ls->plock_resources)) return 0; gettimeofday(&now, NULL); if (time_diff_ms(&ls->drop_resources_last, &now) < opt(drop_resources_time_ind)) return 1; ls->drop_resources_last = now; /* try to drop the oldest, unused resources */ list_for_each_entry_reverse(r, &ls->plock_resources, list) { if (count >= opt(drop_resources_count_ind)) break; if (r->owner && r->owner != our_nodeid) continue; if (time_diff_ms(&r->last_access, &now) < opt(drop_resources_age_ind)) continue; if (list_empty(&r->locks) && list_empty(&r->waiters)) { if (r->owner == our_nodeid) { send_own(ls, r, 0); r->owner = 0; } else if (r->owner == 0 && got_unown(r)) { send_drop(ls, r); } count++; } } return 1; } void drop_resources_all(void) { struct lockspace *ls; int rv = 0; poll_drop_plock = 0; list_for_each_entry(ls, &lockspaces, list) { rv = drop_resources(ls); if (rv) poll_drop_plock = 1; } } int limit_plocks(void) { struct timeval now; if (!opt(plock_rate_limit_ind) || !plock_read_count) return 0; gettimeofday(&now, NULL); /* Every time a plock op is read from the kernel, we increment plock_read_count. After every plock_rate_limit (N) reads, we check the time it's taken to do those N; if the time is less than a second, then we delay reading any more until a second is up. This way we read a max of N ops from the kernel every second. 
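
   For example, with plock_rate_limit=100: at reads 100, 200, 300, ...
   we check how long the last 100 reads took; if less than a second,
   limit_plocks() returns nonzero, process_plocks() ignores the plock
   device fd, and plock_rate_delays is incremented, until the full
   second has elapsed.
*/

#if 0
/* A minimal sketch of the same gate, readable in isolation.  The names
 * example_reads, example_last and example_should_delay are hypothetical
 * illustrations, not part of dlm_controld: */
static unsigned int example_reads;
static struct timeval example_last;

static int example_should_delay(unsigned int limit)
{
	struct timeval now;

	if (!limit || !example_reads)
		return 0;

	gettimeofday(&now, NULL);

	if (!(example_reads % limit)) {
		if (time_diff_ms(&example_last, &now) < 1000)
			return 1; /* caller should stop reading for now */
		example_last = now;
		example_reads++; /* avoid re-checking the same count */
	}
	return 0;
}
#endif

/* The check below implements this rate limit.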
*/ if (!(plock_read_count % opt(plock_rate_limit_ind))) { if (time_diff_ms(&plock_rate_last, &now) < 1000) { plock_rate_delays++; return 2; } plock_rate_last = now; plock_read_count++; } return 0; } void process_plocks(int ci) { struct lockspace *ls; struct resource *r; struct dlm_plock_info info; struct timeval now; uint64_t usec; int create, rv; if (limit_plocks()) { poll_ignore_plock = 1; client_ignore(plock_ci, plock_fd); return; } gettimeofday(&now, NULL); rv = do_read(plock_device_fd, &info, sizeof(info)); if (rv < 0) { log_debug("process_plocks: read error %d fd %d\n", errno, plock_device_fd); return; } /* kernel doesn't set the nodeid field */ info.nodeid = our_nodeid; if (!opt(enable_plock_ind)) { rv = -ENOSYS; goto fail; } ls = find_ls_id(info.fsid); if (!ls) { log_plock(ls, "process_plocks: no ls id %x", info.fsid); rv = -EEXIST; goto fail; } if (ls->disable_plock) { rv = -ENOSYS; goto fail; } log_plock(ls, "read plock %llx %s %s %llx-%llx %d/%u/%llx w %d", (unsigned long long)info.number, op_str(info.optype), ex_str(info.optype, info.ex), (unsigned long long)info.start, (unsigned long long)info.end, info.nodeid, info.pid, (unsigned long long)info.owner, info.wait); /* report plock rate and any delays since the last report */ plock_read_count++; if (!(plock_read_count % 1000)) { usec = dt_usec(&plock_read_time, &now) ; log_plock(ls, "plock_read_count %u time %.3f s delays %u", plock_read_count, usec * 1.e-6, plock_rate_delays); plock_read_time = now; plock_rate_delays = 0; } create = (info.optype == DLM_PLOCK_OP_UNLOCK) ? 0 : 1; rv = find_resource(ls, info.number, create, &r); if (rv) goto fail; if (r->owner == 0) { /* plock state replicated on all nodes */ send_plock(ls, r, &info); } else if (r->owner == our_nodeid) { /* we are the owner of r, so our plocks are local */ __receive_plock(ls, &info, our_nodeid, r); } else { /* r owner is -1: r is new, try to become the owner; r owner > 0: tell other owner to give up ownership; both done with a message trying to set owner to ourself */ send_own(ls, r, our_nodeid); save_pending_plock(ls, r, &info); } if (opt(plock_ownership_ind) && !list_empty(&ls->plock_resources)) poll_drop_plock = 1; return; fail: if (!(info.flags & DLM_PLOCK_FL_CLOSE)) write_result(&info, rv); } void process_saved_plocks(struct lockspace *ls) { struct save_msg *sm, *sm2; struct dlm_header *hd; int count = 0; log_plock(ls, "process_saved_plocks begin"); if (list_empty(&ls->saved_messages)) goto out; list_for_each_entry_safe(sm, sm2, &ls->saved_messages, list) { hd = (struct dlm_header *)sm->buf; switch (sm->type) { case DLM_MSG_PLOCK: _receive_plock(ls, hd, sm->len); break; case DLM_MSG_PLOCK_OWN: _receive_own(ls, hd, sm->len); break; case DLM_MSG_PLOCK_DROP: _receive_drop(ls, hd, sm->len); break; case DLM_MSG_PLOCK_SYNC_LOCK: case DLM_MSG_PLOCK_SYNC_WAITER: _receive_sync(ls, hd, sm->len); break; default: continue; } list_del(&sm->list); free(sm); count++; } out: log_plock(ls, "process_saved_plocks %d done", count); } /* locks still marked SYNCING should not go into the ckpt; the new node will get those locks by receiving PLOCK_SYNC messages */ #define MAX_SEND_SIZE 1024 /* 1024 holds 24 plock_data */ static char send_buf[MAX_SEND_SIZE]; static int pack_send_buf(struct lockspace *ls, struct resource *r, int owner, int full, int *count_out, void **last) { struct resource_data *rd; struct plock_data *pp; struct posix_lock *po; struct lock_waiter *w; int count = 0; int find = 0; int len; /* N.B. 
owner not always equal to r->owner */ rd = (struct resource_data *)(send_buf + sizeof(struct dlm_header)); rd->number = cpu_to_le64(r->number); rd->owner = cpu_to_le32(owner); if (full) { rd->flags = RD_CONTINUE; find = 1; } /* plocks not replicated for owned resources */ if (opt(plock_ownership_ind) && (owner == our_nodeid)) goto done; len = sizeof(struct dlm_header) + sizeof(struct resource_data); pp = (struct plock_data *)(send_buf + sizeof(struct dlm_header) + sizeof(struct resource_data)); list_for_each_entry(po, &r->locks, list) { if (find && *last != po) continue; find = 0; if (po->flags & P_SYNCING) continue; if (len + sizeof(struct plock_data) > sizeof(send_buf)) { *last = po; goto full; } len += sizeof(struct plock_data); pp->start = cpu_to_le64(po->start); pp->end = cpu_to_le64(po->end); pp->owner = cpu_to_le64(po->owner); pp->pid = cpu_to_le32(po->pid); pp->nodeid = cpu_to_le32(po->nodeid); pp->ex = po->ex; pp->waiter = 0; pp++; count++; } list_for_each_entry(w, &r->waiters, list) { if (find && *last != w) continue; find = 0; if (w->flags & P_SYNCING) continue; if (len + sizeof(struct plock_data) > sizeof(send_buf)) { *last = w; goto full; } len += sizeof(struct plock_data); pp->start = cpu_to_le64(w->info.start); pp->end = cpu_to_le64(w->info.end); pp->owner = cpu_to_le64(w->info.owner); pp->pid = cpu_to_le32(w->info.pid); pp->nodeid = cpu_to_le32(w->info.nodeid); pp->ex = w->info.ex; pp->waiter = 1; pp++; count++; } done: rd->lock_count = cpu_to_le32(count); *count_out = count; *last = NULL; return 0; full: rd->lock_count = cpu_to_le32(count); *count_out = count; return 1; } /* Copy all plock state into a checkpoint so new node can retrieve it. The node creating the ckpt for the mounter needs to be the same node that's sending the mounter its journals message (i.e. the low nodeid). The new mounter knows the ckpt is ready to read only after it gets its journals message. If the mounter is becoming the new low nodeid in the group, the node doing the store closes the ckpt and the new node unlinks the ckpt after reading it. The ckpt should then disappear and the new node can create a new ckpt for the next mounter. */ static int send_plocks_data(struct lockspace *ls, uint32_t seq, char *buf, int len) { struct dlm_header *hd; hd = (struct dlm_header *)buf; hd->type = DLM_MSG_PLOCKS_DATA; hd->msgdata = seq; dlm_send_message(ls, buf, len); return 0; } void send_all_plocks_data(struct lockspace *ls, uint32_t seq, uint32_t *plocks_data) { struct resource *r; void *last; int owner, count, len, full; uint32_t send_count = 0; if (!opt(enable_plock_ind) || ls->disable_plock) return; log_dlock(ls, "send_all_plocks_data %d:%u", our_nodeid, seq); /* - If r owner is -1, ckpt nothing. - If r owner is us, ckpt owner of us and no plocks. - If r owner is other, ckpt that owner and any plocks we have on r (they've just been synced but owner=0 msg not recved yet). - If r owner is 0 and !got_unown, then we've just unowned r; ckpt owner of us and any plocks that don't have SYNCING set (plocks with SYNCING will be handled by our sync messages). 
- If r owner is 0 and got_unown, then ckpt owner 0 and all plocks; (there should be no SYNCING plocks) */ list_for_each_entry(r, &ls->plock_resources, list) { if (!opt(plock_ownership_ind)) owner = 0; else if (r->owner == -1) continue; else if (r->owner == our_nodeid) owner = our_nodeid; else if (r->owner) owner = r->owner; else if (!r->owner && !got_unown(r)) owner = our_nodeid; else if (!r->owner) owner = 0; else { log_elock(ls, "send_all_plocks_data error owner %d r %llx", r->owner, (unsigned long long)r->number); continue; } memset(&send_buf, 0, sizeof(send_buf)); count = 0; full = 0; last = NULL; do { full = pack_send_buf(ls, r, owner, full, &count, &last); len = sizeof(struct dlm_header) + sizeof(struct resource_data) + sizeof(struct plock_data) * count; log_plock(ls, "send_plocks_data %d:%u n %llu o %d locks %d len %d", our_nodeid, seq, (unsigned long long)r->number, r->owner, count, len); send_plocks_data(ls, seq, send_buf, len); send_count++; } while (full); } *plocks_data = send_count; log_dlock(ls, "send_all_plocks_data %d:%u %u done", our_nodeid, seq, send_count); } static void free_r_lists(struct resource *r) { struct posix_lock *po, *po2; struct lock_waiter *w, *w2; list_for_each_entry_safe(po, po2, &r->locks, list) { list_del(&po->list); free(po); } list_for_each_entry_safe(w, w2, &r->waiters, list) { list_del(&w->list); free(w); } } void receive_plocks_data(struct lockspace *ls, struct dlm_header *hd, int len) { struct resource_data *rd; struct plock_data *pp; struct posix_lock *po; struct lock_waiter *w; struct resource *r; uint64_t num; uint32_t count; uint32_t flags; int owner; int i; if (!opt(enable_plock_ind) || ls->disable_plock) return; if (!ls->need_plocks) return; if (!ls->save_plocks) return; ls->recv_plocks_data_count++; if (len < sizeof(struct dlm_header) + sizeof(struct resource_data)) { log_elock(ls, "recv_plocks_data %d:%u bad len %d", hd->nodeid, hd->msgdata, len); return; } rd = (struct resource_data *)((char *)hd + sizeof(struct dlm_header)); num = le64_to_cpu(rd->number); owner = le32_to_cpu(rd->owner); count = le32_to_cpu(rd->lock_count); flags = le32_to_cpu(rd->flags); if (flags & RD_CONTINUE) { r = search_resource(ls, num); if (!r) { log_elock(ls, "recv_plocks_data %d:%u n %llu not found", hd->nodeid, hd->msgdata, (unsigned long long)num); return; } log_plock(ls, "recv_plocks_data %d:%u n %llu continue", hd->nodeid, hd->msgdata, (unsigned long long)num); goto unpack; } r = malloc(sizeof(struct resource)); if (!r) { log_elock(ls, "recv_plocks_data %d:%u n %llu no mem", hd->nodeid, hd->msgdata, (unsigned long long)num); return; } memset(r, 0, sizeof(struct resource)); INIT_LIST_HEAD(&r->locks); INIT_LIST_HEAD(&r->waiters); INIT_LIST_HEAD(&r->pending); if (!opt(plock_ownership_ind)) { if (owner) { log_elock(ls, "recv_plocks_data %d:%u n %llu bad owner %d", hd->nodeid, hd->msgdata, (unsigned long long)num, owner); goto fail_free; } } else { if (!owner) r->flags |= R_GOT_UNOWN; /* no locks should be included for owned resources */ if (owner && count) { log_elock(ls, "recv_plocks_data %d:%u n %llu o %d bad count %" PRIu32, hd->nodeid, hd->msgdata, (unsigned long long)num, owner, count); goto fail_free; } } r->number = num; r->owner = owner; unpack: if (len < sizeof(struct dlm_header) + sizeof(struct resource_data) + sizeof(struct plock_data) * count) { log_elock(ls, "recv_plocks_data %d:%u count %u bad len %d", hd->nodeid, hd->msgdata, count, len); goto fail_free; } pp = (struct plock_data *)((char *)rd + sizeof(struct resource_data)); for (i = 0; i < count; 
i++) { if (!pp->waiter) { po = malloc(sizeof(struct posix_lock)); if (!po) goto fail_free; po->start = le64_to_cpu(pp->start); po->end = le64_to_cpu(pp->end); po->owner = le64_to_cpu(pp->owner); po->pid = le32_to_cpu(pp->pid); po->nodeid = le32_to_cpu(pp->nodeid); po->ex = pp->ex; po->flags = 0; list_add_tail(&po->list, &r->locks); } else { w = malloc(sizeof(struct lock_waiter)); if (!w) goto fail_free; w->info.start = le64_to_cpu(pp->start); w->info.end = le64_to_cpu(pp->end); w->info.owner = le64_to_cpu(pp->owner); w->info.pid = le32_to_cpu(pp->pid); w->info.nodeid = le32_to_cpu(pp->nodeid); w->info.ex = pp->ex; w->flags = 0; list_add_tail(&w->list, &r->waiters); } pp++; } log_plock(ls, "recv_plocks_data %d:%u n %llu o %d locks %d len %d", hd->nodeid, hd->msgdata, (unsigned long long)r->number, r->owner, count, len); if (!(flags & RD_CONTINUE)) { list_add_tail(&r->list, &ls->plock_resources); rb_insert_plock_resource(ls, r); } return; fail_free: if (!(flags & RD_CONTINUE)) { free_r_lists(r); free(r); } return; } void clear_plocks_data(struct lockspace *ls) { struct resource *r, *r2; uint32_t count = 0; if (!opt(enable_plock_ind) || ls->disable_plock) return; list_for_each_entry_safe(r, r2, &ls->plock_resources, list) { free_r_lists(r); rb_del_plock_resource(ls, r); list_del(&r->list); free(r); count++; } log_dlock(ls, "clear_plocks_data done %u recv_plocks_data_count %u", count, ls->recv_plocks_data_count); ls->recv_plocks_data_count = 0; } /* Called when a node has failed, or we're unmounting. For a node failure, we need to call this when the cpg confchg arrives so that we're guaranteed all nodes do this in the same sequence wrt other messages. */ void purge_plocks(struct lockspace *ls, int nodeid, int unmount) { struct posix_lock *po, *po2; struct lock_waiter *w, *w2; struct resource *r, *r2; int purged = 0; if (!opt(enable_plock_ind) || ls->disable_plock) return; list_for_each_entry_safe(r, r2, &ls->plock_resources, list) { list_for_each_entry_safe(po, po2, &r->locks, list) { if (po->nodeid == nodeid || unmount) { list_del(&po->list); free(po); purged++; } } list_for_each_entry_safe(w, w2, &r->waiters, list) { if (w->info.nodeid == nodeid || unmount) { list_del(&w->list); free(w); purged++; } } /* TODO: haven't thought carefully about how this transition to owner 0 might interact with other owner messages in progress. */ if (r->owner == nodeid) { r->owner = 0; r->flags |= R_GOT_UNOWN; r->flags |= R_PURGE_UNOWN; send_pending_plocks(ls, r); } do_waiters(ls, r); if (!opt(plock_ownership_ind) && list_empty(&r->locks) && list_empty(&r->waiters)) { rb_del_plock_resource(ls, r); list_del(&r->list); free(r); } } if (purged) ls->last_plock_time = monotime(); log_dlock(ls, "purged %d plocks for %d", purged, nodeid); } int copy_plock_state(struct lockspace *ls, char *buf, int *len_out) { struct posix_lock *po; struct lock_waiter *w; struct resource *r; struct timeval now; int rv = 0; int len = DLMC_DUMP_SIZE, pos = 0, ret; gettimeofday(&now, NULL); list_for_each_entry(r, &ls->plock_resources, list) { if (list_empty(&r->locks) && list_empty(&r->waiters) && list_empty(&r->pending)) { ret = snprintf(buf + pos, len - pos, "%llu rown %d unused_ms %llu\n", (unsigned long long)r->number, r->owner, (unsigned long long)time_diff_ms(&r->last_access, &now)); if (ret >= len - pos) { rv = -ENOSPC; goto out; } pos += ret; continue; } list_for_each_entry(po, &r->locks, list) { ret = snprintf(buf + pos, len - pos, "%llu %s %llu-%llu nodeid %d pid %u owner %llx rown %d\n", (unsigned long long)r->number, po->ex ? 
"WR" : "RD", (unsigned long long)po->start, (unsigned long long)po->end, po->nodeid, po->pid, (unsigned long long)po->owner, r->owner); if (ret >= len - pos) { rv = -ENOSPC; goto out; } pos += ret; } list_for_each_entry(w, &r->waiters, list) { ret = snprintf(buf + pos, len - pos, "%llu %s %llu-%llu nodeid %d pid %u owner %llx rown %d WAITING\n", (unsigned long long)r->number, w->info.ex ? "WR" : "RD", (unsigned long long)w->info.start, (unsigned long long)w->info.end, w->info.nodeid, w->info.pid, (unsigned long long)w->info.owner, r->owner); if (ret >= len - pos) { rv = -ENOSPC; goto out; } pos += ret; } list_for_each_entry(w, &r->pending, list) { ret = snprintf(buf + pos, len - pos, "%llu %s %llu-%llu nodeid %d pid %u owner %llx rown %d PENDING\n", (unsigned long long)r->number, w->info.ex ? "WR" : "RD", (unsigned long long)w->info.start, (unsigned long long)w->info.end, w->info.nodeid, w->info.pid, (unsigned long long)w->info.owner, r->owner); if (ret >= len - pos) { rv = -ENOSPC; goto out; } pos += ret; } } out: *len_out = pos; return rv; } dlm-4.3.0/dlm_controld/rbtree.c000066400000000000000000000400501461150666200164070ustar00rootroot00000000000000/* Copied from linux/lib/rbtree.c */ /* Red Black Trees (C) 1999 Andrea Arcangeli (C) 2002 David Woodhouse (C) 2012 Michel Lespinasse linux/lib/rbtree.c */ #include "rbtree_augmented.h" /* * red-black trees properties: https://en.wikipedia.org/wiki/Rbtree * * 1) A node is either red or black * 2) The root is black * 3) All leaves (NULL) are black * 4) Both children of every red node are black * 5) Every simple path from root to leaves contains the same number * of black nodes. * * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two * consecutive red nodes in a path and every red node is therefore followed by * a black. So if B is the number of black nodes on every simple path (as per * 5), then the longest possible path due to 4 is 2B. * * We shall indicate color with case, where black nodes are uppercase and red * nodes will be lowercase. Unknown color nodes shall be drawn as red within * parentheses and have some accompanying text comment. */ /* * Notes on lockless lookups: * * All stores to the tree structure (rb_left and rb_right) must be done using * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the * tree structure as seen in program order. * * These two requirements will allow lockless iteration of the tree -- not * correct iteration mind you, tree rotations are not atomic so a lookup might * miss entire subtrees. * * But they do guarantee that any such traversal will only see valid elements * and that it will indeed complete -- does not get stuck in a loop. * * It also guarantees that if the lookup returns an element it is the 'correct' * one. But not returning an element does _NOT_ mean it's not present. * * NOTE: * * Stores to __rb_parent_color are not important for simple lookups so those * are left undone as of now. Nor did I check for loops involving parent * pointers. */ static inline void rb_set_black(struct rb_node *rb) { rb->__rb_parent_color += RB_BLACK; } static inline struct rb_node *rb_red_parent(struct rb_node *red) { return (struct rb_node *)red->__rb_parent_color; } /* * Helper function for rotations: * - old's parent and color get assigned to new * - old gets assigned new as a parent and 'color' as a color. 
*/ static inline void __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, struct rb_root *root, int color) { struct rb_node *parent = rb_parent(old); new->__rb_parent_color = old->__rb_parent_color; rb_set_parent_color(old, new, color); __rb_change_child(old, new, parent, root); } static __always_inline void __rb_insert(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; while (true) { /* * Loop invariant: node is red. */ if (!parent) { /* * The inserted node is root. Either this is the * first node, or we recursed at Case 1 below and * are no longer violating 4). */ rb_set_parent_color(node, NULL, RB_BLACK); break; } /* * If there is a black parent, we are done. * Otherwise, take some corrective action as, * per 4), we don't want a red root or two * consecutive red nodes. */ if(rb_is_black(parent)) break; gparent = rb_red_parent(parent); tmp = gparent->rb_right; if (parent != tmp) { /* parent == gparent->rb_left */ if (tmp && rb_is_red(tmp)) { /* * Case 1 - node's uncle is red (color flips). * * G g * / \ / \ * p u --> P U * / / * n n * * However, since g's parent might be red, and * 4) does not allow this, we need to recurse * at g. */ rb_set_parent_color(tmp, gparent, RB_BLACK); rb_set_parent_color(parent, gparent, RB_BLACK); node = gparent; parent = rb_parent(node); rb_set_parent_color(node, parent, RB_RED); continue; } tmp = parent->rb_right; if (node == tmp) { /* * Case 2 - node's uncle is black and node is * the parent's right child (left rotate at parent). * * G G * / \ / \ * p U --> n U * \ / * n p * * This still leaves us in violation of 4), the * continuation into Case 3 will fix that. */ tmp = node->rb_left; WRITE_ONCE(parent->rb_right, tmp); WRITE_ONCE(node->rb_left, parent); if (tmp) rb_set_parent_color(tmp, parent, RB_BLACK); rb_set_parent_color(parent, node, RB_RED); augment_rotate(parent, node); parent = node; tmp = node->rb_right; } /* * Case 3 - node's uncle is black and node is * the parent's left child (right rotate at gparent). 
* * G P * / \ / \ * p U --> n g * / \ * n U */ WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */ WRITE_ONCE(parent->rb_right, gparent); if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); __rb_rotate_set_parents(gparent, parent, root, RB_RED); augment_rotate(gparent, parent); break; } else { tmp = gparent->rb_left; if (tmp && rb_is_red(tmp)) { /* Case 1 - color flips */ rb_set_parent_color(tmp, gparent, RB_BLACK); rb_set_parent_color(parent, gparent, RB_BLACK); node = gparent; parent = rb_parent(node); rb_set_parent_color(node, parent, RB_RED); continue; } tmp = parent->rb_left; if (node == tmp) { /* Case 2 - right rotate at parent */ tmp = node->rb_right; WRITE_ONCE(parent->rb_left, tmp); WRITE_ONCE(node->rb_right, parent); if (tmp) rb_set_parent_color(tmp, parent, RB_BLACK); rb_set_parent_color(parent, node, RB_RED); augment_rotate(parent, node); parent = node; tmp = node->rb_left; } /* Case 3 - left rotate at gparent */ WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */ WRITE_ONCE(parent->rb_left, gparent); if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); __rb_rotate_set_parents(gparent, parent, root, RB_RED); augment_rotate(gparent, parent); break; } } } /* * Inline version for rb_erase() use - we want to be able to inline * and eliminate the dummy_rotate callback there */ static __always_inline void ____rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; while (true) { /* * Loop invariants: * - node is black (or NULL on first iteration) * - node is not the root (parent is not NULL) * - All leaf paths going through parent and node have a * black node count that is 1 lower than other leaf paths. */ sibling = parent->rb_right; if (node != sibling) { /* node == parent->rb_left */ if (rb_is_red(sibling)) { /* * Case 1 - left rotate at parent * * P S * / \ / \ * N s --> p Sr * / \ / \ * Sl Sr N Sl */ tmp1 = sibling->rb_left; WRITE_ONCE(parent->rb_right, tmp1); WRITE_ONCE(sibling->rb_left, parent); rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); augment_rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_right; if (!tmp1 || rb_is_black(tmp1)) { tmp2 = sibling->rb_left; if (!tmp2 || rb_is_black(tmp2)) { /* * Case 2 - sibling color flip * (p could be either color here) * * (p) (p) * / \ / \ * N S --> N s * / \ / \ * Sl Sr Sl Sr * * This leaves us violating 5) which * can be fixed by flipping p to black * if it was red, or by recursing at p. * p is red when coming from Case 1. */ rb_set_parent_color(sibling, parent, RB_RED); if (rb_is_red(parent)) rb_set_black(parent); else { node = parent; parent = rb_parent(node); if (parent) continue; } break; } /* * Case 3 - right rotate at sibling * (p could be either color here) * * (p) (p) * / \ / \ * N S --> N sl * / \ \ * sl Sr S * \ * Sr * * Note: p might be red, and then both * p and sl are red after rotation(which * breaks property 4). 
This is fixed in * Case 4 (in __rb_rotate_set_parents() * which set sl the color of p * and set p RB_BLACK) * * (p) (sl) * / \ / \ * N sl --> P S * \ / \ * S N Sr * \ * Sr */ tmp1 = tmp2->rb_right; WRITE_ONCE(sibling->rb_left, tmp1); WRITE_ONCE(tmp2->rb_right, sibling); WRITE_ONCE(parent->rb_right, tmp2); if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); augment_rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } /* * Case 4 - left rotate at parent + color flips * (p and sl could be either color here. * After rotation, p becomes black, s acquires * p's color, and sl keeps its color) * * (p) (s) * / \ / \ * N S --> P Sr * / \ / \ * (sl) sr N (sl) */ tmp2 = sibling->rb_left; WRITE_ONCE(parent->rb_right, tmp2); WRITE_ONCE(sibling->rb_left, parent); rb_set_parent_color(tmp1, sibling, RB_BLACK); if (tmp2) rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); augment_rotate(parent, sibling); break; } else { sibling = parent->rb_left; if (rb_is_red(sibling)) { /* Case 1 - right rotate at parent */ tmp1 = sibling->rb_right; WRITE_ONCE(parent->rb_left, tmp1); WRITE_ONCE(sibling->rb_right, parent); rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); augment_rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_left; if (!tmp1 || rb_is_black(tmp1)) { tmp2 = sibling->rb_right; if (!tmp2 || rb_is_black(tmp2)) { /* Case 2 - sibling color flip */ rb_set_parent_color(sibling, parent, RB_RED); if (rb_is_red(parent)) rb_set_black(parent); else { node = parent; parent = rb_parent(node); if (parent) continue; } break; } /* Case 3 - left rotate at sibling */ tmp1 = tmp2->rb_left; WRITE_ONCE(sibling->rb_right, tmp1); WRITE_ONCE(tmp2->rb_left, sibling); WRITE_ONCE(parent->rb_left, tmp2); if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); augment_rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } /* Case 4 - right rotate at parent + color flips */ tmp2 = sibling->rb_right; WRITE_ONCE(parent->rb_left, tmp2); WRITE_ONCE(sibling->rb_right, parent); rb_set_parent_color(tmp1, sibling, RB_BLACK); if (tmp2) rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); augment_rotate(parent, sibling); break; } } } /* Non-inline version for rb_erase_augmented() use */ void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { ____rb_erase_color(parent, root, augment_rotate); } /* * Non-augmented rbtree manipulation functions. * * We use dummy augmented callbacks here, and have the compiler optimize them * out of the rb_insert_color() and rb_erase() function definitions. */ static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} static const struct rb_augment_callbacks dummy_callbacks = { .propagate = dummy_propagate, .copy = dummy_copy, .rotate = dummy_rotate }; void rb_insert_color(struct rb_node *node, struct rb_root *root) { __rb_insert(node, root, dummy_rotate); } void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *rebalance; rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); if (rebalance) ____rb_erase_color(rebalance, root, dummy_rotate); } /* * Augmented rbtree manipulation functions. 
* * This instantiates the same __always_inline functions as in the non-augmented * case, but this time with user-defined callbacks. */ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { __rb_insert(node, root, augment_rotate); } /* * This function returns the first node (in sort order) of the tree. */ struct rb_node *rb_first(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_left) n = n->rb_left; return n; } struct rb_node *rb_last(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_right) n = n->rb_right; return n; } struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; if (RB_EMPTY_NODE(node)) return NULL; /* * If we have a right-hand child, go down and then left as far * as we can. */ if (node->rb_right) { node = node->rb_right; while (node->rb_left) node = node->rb_left; return (struct rb_node *)node; } /* * No right-hand children. Everything down and left is smaller than us, * so any 'next' node must be in the general direction of our parent. * Go up the tree; any time the ancestor is a right-hand child of its * parent, keep going up. First time it's a left-hand child of its * parent, said parent is our 'next' node. */ while ((parent = rb_parent(node)) && node == parent->rb_right) node = parent; return parent; } struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; if (RB_EMPTY_NODE(node)) return NULL; /* * If we have a left-hand child, go down and then right as far * as we can. */ if (node->rb_left) { node = node->rb_left; while (node->rb_right) node = node->rb_right; return (struct rb_node *)node; } /* * No left-hand children. Go up till we find an ancestor which * is a right-hand child of its parent. 
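 *
 * A usage sketch with illustrative names ("visit" and "struct mything"
 * are not part of this file): a full in-order walk pairs rb_first()
 * with rb_next(); rb_last() with rb_prev() gives the reverse order.
 *
 *   struct rb_node *n;
 *
 *   for (n = rb_first(root); n; n = rb_next(n))
 *           visit(rb_entry(n, struct mything, node));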
*/ while ((parent = rb_parent(node)) && node == parent->rb_left) node = parent; return parent; } void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root) { struct rb_node *parent = rb_parent(victim); /* Copy the pointers/colour from the victim to the replacement */ *new = *victim; /* Set the surrounding nodes to point to the replacement */ if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) rb_set_parent(victim->rb_right, new); __rb_change_child(victim, new, parent, root); } static struct rb_node *rb_left_deepest_node(const struct rb_node *node) { for (;;) { if (node->rb_left) node = node->rb_left; else if (node->rb_right) node = node->rb_right; else return (struct rb_node *)node; } } struct rb_node *rb_next_postorder(const struct rb_node *node) { const struct rb_node *parent; if (!node) return NULL; parent = rb_parent(node); /* If we're sitting on node, we've already seen our children */ if (parent && node == parent->rb_left && parent->rb_right) { /* If we are the parent's left node, go to the parent's right * node then all the way down to the left */ return rb_left_deepest_node(parent->rb_right); } else /* Otherwise we are the parent's right node, and the parent * should be next */ return (struct rb_node *)parent; } struct rb_node *rb_first_postorder(const struct rb_root *root) { if (!root->rb_node) return NULL; return rb_left_deepest_node(root->rb_node); } dlm-4.3.0/dlm_controld/rbtree.h000066400000000000000000000211101461150666200164100ustar00rootroot00000000000000/* Copied from linux/include/linux/rbtree.h */ /* Red Black Trees (C) 1999 Andrea Arcangeli linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This avoids callbacks and the dramatic performance cost they would impose. It's not the cleanest way, but it is how you get performance and genericity in C (as opposed to C++)... See Documentation/core-api/rbtree.rst for documentation and samples.
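 *
 * A minimal sketch of such a caller-written search core, assuming a
 * hypothetical struct mything keyed by an int (none of these names are
 * part of this header):
 *
 *   struct mything {
 *           struct rb_node node;
 *           int key;
 *   };
 *
 *   static struct mything *my_search(struct rb_root *root, int key)
 *   {
 *           struct rb_node *n = root->rb_node;
 *
 *           while (n) {
 *                   struct mything *t = rb_entry(n, struct mything, node);
 *
 *                   if (key < t->key)
 *                           n = n->rb_left;
 *                   else if (key > t->key)
 *                           n = n->rb_right;
 *                   else
 *                           return t;
 *           }
 *           return NULL;
 *   }
 *
 * The matching insert core descends the same way to find the link to
 * set, then calls rb_link_node() and rb_insert_color().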
*/ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include #include #include "linux_helpers.h" #include "rbtree_types.h" #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. 
The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? node : NULL; } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node(node, parent, link); rb_insert_color(node, tree); } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. 
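 *
 * A hedged usage sketch (my_cmp, process, and struct mything are
 * illustrative names, not part of this API): to visit every node that
 * compares equal to @key, pair this with rb_next_match(), or use the
 * rb_for_each() helper defined below:
 *
 *   struct rb_node *n;
 *
 *   rb_for_each(n, key, tree, my_cmp)
 *           process(rb_entry(n, struct mything, node));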
*/ static __always_inline struct rb_node * rb_find_first(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; struct rb_node *match = NULL; while (node) { int c = cmp(key, node); if (c <= 0) { if (!c) match = node; node = node->rb_left; } else if (c > 0) { node = node->rb_right; } } return match; } /** * rb_next_match() - find the next @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the next node matching @key, or NULL. */ static __always_inline struct rb_node * rb_next_match(const void *key, struct rb_node *node, int (*cmp)(const void *key, const struct rb_node *)) { node = rb_next(node); if (node && cmp(key, node)) node = NULL; return node; } /** * rb_for_each() - iterates a subtree matching @key * @node: iterator * @key: key to match * @tree: tree to search * @cmp: operator defining node order */ #define rb_for_each(node, key, tree, cmp) \ for ((node) = rb_find_first((key), (tree), (cmp)); \ (node); (node) = rb_next_match((key), (node), (cmp))) #endif /* _LINUX_RBTREE_H */ dlm-4.3.0/dlm_controld/rbtree_augmented.h000066400000000000000000000222641461150666200204540ustar00rootroot00000000000000/* Copied from linux/include/linux/rbtree_augmented.h */ /* Red Black Trees (C) 1999 Andrea Arcangeli (C) 2002 David Woodhouse (C) 2012 Michel Lespinasse linux/include/linux/rbtree_augmented.h */ #ifndef _LINUX_RBTREE_AUGMENTED_H #define _LINUX_RBTREE_AUGMENTED_H #include "rbtree.h" #include "linux_helpers.h" /* * Please note - only struct rb_augment_callbacks and the prototypes for * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * * See Documentation/core-api/rbtree.rst for documentation and samples. */ struct rb_augment_callbacks { void (*propagate)(struct rb_node *node, struct rb_node *stop); void (*copy)(struct rb_node *old, struct rb_node *new); void (*rotate)(struct rb_node *old, struct rb_node *new); }; extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and * rb_insert_augmented() instead of the usual rb_insert_color() call. * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees. 
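 *
 * A minimal sketch of that insertion sequence, under assumed names
 * (my_less, my_callbacks, and struct mynode with an embedded rb_node
 * field named rb are illustrative, not part of this header):
 *
 *   struct rb_node **link = &root->rb_node, *parent = NULL;
 *
 *   while (*link) {
 *           parent = *link;
 *           (update parent's augmented data here, accounting for the
 *            node about to be inserted beneath it)
 *           if (my_less(new, rb_entry(parent, struct mynode, rb)))
 *                   link = &parent->rb_left;
 *           else
 *                   link = &parent->rb_right;
 *   }
 *   rb_link_node(&new->rb, parent, link);
 *   rb_insert_augmented(&new->rb, root, &my_callbacks);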
*/ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { __rb_insert_augmented(node, root, augment->rotate); } static inline void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { if (newleft) root->rb_leftmost = node; rb_insert_augmented(node, &root->rb_root, augment); } /* * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ #define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ if (RBCOMPUTE(node, true)) \ break; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ .copy = RBNAME ## _copy, \ .rotate = RBNAME ## _rotate \ }; /* * Template for declaring augmented rbtree callbacks, * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. 
* * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBTYPE: type of the RBAUGMENTED field * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar */ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ if (node->RBFIELD.rb_left) { \ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (node->RBFIELD.rb_right) { \ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (exit && node->RBAUGMENTED == max) \ return true; \ node->RBAUGMENTED = max; \ return false; \ } \ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 #define RB_BLACK 1 #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) #define __rb_color(pc) ((pc) & 1) #define __rb_is_black(pc) __rb_color(pc) #define __rb_is_red(pc) (!__rb_color(pc)) #define rb_color(rb) __rb_color((rb)->__rb_parent_color) #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { rb->__rb_parent_color = (unsigned long)p + color; } static inline void __rb_change_child(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) WRITE_ONCE(parent->rb_left, new); else WRITE_ONCE(parent->rb_right, new); } else WRITE_ONCE(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) * * Note that if there is one child it must be red due to 5) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { child->__rb_parent_color = pc; rebalance = NULL; } else rebalance = __rb_is_black(pc) ? 
parent : NULL; tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; if (!tmp) { /* * Case 2: node's successor is its right child * * (n) (s) * / \ / \ * (x) (s) -> (x) (c) * \ * (c) */ parent = successor; child2 = successor->rb_right; augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under * node's right child subtree * * (n) (s) * / \ / \ * (x) (y) -> (x) (y) * / / * (p) (p) * / / * (s) (c) * \ * (c) */ do { parent = successor; successor = tmp; tmp = tmp->rb_left; } while (tmp); child2 = successor->rb_right; WRITE_ONCE(parent->rb_left, child2); WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); augment->copy(node, successor); augment->propagate(parent, successor); } tmp = node->rb_left; WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); if (child2) { rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { rebalance = rb_is_black(successor) ? parent : NULL; } successor->__rb_parent_color = pc; tmp = successor; } augment->propagate(tmp, NULL); return rebalance; } static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { if (root->rb_leftmost == node) root->rb_leftmost = rb_next(node); rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _LINUX_RBTREE_AUGMENTED_H */ dlm-4.3.0/dlm_controld/rbtree_types.h000066400000000000000000000016671461150666200176530ustar00rootroot00000000000000/* Copied from linux/include/linux/rbtree_types.h */ #ifndef _LINUX_RBTREE_TYPES_H #define _LINUX_RBTREE_TYPES_H struct rb_node { unsigned long __rb_parent_color; struct rb_node *rb_right; struct rb_node *rb_left; } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ struct rb_root { struct rb_node *rb_node; }; /* * Leftmost-cached rbtrees. * * We do not cache the rightmost node based on footprint * size vs number of potential users that could benefit * from O(1) rb_last(). Just not worth it, users that want * this feature can always implement the logic explicitly. * Furthermore, users that want to cache both pointers may * find it a bit asymmetric, but that's ok. 
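 *
 * A hedged usage sketch (my_tree is an illustrative name): declare the
 * cached root and read the minimum in O(1):
 *
 *   struct rb_root_cached my_tree = RB_ROOT_CACHED;
 *   struct rb_node *min = rb_first_cached(&my_tree);
 *
 * This stays correct only while insertions and erasures go through the
 * _cached variants (rb_insert_color_cached(), rb_erase_cached()) so
 * that rb_leftmost is kept up to date.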
*/ struct rb_root_cached { struct rb_root rb_root; struct rb_node *rb_leftmost; }; #define RB_ROOT (struct rb_root) { NULL, } #define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } #endif dlm-4.3.0/dlm_tool/000077500000000000000000000000001461150666200141125ustar00rootroot00000000000000dlm-4.3.0/dlm_tool/Makefile000066400000000000000000000022601461150666200155520ustar00rootroot00000000000000DESTDIR= PREFIX=/usr BINDIR=$(PREFIX)/sbin MANDIR=$(PREFIX)/share/man BIN_TARGET = dlm_tool MAN_TARGET = dlm_tool.8 BIN_SOURCE = main.c CFLAGS += -D_GNU_SOURCE -O2 -ggdb \ -Wall -Wformat -Wformat-security -Wmissing-prototypes -Wnested-externs \ -Wpointer-arith -Wextra -Wshadow -Wcast-align -Wwrite-strings \ -Waggregate-return -Wstrict-prototypes -Winline -Wredundant-decls \ -Wno-sign-compare -Wno-unused-parameter -Wp,-D_FORTIFY_SOURCE=2 \ -fexceptions -fasynchronous-unwind-tables -fdiagnostics-show-option \ -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong \ -fstack-clash-protection CFLAGS += -fPIE -DPIE CFLAGS += -I../include -I../libdlm -I../dlm_controld LDFLAGS += -Wl,-z,relro -Wl,-z,now -pie LDFLAGS += -L../libdlm -L../dlm_controld LDFLAGS += -lpthread -ldlm -ldlmcontrol all: $(BIN_TARGET) $(BIN_TARGET): $(BIN_SOURCE) $(CC) $(BIN_SOURCE) $(CFLAGS) $(LDFLAGS) -o $@ clean: rm -f *.o *.so *.so.* $(BIN_TARGET) INSTALL=$(shell which install) .PHONY: install install: all $(INSTALL) -d $(DESTDIR)/$(BINDIR) $(INSTALL) -d $(DESTDIR)/$(MANDIR)/man8 $(INSTALL) -c -m 755 $(BIN_TARGET) $(DESTDIR)/$(BINDIR) $(INSTALL) -m 644 $(MAN_TARGET) $(DESTDIR)/$(MANDIR)/man8/ dlm-4.3.0/dlm_tool/dlm_tool.8000066400000000000000000000110131461150666200160100ustar00rootroot00000000000000.TH DLM_TOOL 8 2012-04-05 dlm dlm .SH NAME dlm_tool \- a utility for the dlm and dlm_controld daemon .SH SYNOPSIS .B dlm_tool [COMMAND] [OPTIONS] [ .I name ] .SH COMMANDS .B ls .br Display dlm_controld internal lockspace state. .B status .br Dump dlm_controld daemon state. .B dump .br Dump dlm_controld debug buffer. .B dump_config .br Dump dlm_controld config settings. .B reload_config .br Reload dlm_controld config settings from dlm.conf. .BI set_config " setting" = "value" .br Set dlm_controld config settings in the currently running daemon. .BI fence_ack " nodeid" .br Cancel a waiting fencing operation and consider it successful. .B log_plock .br Dump dlm_controld plock debug buffer. .BI plocks " lockspace_name" .br Dump posix locks from dlm_controld for the lockspace. .BI join " lockspace_name" .br Join a lockspace. .BI leave " lockspace_name" .br Leave a lockspace. .BI joinleave " lockspace_name" .br Join then immediately leave a lockspace (for testing only.) .BI lockdebug " lockspace_name" .br Complete display of locks from the lockspace. .BI lockdump " lockspace_name" .br Minimal display of locks from the lockspace (deprecated). .BI run " command" .br Run command and check for result (for lvmlockd only.) .BI run_start " command" .br Run command and don't check for result. .BI run_check " uuid" .br Check status of a command that was run. .BI run_cancel " uuid" .br Cancel a command that was run and is not yet done. .BI run_list .br Dump a list of running commands. .SH OPTIONS .B \-n Show all node information from dlm_tool ls. .B \-d 0|1 Resource directory off/on in join, default 0 .B \-e 0|1 Exclusive create off/on in join, default 0 .B \-f 0|1 FS (filesystem) flag off/on in join, default 0 .BI \-i " sec" Seconds to wait in run_check. 
.BI \-m " mode" Permission mode for lockspace device (octal), default 0600 .B \-s Summary following lockdebug output (experimental) .B \-v Verbose lockdebug or status output .B \-w Wide lockdebug output .B \-M Include MSTCPY locks in lockdump output .B \-h Print help, then exit .B \-V Print program version information, then exit .SH USAGE .SS fence_ack See dlm_tool status for information about waiting fencing operations for specific nodeids. .SS set_config dlm_tool set_config can change certain config settings in the currently running dlm_controld. Supported options: .nf daemon_debug log_debug debug_logfile plock_debug plock_rate_limit drop_resources_time drop_resources_count drop_resources_age post_join_delay enable_quorum_fencing enable_quorum_lockspace repeat_failed_fencing .fi Special cases to revert a previous set_config and restore the previous value (from default or dlm.conf): \fBdlm_tool set_config\fP \fIsetting\fP=\fBrestore\fP .br restores a single setting. \fBdlm_tool set_config restore_all\fP .br restores all settings. .SS dump_config A config setting may have been set from: the default, the dlm_controld command line, dlm.conf at startup, dlm.conf from reload_config, dlm_tool set_config. The dump_config output indicates how values were set: .TP setting=value default value. .TP setting=value (cli option) Set from a dlm_controld command line option. .TP setting=value (dlm.conf) Set from dlm.conf (at startup or reload). .TP setting=value (set_config) Set from dlm_tool set_config. .SH EXAMPLES .SS dump_config .nf $ dlm_tool dump_config | head -n 5 daemon_debug=1 (set_config) foreground=0 log_debug=1 (dlm.conf) protocol=detect .fi In this case, daemon_debug is set by set_config, log_debug is set from dlm.conf, and foreground and protocol use default values. .SS set_config .nf $ dlm_tool dump_config | grep log_debug log_debug=1 (dlm.conf) $ dlm_tool set_config "log_debug=0" set_config done $ dlm_tool dump_config | grep log_debug log_debug=0 (set_config) $ dlm_tool set_config "log_debug=restore" set_config done $ dlm_tool dump_config | grep log_debug log_debug=1 (dlm.conf) .fi log_debug is initially set to 1 by dlm.conf; set_config changes it to 0, and restore reverts it to the dlm.conf value. .SS set_config .nf $ dlm_tool dump_config | grep _debug daemon_debug=0 log_debug=1 (dlm.conf) plock_debug=0 $ dlm_tool set_config "daemon_debug=1 log_debug=1 plock_debug=1" set_config done $ dlm_tool dump_config | grep _debug daemon_debug=1 (set_config) log_debug=1 (set_config) plock_debug=1 (set_config) $ dlm_tool set_config "restore_all" set_config done $ dlm_tool dump_config | grep _debug daemon_debug=0 log_debug=1 (dlm.conf) plock_debug=0 .fi .SH SEE ALSO .BR dlm_controld (8), .BR dlm.conf (5) dlm-4.3.0/dlm_tool/main.c000066400000000000000000001047211461150666200152070ustar00rootroot00000000000000/* * Copyright 2004-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libdlm.h" #include "libdlmcontrol.h" #include "copyright.cf" #include "version.cf" #define LKM_IVMODE -1 #define OP_JOIN 1 #define OP_LEAVE 2 #define OP_JOINLEAVE 3 #define OP_LIST 4 #define OP_DEADLOCK_CHECK 5 #define OP_DUMP 6 #define OP_PLOCKS 7 #define OP_LOCKDUMP 8 #define OP_LOCKDEBUG 9 #define OP_LOG_PLOCK 10 #define OP_FENCE_ACK 11 #define OP_STATUS 12 #define OP_DUMP_CONFIG 13 #define OP_RUN 14 #define OP_RUN_START 15 #define OP_RUN_CHECK 16 #define OP_RUN_CANCEL 17 #define OP_RUN_LIST 18 #define OP_DUMP_RUN 19 #define OP_RELOAD_CONFIG 20 #define OP_SET_CONFIG 21 static char *prog_name; static char *lsname; static int operation; static int opt_ind; static int ls_all_nodes = 0; static int opt_excl = 0; static int opt_fs = 0; static int dump_mstcpy = 0; static mode_t create_mode = 0600; static int verbose; static int wide; static int wait_sec; static int summarize; char run_command[DLMC_RUN_COMMAND_LEN]; char run_uuid[DLMC_RUN_UUID_LEN]; #define MAX_NODES 128 /* from linux/fs/dlm/dlm_internal.h */ #define DLM_LKSTS_WAITING 1 #define DLM_LKSTS_GRANTED 2 #define DLM_LKSTS_CONVERT 3 #define DLM_MSG_REQUEST 1 #define DLM_MSG_CONVERT 2 #define DLM_MSG_UNLOCK 3 #define DLM_MSG_CANCEL 4 #define DLM_MSG_REQUEST_REPLY 5 #define DLM_MSG_CONVERT_REPLY 6 #define DLM_MSG_UNLOCK_REPLY 7 #define DLM_MSG_CANCEL_REPLY 8 #define DLM_MSG_GRANT 9 #define DLM_MSG_BAST 10 #define DLM_MSG_LOOKUP 11 #define DLM_MSG_REMOVE 12 #define DLM_MSG_LOOKUP_REPLY 13 #define DLM_MSG_PURGE 14 struct dlmc_node nodes[MAX_NODES]; struct rinfo { int print_granted; int print_convert; int print_waiting; int print_lookup; int namelen; int nodeid; int lvb; unsigned int lkb_count; unsigned int lkb_granted; unsigned int lkb_convert; unsigned int lkb_waiting; unsigned int lkb_lookup; unsigned int lkb_wait_msg; unsigned int lkb_master_copy; unsigned int lkb_local_copy; unsigned int lkb_process_copy; }; struct summary { unsigned int rsb_total; unsigned int rsb_with_lvb; unsigned int rsb_no_locks; unsigned int rsb_lookup; unsigned int rsb_master; unsigned int rsb_local; unsigned int rsb_nodeid_error; unsigned int lkb_count; unsigned int lkb_granted; unsigned int lkb_convert; unsigned int lkb_waiting; unsigned int lkb_lookup; unsigned int lkb_wait_msg; unsigned int lkb_master_copy; unsigned int lkb_local_copy; unsigned int lkb_process_copy; unsigned int expect_replies; unsigned int toss_total; }; static const char *mode_str(int mode) { switch (mode) { case -1: return "IV"; case LKM_NLMODE: return "NL"; case LKM_CRMODE: return "CR"; case LKM_CWMODE: return "CW"; case LKM_PRMODE: return "PR"; case LKM_PWMODE: return "PW"; case LKM_EXMODE: return "EX"; } return "??"; } static const char *msg_str(int type) { switch (type) { case DLM_MSG_REQUEST: return "request"; case DLM_MSG_CONVERT: return "convert"; case DLM_MSG_UNLOCK: return "unlock "; case DLM_MSG_CANCEL: return "cancel "; case DLM_MSG_REQUEST_REPLY: return "r_reply"; case DLM_MSG_CONVERT_REPLY: return "c_reply"; case DLM_MSG_UNLOCK_REPLY: return "u_reply"; case DLM_MSG_CANCEL_REPLY: return "c_reply"; case DLM_MSG_GRANT: return "grant "; case DLM_MSG_BAST: return "bast "; case DLM_MSG_LOOKUP: return "lookup "; case DLM_MSG_REMOVE: return "remove "; case DLM_MSG_LOOKUP_REPLY: return "l_reply"; case DLM_MSG_PURGE: return "purge "; default: return "unknown"; } } static void print_usage(void) { printf("Usage:\n"); printf("\n"); printf("dlm_tool 
[command] [options] [name]\n"); printf("\n"); printf("Commands:\n"); printf("ls, status, dump, fence_ack\n"); printf("dump_config, reload_config, set_config\n"); printf("log_plock, plocks\n"); printf("join, leave, lockdebug\n"); printf("run, run_start, run_check, run_cancel, run_list\n"); printf("\n"); printf("Options:\n"); printf(" -n Show all node information in ls\n"); printf(" -e 0|1 Exclusive create off/on in join, default 0\n"); printf(" -f 0|1 FS (filesystem) flag off/on in join, default 0\n"); printf(" -m Permission mode for lockspace device (octal), default 0600\n"); printf(" -s Summary following lockdebug output (experimental)\n"); printf(" -v Verbose lockdebug output\n"); printf(" -w Wide lockdebug output\n"); printf(" -i Wait for .\n"); printf(" -h Print help, then exit\n"); printf(" -V Print program version information, then exit\n"); printf("\n"); } #define OPTION_STRING "MhVnm:e:f:vwsi:" static void decode_arguments(int argc, char **argv) { int cont = 1; int optchar; int need_lsname; int need_command = 0; int need_uuid = 0; int optional_lsname = 0; int i; char modebuf[8]; while (cont) { optchar = getopt(argc, argv, OPTION_STRING); switch (optchar) { case 'e': opt_excl = atoi(optarg); break; case 'f': opt_fs = atoi(optarg); break; case 'm': memset(modebuf, 0, sizeof(modebuf)); snprintf(modebuf, 8, "%s", optarg); sscanf(modebuf, "%o", &create_mode); break; case 'M': dump_mstcpy = 1; break; case 'n': ls_all_nodes = 1; break; case 's': summarize = 1; break; case 'v': verbose = 1; break; case 'w': wide = 1; break; case 'i': wait_sec = atoi(optarg); break; case 'h': print_usage(); exit(EXIT_SUCCESS); break; case 'V': printf("%s %s (built %s %s)\n", prog_name, RELEASE_VERSION, __DATE__, __TIME__); printf("%s\n", REDHAT_COPYRIGHT); exit(EXIT_SUCCESS); break; case ':': case '?': fprintf(stderr, "Please use '-h' for usage.\n"); exit(EXIT_FAILURE); break; case EOF: cont = 0; break; default: fprintf(stderr, "unknown option: %c\n", optchar); exit(EXIT_FAILURE); break; }; } need_lsname = 1; while (optind < argc) { /* * libdlm */ if (!strncmp(argv[optind], "join", 4) && (strlen(argv[optind]) == 4)) { operation = OP_JOIN; opt_ind = optind + 1; break; } else if (!strncmp(argv[optind], "leave", 5) && (strlen(argv[optind]) == 5)) { operation = OP_LEAVE; opt_ind = optind + 1; break; } else if (!strncmp(argv[optind], "joinleave", 9) && (strlen(argv[optind]) == 9)) { operation = OP_JOINLEAVE; opt_ind = optind + 1; break; } /* * libdlmcontrol */ else if (!strncmp(argv[optind], "ls", 2) && (strlen(argv[optind]) == 2)) { operation = OP_LIST; opt_ind = optind + 1; need_lsname = 0; optional_lsname = 1; break; } else if (!strncmp(argv[optind], "status", 6) && (strlen(argv[optind]) == 6)) { operation = OP_STATUS; opt_ind = optind + 1; need_lsname = 0; break; } else if (!strncmp(argv[optind], "deadlock_check", 14) && (strlen(argv[optind]) == 14)) { operation = OP_DEADLOCK_CHECK; opt_ind = optind + 1; break; } else if (!strncmp(argv[optind], "fence_ack", 9) && (strlen(argv[optind]) == 9)) { operation = OP_FENCE_ACK; opt_ind = optind + 1; break; } else if (!strncmp(argv[optind], "dump", 4) && (strlen(argv[optind]) == 4)) { operation = OP_DUMP; opt_ind = optind + 1; need_lsname = 0; break; } else if (!strncmp(argv[optind], "dump_config", 11) && (strlen(argv[optind]) == 11)) { operation = OP_DUMP_CONFIG; opt_ind = optind + 1; need_lsname = 0; break; } else if (!strncmp(argv[optind], "reload_config", 13) && (strlen(argv[optind]) == 13)) { operation = OP_RELOAD_CONFIG; opt_ind = optind + 1; need_lsname = 0; 
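/* reload_config is daemon-wide; it takes no lockspace name argument */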
break; } else if (!strncmp(argv[optind], "set_config", 10) && (strlen(argv[optind]) == 10)) { operation = OP_SET_CONFIG; opt_ind = optind + 1; need_lsname = 0; need_command = 1; break; } else if (!strncmp(argv[optind], "plocks", 6) && (strlen(argv[optind]) == 6)) { operation = OP_PLOCKS; opt_ind = optind + 1; break; } else if (!strncmp(argv[optind], "log_plock", 9) && (strlen(argv[optind]) == 9)) { operation = OP_LOG_PLOCK; opt_ind = optind + 1; need_lsname = 0; break; } else if (!strcmp(argv[optind], "run")) { operation = OP_RUN; opt_ind = optind + 1; need_lsname = 0; need_command = 1; break; } else if (!strcmp(argv[optind], "run_start")) { operation = OP_RUN_START; opt_ind = optind + 1; need_lsname = 0; need_command = 1; break; } else if (!strcmp(argv[optind], "run_check")) { operation = OP_RUN_CHECK; opt_ind = optind + 1; need_lsname = 0; need_uuid = 1; break; } else if (!strcmp(argv[optind], "run_cancel")) { operation = OP_RUN_CANCEL; opt_ind = optind + 1; need_lsname = 0; need_uuid = 1; break; } else if (!strcmp(argv[optind], "run_list")) { operation = OP_DUMP_RUN; opt_ind = optind + 1; need_lsname = 0; need_uuid = 0; break; } /* * debugfs */ else if (!strncmp(argv[optind], "lockdump", 8) && (strlen(argv[optind]) == 8)) { operation = OP_LOCKDUMP; opt_ind = optind + 1; break; } else if (!strncmp(argv[optind], "lockdebug", 9) && (strlen(argv[optind]) == 9)) { operation = OP_LOCKDEBUG; opt_ind = optind + 1; break; } optind++; } if (!operation || !opt_ind) { print_usage(); exit(EXIT_FAILURE); } /* * The operation does not require an arg, but may * accept an optional arg, i.e. dlm_tool ls [name] */ if (!need_lsname && !need_uuid && !need_command) { if (optind < argc - 1) { if (optional_lsname) lsname = argv[opt_ind]; } return; } if (optind < argc - 1) { if (need_lsname) lsname = argv[opt_ind]; else if (need_uuid) { strncpy(run_uuid, argv[opt_ind], DLMC_RUN_UUID_LEN - 1); run_uuid[DLMC_RUN_UUID_LEN - 1] = '\0'; } else if (need_command) goto copy_command; } else if (need_lsname) { if (operation == OP_FENCE_ACK) fprintf(stderr, "nodeid required\n"); else fprintf(stderr, "lockspace name required\n"); exit(EXIT_FAILURE); } else if (need_command) { fprintf(stderr, "command required\n"); exit(EXIT_FAILURE); } else if (need_uuid) { fprintf(stderr, "uuid required\n"); exit(EXIT_FAILURE); } else { fprintf(stderr, "missing arg\n"); exit(EXIT_FAILURE); } copy_command: for (i = opt_ind; i < argc; i++) { if (strlen(run_command) + strlen(argv[i]) + 2 > DLMC_RUN_COMMAND_LEN) { fprintf(stderr, "command too long (max %d)\n", DLMC_RUN_COMMAND_LEN); exit(EXIT_FAILURE); } if (strlen(argv[i])) { strcat(run_command, argv[i]); strcat(run_command, " "); } } } static int do_write(int fd, void *buf, size_t count) { int rv, off = 0; retry: rv = write(fd, (char *)buf + off, count); if (rv == -1 && errno == EINTR) goto retry; if (rv < 0) return rv; if (rv != count) { count -= rv; off += rv; goto retry; } return 0; } static char *flag_str(uint32_t flags) { static char join_flags[128]; memset(join_flags, 0, sizeof(join_flags)); strcat(join_flags, "flags "); if (flags & DLM_LSFL_NEWEXCL) strcat(join_flags, "NEWEXCL "); if (flags & DLM_LSFL_FS) strcat(join_flags, "FS "); return join_flags; } static void do_join(char *name) { dlm_lshandle_t *dh; uint32_t flags = 0; if (opt_excl) flags |= DLM_LSFL_NEWEXCL; if (opt_fs) flags |= DLM_LSFL_FS; printf("Joining lockspace \"%s\" permission %o %s\n", name, create_mode, flags ? 
flag_str(flags) : ""); fflush(stdout); dh = dlm_new_lockspace(name, create_mode, flags); if (!dh) { fprintf(stderr, "dlm_new_lockspace %s error %d\n", name, errno); exit(-1); } dlm_close_lockspace(dh); /* there's no autofree so the ls should stay around */ printf("done\n"); } static void do_leave(char *name) { dlm_lshandle_t *dh; printf("Leaving lockspace \"%s\"\n", name); fflush(stdout); dh = dlm_open_lockspace(name); if (!dh) { fprintf(stderr, "dlm_open_lockspace %s error %p %d\n", name, dh, errno); exit(-1); } dlm_release_lockspace(name, dh, 1); printf("done\n"); } static char *pr_master(int nodeid) { static char buf[64]; memset(buf, 0, sizeof(buf)); if (nodeid > 0) sprintf(buf, "Master:%d", nodeid); else if (!nodeid) sprintf(buf, "Master"); else if (nodeid == -1) sprintf(buf, "Lookup"); return buf; } static char *pr_extra(uint32_t flags, int root_list, int recover_list, int recover_locks_count, char *first_lkid) { static char buf[128]; int first = 0; memset(buf, 0, sizeof(buf)); if (strcmp(first_lkid, "0")) first = 1; if (flags || first || root_list || recover_list || recover_locks_count) sprintf(buf, "flags %08x first_lkid %s root %d recover %d locks %d", flags, first_lkid, root_list, recover_list, recover_locks_count); return buf; } static void print_rsb(char *line, struct rinfo *ri) { char type[4], namefmt[4], *p; char addr[64]; char first_lkid[64]; int rv, nodeid, root_list, recover_list, recover_locks_count, namelen; uint32_t flags; rv = sscanf(line, "%s %s %d %s %u %d %d %u %u %s", type, addr, &nodeid, first_lkid, &flags, &root_list, &recover_list, &recover_locks_count, &namelen, namefmt); if (rv != 10) goto fail; /* used for lkb prints */ ri->nodeid = nodeid; ri->namelen = namelen; p = strchr(line, '\n'); if (!p) goto fail; *p = '\0'; p = strstr(line, namefmt); if (!p) goto fail; p += 4; strcat(addr, " "); if (!strncmp(namefmt, "str", 3)) printf("Resource len %2d \"%s\"\n", namelen, p); else if (!strncmp(namefmt, "hex", 3)) printf("Resource len %2d hex %s\n", namelen, p); else goto fail; printf("%-16s %s\n", pr_master(nodeid), pr_extra(flags, root_list, recover_list, recover_locks_count, first_lkid)); return; fail: fprintf(stderr, "print_rsb error rv %d line \"%s\"\n", rv, line); } static void print_lvb(char *line) { char lvb[1024]; char type[4]; int i, c, rv, lvblen; uint32_t lvbseq; memset(lvb, 0, 1024); rv = sscanf(line, "%s %u %d %[0-9A-Fa-f ]", type, &lvbseq, &lvblen, lvb); if (rv != 4) { fprintf(stderr, "print_lvb error rv %d line \"%s\"\n", rv, line); return; } printf("LVB len %d seq %u\n", lvblen, lvbseq); for (c = 0, i = 0; ; i++) { printf("%c", lvb[i]); if (lvb[i] != ' ') c++; if (!wide && lvb[i] == ' ' && !(c % 32)) printf("\n"); if (c == (lvblen * 2)) break; } printf("\n"); } struct lkb { uint64_t xid, timestamp, time_bast; uint32_t id, remid, exflags, flags, lvbseq; int nodeid, ownpid, status, grmode, rqmode, highbast, rsb_lookup, wait_type; }; static const char *pr_grmode(struct lkb *lkb) { if (lkb->status == DLM_LKSTS_GRANTED || lkb->status == DLM_LKSTS_CONVERT) return mode_str(lkb->grmode); else if (lkb->status == DLM_LKSTS_WAITING || lkb->rsb_lookup) return "--"; else return "XX"; } static const char *pr_rqmode(struct lkb *lkb) { static char buf[5]; memset(buf, 0, sizeof(buf)); if (lkb->status == DLM_LKSTS_GRANTED) { return " "; } else if (lkb->status == DLM_LKSTS_CONVERT || lkb->status == DLM_LKSTS_WAITING || lkb->rsb_lookup) { sprintf(buf, "(%s)", mode_str(lkb->rqmode)); return buf; } else { return "(XX)"; } } static const char *pr_remote(struct lkb *lkb, 
struct rinfo *ri) { static char buf[64]; memset(buf, 0, sizeof(buf)); if (!lkb->nodeid) { return " "; } else if (lkb->nodeid != ri->nodeid) { sprintf(buf, "Remote: %3d %08x", lkb->nodeid, lkb->remid); return buf; } else { sprintf(buf, "Master: %3d %08x", lkb->nodeid, lkb->remid); return buf; } } static const char *pr_wait(struct lkb *lkb) { static char buf[16]; memset(buf, 0, sizeof(buf)); if (!lkb->wait_type) { return " "; } else { sprintf(buf, " wait %02d", lkb->wait_type); return buf; } } static char *pr_verbose(struct lkb *lkb) { static char buf[128]; memset(buf, 0, sizeof(buf)); sprintf(buf, "time %016llu flags %08x %08x bast %d %llu pid %d", (unsigned long long)lkb->timestamp, lkb->exflags, lkb->flags, lkb->highbast, (unsigned long long)lkb->time_bast, lkb->ownpid); return buf; } static void print_lkb(char *line, struct rinfo *ri) { struct lkb lkb; char type[4]; sscanf(line, "%s %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu", type, &lkb.id, &lkb.nodeid, &lkb.remid, &lkb.ownpid, (unsigned long long *)&lkb.xid, &lkb.exflags, &lkb.flags, &lkb.status, &lkb.grmode, &lkb.rqmode, &lkb.highbast, &lkb.rsb_lookup, &lkb.wait_type, &lkb.lvbseq, (unsigned long long *)&lkb.timestamp, (unsigned long long *)&lkb.time_bast); ri->lkb_count++; if (lkb.status == DLM_LKSTS_GRANTED) { if (!ri->print_granted++) printf("Granted\n"); ri->lkb_granted++; } if (lkb.status == DLM_LKSTS_CONVERT) { if (!ri->print_convert++) printf("Convert\n"); ri->lkb_convert++; } if (lkb.status == DLM_LKSTS_WAITING) { if (!ri->print_waiting++) printf("Waiting\n"); ri->lkb_waiting++; } if (lkb.rsb_lookup) { if (!ri->print_lookup++) printf("Lookup\n"); ri->lkb_lookup++; } if (lkb.wait_type) ri->lkb_wait_msg++; if (!ri->nodeid) { if (lkb.nodeid) ri->lkb_master_copy++; else ri->lkb_local_copy++; } else { ri->lkb_process_copy++; } printf("%08x %s %s %s %s %s\n", lkb.id, pr_grmode(&lkb), pr_rqmode(&lkb), pr_remote(&lkb, ri), pr_wait(&lkb), (verbose && wide) ? 
pr_verbose(&lkb) : ""); if (verbose && !wide) printf("%s\n", pr_verbose(&lkb)); } static void print_rsb_toss(char *line) { char type[4], namefmt[4], *p; char addr[64]; char toss_time[16]; int res_nodeid, master_nodeid, dir_nodeid, our_nodeid; int rv, namelen; uint32_t flags; rv = sscanf(line, "%s %s %d %d %d %d %s %u %u %s", type, addr, &res_nodeid, &master_nodeid, &dir_nodeid, &our_nodeid, toss_time, &flags, &namelen, namefmt); if (rv != 10) goto fail; p = strchr(line, '\n'); if (!p) goto fail; *p = '\0'; p = strstr(line, namefmt); if (!p) goto fail; p += 4; strcat(addr, " "); if (!strncmp(namefmt, "str", 3)) printf("Resource len %2d \"%s\"\n", namelen, p); else if (!strncmp(namefmt, "hex", 3)) printf("Resource len %2d hex %s\n", namelen, p); else goto fail; if (master_nodeid != our_nodeid) printf("Master:%d", master_nodeid); else printf("Master"); if (dir_nodeid != our_nodeid) printf(" Dir:%d", dir_nodeid); else printf(" Dir"); if (master_nodeid == our_nodeid && res_nodeid != 0) printf(" res_nodeid %d", res_nodeid); printf("\n"); return; fail: fprintf(stderr, "print_rsb_toss error rv %d line \"%s\"\n", rv, line); } static void clear_rinfo(struct rinfo *ri) { memset(ri, 0, sizeof(struct rinfo)); ri->nodeid = -9; } static void count_rinfo(struct summary *s, struct rinfo *ri) { /* the first time called */ if (!ri->namelen) return; s->rsb_total++; if (ri->lvb) s->rsb_with_lvb++; if (!ri->lkb_count) { s->rsb_no_locks++; printf("no locks\n"); } if (!ri->nodeid) s->rsb_master++; else if (ri->nodeid == -1) s->rsb_lookup++; else if (ri->nodeid > 0) s->rsb_local++; else s->rsb_nodeid_error++; s->lkb_count += ri->lkb_count; s->lkb_granted += ri->lkb_granted; s->lkb_convert += ri->lkb_convert; s->lkb_waiting += ri->lkb_waiting; s->lkb_lookup += ri->lkb_lookup; s->lkb_wait_msg += ri->lkb_wait_msg; s->lkb_master_copy += ri->lkb_master_copy; s->lkb_local_copy += ri->lkb_local_copy; s->lkb_process_copy += ri->lkb_process_copy; } static void print_summary(struct summary *s) { printf("rsb\n"); printf(" active %u\n", s->rsb_total); printf(" master %u\n", s->rsb_master); printf(" remote master %u\n", s->rsb_local); printf(" lookup master %u\n", s->rsb_lookup); printf(" with lvb %u\n", s->rsb_with_lvb); printf(" with no locks %u\n", s->rsb_no_locks); printf(" nodeid error %u\n", s->rsb_nodeid_error); printf(" inactive %u\n", s->toss_total); printf("\n"); printf("lkb\n"); printf(" total %u\n", s->lkb_count); printf(" granted %u\n", s->lkb_granted); printf(" convert %u\n", s->lkb_convert); printf(" waiting %u\n", s->lkb_waiting); printf(" local copy %u\n", s->lkb_local_copy); printf(" master copy %u\n", s->lkb_master_copy); printf(" process copy %u\n", s->lkb_process_copy); printf(" rsb lookup %u\n", s->lkb_lookup); printf(" wait message %u\n", s->lkb_wait_msg); printf(" expect reply %u\n", s->expect_replies); } #define LOCK_LINE_MAX 1024 static void do_waiters(char *name, struct summary *sum) { FILE *file; char path[PATH_MAX]; char line[LOCK_LINE_MAX]; char rname[DLM_RESNAME_MAXLEN+1]; int header = 0; int i, j, spaces; int rv, nodeid, wait_type; uint32_t id; snprintf(path, PATH_MAX, "/sys/kernel/debug/dlm/%s_waiters", name); file = fopen(path, "r"); if (!file) return; while (fgets(line, LOCK_LINE_MAX, file)) { if (!header) { printf("\n"); printf("Expecting reply\n"); header = 1; } rv = sscanf(line, "%x %d %d", &id, &wait_type, &nodeid); if (rv != 3) { printf("waiters: %s", line); continue; } /* parse the resource name from the remainder of the line */ j = 0; spaces = 0; memset(rname, 0, sizeof(rname)); for (i 
= 0; i < LOCK_LINE_MAX; i++) { if (line[i] == '\n') break; if (spaces == 3) { rname[j++] = line[i]; if (j == (sizeof(rname) - 1)) break; } else if (line[i] == ' ') { spaces++; } } printf("nodeid %2d msg %s lkid %08x resource \"%s\"\n", nodeid, msg_str(wait_type), id, rname); sum->expect_replies++; } fclose(file); } static void do_toss(char *name, struct summary *sum) { FILE *file; char path[PATH_MAX]; char line[LOCK_LINE_MAX]; snprintf(path, PATH_MAX, "/sys/kernel/debug/dlm/%s_toss", name); file = fopen(path, "r"); if (!file) return; while (fgets(line, LOCK_LINE_MAX, file)) { if (!strncmp(line, "version", 7)) continue; if (!strncmp(line, "rsb", 3)) { print_rsb_toss(line); sum->toss_total++; printf("\n"); } } fclose(file); } static void do_lockdebug(char *name) { struct summary summary; struct rinfo info; FILE *file; char path[PATH_MAX]; char line[LOCK_LINE_MAX]; int old = 0; snprintf(path, PATH_MAX, "/sys/kernel/debug/dlm/%s_all", name); file = fopen(path, "r"); if (!file) { snprintf(path, PATH_MAX, "/sys/kernel/debug/dlm/%s", name); file = fopen(path, "r"); if (!file) { fprintf(stderr, "can't open %s: %s\n", path, strerror(errno)); return; } old = 1; } memset(&summary, 0, sizeof(struct summary)); memset(&info, 0, sizeof(struct rinfo)); /* skip file header */ if (!fgets(line, LOCK_LINE_MAX, file)) goto done; while (fgets(line, LOCK_LINE_MAX, file)) { if (old) goto raw; if (!strncmp(line, "version", 7)) continue; if (!strncmp(line, "rsb", 3)) { count_rinfo(&summary, &info); clear_rinfo(&info); printf("\n"); print_rsb(line, &info); continue; } if (!strncmp(line, "lvb", 3)) { print_lvb(line); info.lvb = 1; continue; } if (!strncmp(line, "lkb", 3)) { print_lkb(line, &info); continue; } raw: printf("%s", line); } done: count_rinfo(&summary, &info); clear_rinfo(&info); printf("\n"); fclose(file); do_toss(name, &summary); do_waiters(name, &summary); if (summarize) { printf("\n"); print_summary(&summary); } } static void parse_r_name(char *line, char *name) { char *p; int i = 0; int begin = 0; for (p = line; ; p++) { if (*p == '"') { if (begin) break; begin = 1; continue; } if (begin) name[i++] = *p; } } static void do_lockdump(char *name) { FILE *file; char path[PATH_MAX]; char line[LOCK_LINE_MAX]; char r_name[DLM_RESNAME_MAXLEN+1]; int r_nodeid; int r_len; int rv; unsigned int tm; unsigned long long xid; uint32_t id; int nodeid; uint32_t remid; int ownpid; uint32_t exflags; uint32_t flags; int8_t status; int8_t grmode; int8_t rqmode; snprintf(path, PATH_MAX, "/sys/kernel/debug/dlm/%s_locks", name); file = fopen(path, "r"); if (!file) { fprintf(stderr, "can't open %s: %s\n", path, strerror(errno)); return; } /* skip the header on the first line */ if (!fgets(line, LOCK_LINE_MAX, file)) { goto out; } while (fgets(line, LOCK_LINE_MAX, file)) { rv = sscanf(line, "%x %d %x %u %llu %x %x %hhd %hhd %hhd %u %d %d", &id, &nodeid, &remid, &ownpid, &xid, &exflags, &flags, &status, &grmode, &rqmode, &tm, &r_nodeid, &r_len); if (rv != 13) { fprintf(stderr, "invalid debugfs line %d: %s\n", rv, line); goto out; } memset(r_name, 0, sizeof(r_name)); parse_r_name(line, r_name); /* don't print MSTCPY locks without -M */ if (!r_nodeid && nodeid) { if (!dump_mstcpy) continue; printf("id %08x gr %s rq %s pid %u MSTCPY %d \"%s\"\n", id, mode_str(grmode), mode_str(rqmode), ownpid, nodeid, r_name); continue; } /* A hack because dlm-kernel doesn't set rqmode back to IV when a NOQUEUE convert fails, which means in a lockdump it looks like a granted lock is still converting since rqmode is not IV. 
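 * The assignment just below this comment works around that by
 * resetting rqmode to LKM_IVMODE for granted locks before printing.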
(does it make sense to include status in the output, e.g. G,C,W?) */ if (status == DLM_LKSTS_GRANTED) rqmode = LKM_IVMODE; printf("id %08x gr %s rq %s pid %u master %d \"%s\"\n", id, mode_str(grmode), mode_str(rqmode), ownpid, nodeid, r_name); } out: fclose(file); } static char *dlmc_lf_str(uint32_t flags) { static char str[128]; int i = 0; memset(str, 0, sizeof(str)); if (flags & DLMC_LF_SAVE_PLOCKS) { i++; strcat(str, "save_plock"); } if (flags & DLMC_LF_NEED_PLOCKS) { strcat(str, i++ ? "," : ""); strcat(str, "need_plock"); } if (flags & DLMC_LF_FS_REGISTERED) { strcat(str, i++ ? "," : ""); strcat(str, "fs_reg"); } if (flags & DLMC_LF_KERNEL_STOPPED) { strcat(str, i++ ? "," : ""); strcat(str, "kern_stop"); } if (flags & DLMC_LF_LEAVING) { strcat(str, i++ ? "," : ""); strcat(str, "leave"); } if (flags & DLMC_LF_JOINING) { strcat(str, i++ ? "," : ""); strcat(str, "join"); } return str; } static const char *check_str(struct dlmc_node *n) { static char _check_str[32]; if (n->flags & DLMC_NF_NEED_FENCING) { memset(_check_str, 0, sizeof(_check_str)); snprintf(_check_str, sizeof(_check_str) - 1, "fence %llu", (unsigned long long)n->fail_monotime); return _check_str; } if (n->flags & DLMC_NF_CHECK_FS) return "fs"; return "none"; } static const char *condition_str(int cond) { switch (cond) { case DLMC_LS_WAIT_RINGID: return "ringid"; case DLMC_LS_WAIT_QUORUM: return "quorum"; case DLMC_LS_WAIT_FENCING: return "fencing"; case DLMC_LS_WAIT_FSDONE: return "fsdone"; default: return "unknown"; } } static int node_compare(const void *va, const void *vb) { const struct dlmc_node *a = va; const struct dlmc_node *b = vb; return a->nodeid - b->nodeid; } static void show_nodeids(int count, struct dlmc_node *nodes_in) { struct dlmc_node *n = nodes_in; int i; for (i = 0; i < count; i++) { printf("%d ", n->nodeid); n++; } printf("\n"); } static void show_ls(struct dlmc_lockspace *ls) { int rv, node_count; printf("name %s\n", ls->name); printf("id 0x%08x\n", ls->global_id); printf("flags 0x%08x %s\n", ls->flags, dlmc_lf_str(ls->flags)); printf("change member %d joined %d remove %d failed %d seq %d,%d\n", ls->cg_prev.member_count, ls->cg_prev.joined_count, ls->cg_prev.remove_count, ls->cg_prev.failed_count, ls->cg_prev.combined_seq, ls->cg_prev.seq); node_count = 0; memset(&nodes, 0, sizeof(nodes)); rv = dlmc_lockspace_nodes(ls->name, DLMC_NODES_MEMBERS, MAX_NODES, &node_count, nodes); if (rv < 0) { printf("members error\n"); goto next; } qsort(nodes, node_count, sizeof(struct dlmc_node), node_compare); printf("members "); show_nodeids(node_count, nodes); next: if (!ls->cg_next.seq) return; printf("new change member %d joined %d remove %d failed %d seq %d,%d\n", ls->cg_next.member_count, ls->cg_next.joined_count, ls->cg_next.remove_count, ls->cg_next.failed_count, ls->cg_next.combined_seq, ls->cg_next.seq); if (ls->cg_next.wait_messages) printf("new status wait messages %d\n", ls->cg_next.wait_condition); else printf("new status wait %s\n", condition_str(ls->cg_next.wait_condition)); node_count = 0; memset(&nodes, 0, sizeof(nodes)); rv = dlmc_lockspace_nodes(ls->name, DLMC_NODES_NEXT, MAX_NODES, &node_count, nodes); if (rv < 0) { printf("new members error\n"); return; } qsort(nodes, node_count, sizeof(struct dlmc_node), node_compare); printf("new members "); show_nodeids(node_count, nodes); } static int member_int(struct dlmc_node *n) { if (n->flags & DLMC_NF_DISALLOWED) return -1; if (n->flags & DLMC_NF_MEMBER) return 1; return 0; } static void show_all_nodes(int count, struct dlmc_node *nodes_in) { struct 
dlmc_node *n = nodes_in; int i; for (i = 0; i < count; i++) { printf("nodeid %d member %d failed %d start %d seq_add %u seq_rem %u check %s\n", n->nodeid, member_int(n), n->fail_reason, (n->flags & DLMC_NF_START) ? 1 : 0, n->added_seq, n->removed_seq, check_str(n)); n++; } } static void do_list(char *name) { struct dlmc_lockspace *lss; struct dlmc_lockspace *ls; int node_count; int ls_count; int rv; int i; if (name) { /* get only one specific lockspace by name */ ls_count = 1; lss = malloc(sizeof(struct dlmc_lockspace)); if (!lss) exit(EXIT_FAILURE); rv = dlmc_lockspace_info(name, lss); } else { rv = dlmc_lockspaces(&ls_count, &lss); } if (rv < 0) exit(EXIT_FAILURE); /* dlm_controld probably not running */ if (ls_count) printf("dlm lockspaces\n"); for (i = 0; i < ls_count; i++) { ls = &lss[i]; show_ls(ls); if (!ls_all_nodes) goto next; node_count = 0; memset(&nodes, 0, sizeof(nodes)); rv = dlmc_lockspace_nodes(ls->name, DLMC_NODES_ALL, MAX_NODES, &node_count, nodes); if (rv < 0) { printf("all nodes error %d %d\n", rv, errno); goto next; } qsort(nodes, node_count, sizeof(struct dlmc_node),node_compare); printf("all nodes\n"); show_all_nodes(node_count, nodes); next: printf("\n"); } free(lss); } static void do_deadlock_check(char *name) { dlmc_deadlock_check(name); } static void do_fence_ack(char *name) { dlmc_fence_ack(name); } static int do_plocks(char *name) { char buf[DLMC_DUMP_SIZE]; int rv, data; memset(buf, 0, sizeof(buf)); rv = dlmc_dump_plocks(name, buf, &data); if (rv) return rv; else if (data) return data; buf[DLMC_DUMP_SIZE-1] = '\0'; do_write(STDOUT_FILENO, buf, strlen(buf)); return 0; } static int do_dump(int op) { int rv = -EOPNOTSUPP, data; char buf[DLMC_DUMP_SIZE]; memset(buf, 0, sizeof(buf)); if (op == OP_DUMP) rv = dlmc_dump_debug(buf, &data); else if (op == OP_DUMP_CONFIG) rv = dlmc_dump_config(buf, &data); else if (op == OP_DUMP_RUN) rv = dlmc_dump_run(buf, &data); if (rv) return rv; else if (data) return data; buf[DLMC_DUMP_SIZE-1] = '\0'; do_write(STDOUT_FILENO, buf, strlen(buf)); printf("\n"); return 0; } static void do_reload_config(void) { if (dlmc_reload_config() < 0) printf("reload_config failed\n"); else printf("reload_config done\n"); } static void do_set_config(void) { if (dlmc_set_config(run_command) < 0) printf("set_config failed\n"); else printf("set_config done\n"); } static int do_log_plock(void) { char buf[DLMC_DUMP_SIZE]; int rv, data; memset(buf, 0, sizeof(buf)); rv = dlmc_dump_log_plock(buf, &data); if (rv) return rv; else if (data) return data; buf[DLMC_DUMP_SIZE-1] = '\0'; do_write(STDOUT_FILENO, buf, strlen(buf)); printf("\n"); return 0; } static int do_run(int op) { int do_start = (op == OP_RUN) || (op == OP_RUN_START); int do_check = (op == OP_RUN) || (op == OP_RUN_CHECK); int do_cancel = (op == OP_RUN_CANCEL); uint32_t flags = 0; uint32_t check_status = 0; int rv = 0; if (do_start) { /* FIXME: use proper option to specify */ flags = DLMC_FLAG_RUN_START_NODE_NONE; if (ls_all_nodes) flags = DLMC_FLAG_RUN_START_NODE_RECV; rv = dlmc_run_start(run_command, strlen(run_command), 0, flags, run_uuid); printf("run_uuid: %s\n", run_uuid); } if (do_check || do_cancel) { if (do_cancel) flags = DLMC_FLAG_RUN_CHECK_CANCEL; else flags = DLMC_FLAG_RUN_CHECK_CLEAR; rv = dlmc_run_check(run_uuid, strlen(run_uuid), wait_sec, flags, &check_status); printf("check_status: "); if (check_status & DLMC_RUN_STATUS_WAITING) printf("waiting "); if (check_status & DLMC_RUN_STATUS_DONE) printf("done "); if (check_status & DLMC_RUN_STATUS_FAILED) printf("failed "); printf("\n"); 
} return rv; } int main(int argc, char **argv) { prog_name = argv[0]; decode_arguments(argc, argv); int rv = 0; switch (operation) { /* calls to libdlm; pass a command to dlm-kernel */ case OP_JOIN: do_join(lsname); break; case OP_LEAVE: do_leave(lsname); break; case OP_JOINLEAVE: do_join(lsname); do_leave(lsname); break; /* calls to libdlmcontrol; pass a command/query to dlm_controld */ case OP_LIST: do_list(lsname); break; case OP_STATUS: dlmc_print_status(verbose ? DLMC_STATUS_VERBOSE : 0); break; case OP_DUMP: rv = do_dump(operation); break; case OP_DUMP_CONFIG: rv = do_dump(operation); break; case OP_RELOAD_CONFIG: do_reload_config(); break; case OP_SET_CONFIG: do_set_config(); break; case OP_LOG_PLOCK: rv = do_log_plock(); break; case OP_PLOCKS: rv = do_plocks(lsname); break; case OP_DEADLOCK_CHECK: do_deadlock_check(lsname); break; /* calls to read debugfs; query info from dlm-kernel */ case OP_LOCKDUMP: do_lockdump(lsname); break; case OP_LOCKDEBUG: do_lockdebug(lsname); break; case OP_FENCE_ACK: do_fence_ack(lsname); break; case OP_RUN: case OP_RUN_START: case OP_RUN_CHECK: case OP_RUN_CANCEL: return do_run(operation); break; case OP_DUMP_RUN: rv = do_dump(operation); break; } if (rv < 0) { fprintf(stderr, "failed: %s\n", strerror(-rv)); return EXIT_FAILURE; } return EXIT_SUCCESS; } dlm-4.3.0/fence/000077500000000000000000000000001461150666200133615ustar00rootroot00000000000000dlm-4.3.0/fence/Makefile000066400000000000000000000026731461150666200150310ustar00rootroot00000000000000DESTDIR= PREFIX=/usr BINDIR=$(PREFIX)/sbin MANDIR=$(PREFIX)/share/man BIN_TARGET = dlm_stonith MAN_TARGET = dlm_stonith.8 BIN_SOURCE = stonith_helper.c CFLAGS += -D_GNU_SOURCE -O2 -ggdb \ -Wall -Wformat -Wformat-security -Wmissing-prototypes -Wnested-externs \ -Wpointer-arith -Wextra -Wshadow -Wcast-align -Wwrite-strings \ -Waggregate-return -Wstrict-prototypes -Winline -Wredundant-decls \ -Wno-sign-compare -Wno-unused-parameter -Wp,-D_FORTIFY_SOURCE=2 \ -fexceptions -fasynchronous-unwind-tables -fdiagnostics-show-option \ -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong \ -fstack-clash-protection CFLAGS += -fPIE -DPIE CFLAGS += -I../include PKG_CONFIG ?= pkg-config PKG_CONFIG_FLAGS := --errors-to-stdout PACEMAKER_CFLAGS := $(shell $(PKG_CONFIG) $(PKG_CONFIG_FLAGS) --cflags pacemaker-fencing) PACEMAKER_CFLAGS_STATUS := $(.SHELLSTATUS) LDFLAGS += -Wl,-z,relro -Wl,-z,now -pie LDFLAGS += -ldl all: $(BIN_TARGET) $(BIN_TARGET): $(BIN_SOURCE) ifneq ($(PACEMAKER_CFLAGS_STATUS),0) $(error "Failed to call pkg-config for pacemaker cflags: $(PACEMAKER_CFLAGS)") endif $(CC) $(BIN_SOURCE) $(CFLAGS) $(PACEMAKER_CFLAGS) $(LDFLAGS) -o $@ -L. clean: rm -f *.o *.so *.so.* $(BIN_TARGET) INSTALL=$(shell which install) .PHONY: install install: all $(INSTALL) -d $(DESTDIR)/$(BINDIR) $(INSTALL) -d $(DESTDIR)/$(MANDIR)/man8 $(INSTALL) -c -m 755 $(BIN_TARGET) $(DESTDIR)/$(BINDIR) $(INSTALL) -m 644 $(MAN_TARGET) $(DESTDIR)/$(MANDIR)/man8/ dlm-4.3.0/fence/dlm_stonith.8000066400000000000000000000015451461150666200160030ustar00rootroot00000000000000.TH DLM_STONITH 8 2013-08-01 dlm dlm .SH NAME dlm_stonith \- a proxy for fencing via stonith/pacemaker .SH SYNOPSIS .B dlm_stonith .SH DESCRIPTION dlm_controld can use dlm_stonith as a proxy fence agent when the stonith/pacemaker system performs actual fencing. dlm_stonith is run by dlm_controld, and is not meant to be run manually. dlm_controld provides the options on stdin as key=val; command line options are for testing. If fencing was successful, this program has a zero exit code. 
If fencing failed, this program has a non-zero exit code. .SH OPTIONS .BI \-n " nodeid" The nodeid of the target node. (stdin key is "node") .BI \-t " fail_time" The time(2) at which the target node failed. If this option is provided, and the last fencing time is later, then a new fencing request is not made. .SH SEE ALSO .BR dlm_controld (8), .BR dlm.conf (5) dlm-4.3.0/fence/stonith_helper.c000066400000000000000000000032161461150666200165560ustar00rootroot00000000000000/* * Copyright 2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <string.h> #include <unistd.h> #include <syslog.h> #include <crm/stonith-ng.h> int nodeid; uint64_t fail_time; #define MAX_ARG_LEN 1024 static int get_options(int argc, char *argv[]) { char arg[MAX_ARG_LEN]; char key[MAX_ARG_LEN]; char val[MAX_ARG_LEN]; int rv, c; if (argc > 1) { while ((c = getopt(argc, argv, "n:t:")) != -1) { switch (c) { case 'n': nodeid = atoi(optarg); break; case 't': fail_time = strtoull(optarg, NULL, 0); break; } } } else { while (fgets(arg, sizeof(arg), stdin)) { rv = sscanf(arg, "%[^=]=%s\n", key, val); if (rv != 2) continue; if (!strcmp(key, "node")) nodeid = atoi(val); else if (!strcmp(key, "fail_time")) fail_time = strtoull(val, NULL, 0); } } if (!nodeid) { fprintf(stderr, "no node\n"); return -1; } return 0; } int main(int argc, char *argv[]) { uint64_t t; int rv; rv = get_options(argc, argv); if (rv) return rv; if (fail_time) { t = stonith_api_time_helper(nodeid, 0); if (t >= fail_time) return 0; } rv = stonith_api_kick_helper(nodeid, 300, 0); if (rv) { fprintf(stderr, "kick_helper error %d nodeid %d\n", rv, nodeid); openlog("dlm_stonith", LOG_CONS | LOG_PID, LOG_DAEMON); syslog(LOG_ERR, "kick_helper error %d nodeid %d\n", rv, nodeid); return rv; } return 0; } dlm-4.3.0/include/000077500000000000000000000000001461150666200137245ustar00rootroot00000000000000dlm-4.3.0/include/copyright.cf000066400000000000000000000002361461150666200162470ustar00rootroot00000000000000#ifndef __COPYRIGHT_DOT_CF__ #define __COPYRIGHT_DOT_CF__ #define REDHAT_COPYRIGHT "Copyright Red Hat, Inc. 2004-2013" #endif /* __COPYRIGHT_DOT_CF__ */ dlm-4.3.0/include/version.cf000066400000000000000000000001431461150666200157210ustar00rootroot00000000000000#ifndef _RELEASE_VERSION_CF_ #define _RELEASE_VERSION_CF_ #define RELEASE_VERSION "4.3.0" #endif dlm-4.3.0/init/000077500000000000000000000000001461150666200132445ustar00rootroot00000000000000dlm-4.3.0/init/dlm.init000066400000000000000000000033601461150666200147070ustar00rootroot00000000000000#!/bin/sh # # dlm_controld # # chkconfig: 21 79 # description: starts and stops dlm_controld # ### BEGIN INIT INFO # Provides: dlm_controld # Required-Start: $network $time $syslog corosync # Required-Stop: $syslog # Should-Start: # Should-Stop: # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 # Short-Description: starts and stops dlm_controld # Description: starts and stops dlm_controld ### END INIT INFO . /etc/rc.d/init.d/functions prog="dlm_controld" progdir="cluster" lockfile="/var/run/$progdir/$prog.pid" exec="/usr/sbin/$prog" [ -f /etc/sysconfig/dlm ] && . /etc/sysconfig/dlm setup() { modprobe dlm > /dev/null 2>&1 mount -t configfs none /sys/kernel/config > /dev/null 2>&1 } start() { breakpoint="$1" [ -x $exec ] || exit 5 if [ !
-d /var/run/$progdir ]; then mkdir -p /var/run/$progdir [ -x /usr/sbin/restorecon ] && restorecon /var/run/$progdir fi setup [ "$breakpoint" = "setup" ] && exit 0 echo -n $"Starting $prog: " daemon $prog $DLM_CONTROLD_OPTS retval=$? echo [ $retval -eq 0 ] return $retval } stop() { echo -n $"Stopping $prog: " killproc -p $lockfile $prog -TERM retval=$? echo [ $retval -eq 0 ] } wait_for_stop() { while [ -e $lockfile ]; do sleep .5 done } restart() { stop wait_for_stop start } reload() { restart } rh_status() { status $prog } rh_status_q() { rh_status >/dev/null 2>&1 } case "$1" in start) rh_status_q && exit 0 $1 "$2" ;; stop) rh_status_q || exit 0 $1 ;; restart) $1 ;; reload) rh_status_q || exit 7 $1 ;; force-reload) reload ;; status) rh_status ;; condrestart|try-restart) rh_status_q || exit 0 restart ;; *) echo $"Usage: $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}" exit 2 esac exit $? dlm-4.3.0/init/dlm.service000066400000000000000000000011121461150666200153750ustar00rootroot00000000000000[Unit] Description=dlm control daemon Requires=corosync.service sys-kernel-config.mount After=corosync.service sys-kernel-config.mount Documentation=man:dlm_controld man:dlm.conf man:dlm_stonith [Service] OOMScoreAdjust=-1000 Type=notify NotifyAccess=main EnvironmentFile=/etc/sysconfig/dlm ExecStartPre=/sbin/modprobe dlm ExecStart=/usr/sbin/dlm_controld --foreground $DLM_CONTROLD_OPTS #ExecStopPost=/sbin/modprobe -r dlm # If dlm_controld doesn't stop, there are active lockspaces. # Killing it will just get the node fenced. SendSIGKILL=no [Install] WantedBy=multi-user.target dlm-4.3.0/init/dlm.sysconfig000066400000000000000000000001621461150666200157450ustar00rootroot00000000000000# DLM_CONTROLD_OPTS -- set command line options for dlm_controld # See dlm_controld man page for list of options.
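# # Example setting (an illustrative sketch only; the option name below is an # assumption -- check dlm_controld(8) for the options your version supports): # # DLM_CONTROLD_OPTS="--log_debug 1"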
dlm-4.3.0/libdlm/000077500000000000000000000000001461150666200135445ustar00rootroot00000000000000dlm-4.3.0/libdlm/51-dlm.rules000066400000000000000000000003551461150666200156220ustar00rootroot00000000000000KERNEL=="dlm-control", MODE="0666", SYMLINK+="misc/dlm-control" KERNEL=="dlm-monitor", MODE="0666", SYMLINK+="misc/dlm-monitor" KERNEL=="dlm_plock", MODE="0666", SYMLINK+="misc/dlm_plock" KERNEL=="dlm_*", MODE="0660", SYMLINK+="misc/%k" dlm-4.3.0/libdlm/Makefile000066400000000000000000000065001461150666200152050ustar00rootroot00000000000000DESTDIR= PREFIX=/usr LIBNUM=/lib64 LIBDIR=$(PREFIX)/$(LIBNUM) HDRDIR=$(PREFIX)/include MANDIR=$(PREFIX)/share/man PKGDIR=$(LIBDIR)/pkgconfig UDEVDIR=/usr/lib/udev/rules.d LIB_NAME = libdlm LIB_MAJOR = 3 LIB_MINOR = 0 LIB_O = $(LIB_NAME).o LIB_SO = $(LIB_NAME).so LIB_SMAJOR = $(LIB_SO).$(LIB_MAJOR) LIB_TARGET = $(LIB_SO).$(LIB_MAJOR).$(LIB_MINOR) LIB_PC = $(LIB_NAME).pc LIB_PCIN = $(LIB_NAME).pc.in LLT_NAME = libdlm_lt LLT_MAJOR = 3 LLT_MINOR = 0 LLT_O = $(LLT_NAME).o LLT_SO = $(LLT_NAME).so LLT_SMAJOR = $(LLT_SO).$(LLT_MAJOR) LLT_TARGET = $(LLT_SO).$(LLT_MAJOR).$(LLT_MINOR) LLT_PC = $(LLT_NAME).pc LLT_PCIN = $(LLT_NAME).pc.in HDR_TARGET = libdlm.h MAN_TARGET = \ man/dlm_cleanup.3 \ man/dlm_close_lockspace.3 \ man/dlm_create_lockspace.3 \ man/dlm_dispatch.3 \ man/dlm_get_fd.3 \ man/dlm_lock.3 \ man/dlm_lock_wait.3 \ man/dlm_ls_lock.3 \ man/dlm_ls_lock_wait.3 \ man/dlm_ls_lockx.3 \ man/dlm_ls_pthread_init.3 \ man/dlm_ls_unlock.3 \ man/dlm_ls_unlock_wait.3 \ man/dlm_new_lockspace.3 \ man/dlm_open_lockspace.3 \ man/dlm_pthread_init.3 \ man/dlm_release_lockspace.3 \ man/dlm_unlock.3 \ man/dlm_unlock_wait.3 \ man/libdlm.3 UDEV_TARGET = 51-dlm.rules SOURCE = libdlm.c CFLAGS += -D_GNU_SOURCE -O2 -ggdb \ -Wall \ -Wformat \ -Wformat-security \ -Wmissing-prototypes \ -Wnested-externs \ -Wpointer-arith \ -Wextra -Wshadow \ -Wcast-align \ -Wwrite-strings \ -Waggregate-return \ -Wstrict-prototypes \ -Winline \ -Wredundant-decls \ -Wno-sign-compare \ -Wno-unused-parameter \ -Wp,-D_FORTIFY_SOURCE=2 \ -Wp,-D_GLIBCXX_ASSERTIONS \ -fstack-protector-strong \ -fstack-clash-protection \ -fexceptions \ -fasynchronous-unwind-tables \ -fdiagnostics-show-option \ -fPIC LIB_CFLAGS += $(CFLAGS) -D_REENTRANT LLT_CFLAGS += $(CFLAGS) LIB_LDFLAGS += $(LDFLAGS) -lpthread -Wl,-z,now LLT_LDFLAGS += $(LDFLAGS) -Wl,-z,now all: $(LIB_TARGET) $(LLT_TARGET) $(LIB_PC) $(LLT_PC) $(LIB_O): $(SOURCE) $(CC) $< $(LIB_CFLAGS) -c -o $@ $(LLT_O): $(SOURCE) $(CC) $< $(LLT_CFLAGS) -c -o $@ $(LIB_TARGET): $(LIB_O) $(CC) $^ $(LIB_LDFLAGS) -shared -o $@ -Wl,-soname=$(LIB_SMAJOR) ln -sf $(LIB_TARGET) $(LIB_SO) ln -sf $(LIB_TARGET) $(LIB_SMAJOR) $(LLT_TARGET): $(LLT_O) $(CC) $^ $(LLT_LDFLAGS) -shared -o $@ -Wl,-soname=$(LLT_SMAJOR) ln -sf $(LLT_TARGET) $(LLT_SO) ln -sf $(LLT_TARGET) $(LLT_SMAJOR) $(LIB_PC): $(LIB_PCIN) cat $(LIB_PCIN) | sed -e 's#@PREFIX@#$(PREFIX)#g;s#@LIBDIR@#$(LIBDIR)#g' > $@ $(LLT_PC): $(LLT_PCIN) cat $(LLT_PCIN) | sed -e 's#@PREFIX@#$(PREFIX)#g;s#@LIBDIR@#$(LIBDIR)#g' > $@ clean: rm -f *.o *.so *.so.* *.a *.pc INSTALL=$(shell which install) .PHONY: install install: all $(INSTALL) -d $(DESTDIR)/$(LIBDIR) $(INSTALL) -d $(DESTDIR)/$(HDRDIR) $(INSTALL) -d $(DESTDIR)/$(MANDIR)/man3 $(INSTALL) -d $(DESTDIR)/$(PKGDIR) $(INSTALL) -d $(DESTDIR)/$(UDEVDIR) $(INSTALL) -c -m 755 $(LIB_TARGET) $(DESTDIR)/$(LIBDIR) $(INSTALL) -c -m 755 $(LLT_TARGET) $(DESTDIR)/$(LIBDIR) cp -a $(LIB_SO) $(DESTDIR)/$(LIBDIR) cp -a $(LIB_SMAJOR) $(DESTDIR)/$(LIBDIR) cp -a $(LLT_SO) $(DESTDIR)/$(LIBDIR) cp -a 
$(LLT_SMAJOR) $(DESTDIR)/$(LIBDIR) $(INSTALL) -m 644 $(LIB_PC) $(DESTDIR)/$(PKGDIR) $(INSTALL) -m 644 $(LLT_PC) $(DESTDIR)/$(PKGDIR) $(INSTALL) -c -m 644 $(HDR_TARGET) $(DESTDIR)/$(HDRDIR) $(INSTALL) -m 644 $(MAN_TARGET) $(DESTDIR)/$(MANDIR)/man3/ $(INSTALL) -m 644 $(UDEV_TARGET) $(DESTDIR)/$(UDEVDIR) dlm-4.3.0/libdlm/libdlm.c000066400000000000000000000766241461150666200151660ustar00rootroot00000000000000/* * Copyright 2004-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifdef _REENTRANT #include <pthread.h> #endif #include <sys/types.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/socket.h> #include <sys/time.h> #include <sys/un.h> #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <string.h> #include <errno.h> #include <fcntl.h> #include <dirent.h> #ifdef HAVE_SELINUX #include <selinux/selinux.h> #endif #include <linux/major.h> #include <linux/dlm.h> #define BUILDING_LIBDLM #include "libdlm.h" #include <linux/dlm_device.h> #define MISC_PREFIX "/dev/misc/" #define DLM_PREFIX "dlm_" #define DLM_MISC_PREFIX MISC_PREFIX DLM_PREFIX #define DLM_CONTROL_NAME "dlm-control" #define DLM_CONTROL_PATH MISC_PREFIX DLM_CONTROL_NAME #define DEFAULT_LOCKSPACE "default" /* * V5 of the dlm_device.h kernel/user interface structs */ struct dlm_lock_params_v5 { __u8 mode; __u8 namelen; __u16 flags; __u32 lkid; __u32 parent; void *castparam; void *castaddr; void *bastparam; void *bastaddr; struct dlm_lksb *lksb; char lvb[DLM_USER_LVB_LEN]; char name[0]; }; struct dlm_write_request_v5 { __u32 version[3]; __u8 cmd; __u8 is64bit; __u8 unused[2]; union { struct dlm_lock_params_v5 lock; struct dlm_lspace_params lspace; } i; }; struct dlm_lock_result_v5 { __u32 length; void *user_astaddr; void *user_astparam; struct dlm_lksb *user_lksb; struct dlm_lksb lksb; __u8 bast_mode; __u8 unused[3]; /* Offsets may be zero if no data is present */ __u32 lvb_offset; }; /* * One of these per lockspace in use by the application */ struct dlm_ls_info { int fd; #ifdef _REENTRANT pthread_t tid; #else int tid; #endif }; /* * The default lockspace. * I've resisted putting locking around this as the user should be * "sensible" and only do lockspace operations either in the * main thread or ... carefully... */ static struct dlm_ls_info *default_ls = NULL; static int control_fd = -1; static struct dlm_device_version kernel_version; static int kernel_version_detected = 0; static int release_lockspace(uint32_t minor, uint32_t flags); static void ls_dev_name(const char *lsname, char *devname, int devlen) { snprintf(devname, devlen, DLM_MISC_PREFIX "%s", lsname); } static void dummy_ast_routine(void *arg) { } #ifdef _REENTRANT /* Used for the synchronous and "simplified, synchronous" API routines */ struct lock_wait { pthread_cond_t cond; pthread_mutex_t mutex; struct dlm_lksb lksb; }; static void sync_ast_routine(void *arg) { struct lock_wait *lwait = arg; pthread_mutex_lock(&lwait->mutex); pthread_cond_signal(&lwait->cond); pthread_mutex_unlock(&lwait->mutex); } /* lock_resource & unlock_resource * are the simplified, synchronous API. * Always uses the default lockspace.
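* * An illustrative sketch of this simplified API (error handling is * abbreviated and the resource name "res1" is just an example): * * int id; * if (!lock_resource("res1", LKM_EXMODE, 0, &id)) { * ... "res1" is now held in EX mode ... * unlock_resource(id); * }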
*/ int lock_resource(const char *resource, int mode, int flags, int *lockid) { int status; struct lock_wait lwait; if (default_ls == NULL) { if (dlm_pthread_init()) { return -1; } } if (!lockid) { errno = EINVAL; return -1; } /* Conversions need the lockid in the LKSB */ if (flags & LKF_CONVERT) lwait.lksb.sb_lkid = *lockid; pthread_cond_init(&lwait.cond, NULL); pthread_mutex_init(&lwait.mutex, NULL); pthread_mutex_lock(&lwait.mutex); status = dlm_lock(mode, &lwait.lksb, flags, resource, strlen(resource), 0, sync_ast_routine, &lwait, NULL, NULL); if (status) return status; /* Wait for it to complete */ pthread_cond_wait(&lwait.cond, &lwait.mutex); pthread_mutex_unlock(&lwait.mutex); *lockid = lwait.lksb.sb_lkid; errno = lwait.lksb.sb_status; if (lwait.lksb.sb_status) return -1; else return 0; } int unlock_resource(int lockid) { int status; struct lock_wait lwait; if (default_ls == NULL) { errno = ENOTCONN; return -1; } pthread_cond_init(&lwait.cond, NULL); pthread_mutex_init(&lwait.mutex, NULL); pthread_mutex_lock(&lwait.mutex); status = dlm_unlock(lockid, 0, &lwait.lksb, &lwait); if (status) return status; /* Wait for it to complete */ pthread_cond_wait(&lwait.cond, &lwait.mutex); pthread_mutex_unlock(&lwait.mutex); errno = lwait.lksb.sb_status; if (lwait.lksb.sb_status != DLM_EUNLOCK) return -1; else return 0; } /* Tidy up threads after a lockspace is closed */ static int ls_pthread_cleanup(struct dlm_ls_info *lsinfo) { int status = 0; int fd; /* Must close the fd after the thread has finished */ fd = lsinfo->fd; if (lsinfo->tid) { status = pthread_cancel(lsinfo->tid); if (!status) pthread_join(lsinfo->tid, NULL); } if (!status) { free(lsinfo); close(fd); } return status; } /* Cleanup default lockspace */ int dlm_pthread_cleanup(void) { struct dlm_ls_info *lsinfo = default_ls; /* Protect users from their own stupidity */ if (!lsinfo) return 0; default_ls = NULL; return ls_pthread_cleanup(lsinfo); } #else /* Non-pthread version of cleanup */ static int ls_pthread_cleanup(struct dlm_ls_info *lsinfo) { close(lsinfo->fd); free(lsinfo); return 0; } #endif static void set_version_v5(struct dlm_write_request_v5 *req) { req->version[0] = kernel_version.version[0]; req->version[1] = kernel_version.version[1]; req->version[2] = kernel_version.version[2]; if (sizeof(long) == sizeof(long long)) req->is64bit = 1; else req->is64bit = 0; } static void set_version_v6(struct dlm_write_request *req) { req->version[0] = kernel_version.version[0]; req->version[1] = kernel_version.version[1]; req->version[2] = kernel_version.version[2]; if (sizeof(long) == sizeof(long long)) req->is64bit = 1; else req->is64bit = 0; } static int open_default_lockspace(void) { if (!default_ls) { dlm_lshandle_t ls; /* This isn't the race it looks, create_lockspace will * do the right thing if the lockspace has already been * created.
*/ ls = dlm_open_lockspace(DEFAULT_LOCKSPACE); if (!ls) ls = dlm_create_lockspace(DEFAULT_LOCKSPACE, 0600); if (!ls) return -1; default_ls = (struct dlm_ls_info *)ls; } return 0; } static void detect_kernel_version(void) { struct dlm_device_version v; int rv; rv = read(control_fd, &v, sizeof(struct dlm_device_version)); if (rv < 0) { kernel_version.version[0] = 5; kernel_version.version[1] = 0; kernel_version.version[2] = 0; } else { kernel_version.version[0] = v.version[0]; kernel_version.version[1] = v.version[1]; kernel_version.version[2] = v.version[2]; } kernel_version_detected = 1; } static int find_control_minor(int *minor) { FILE *f; char name[256]; int found = 0, m = 0; f = fopen("/proc/misc", "r"); if (!f) return -1; while (!feof(f)) { if (fscanf(f, "%d %s", &m, name) != 2) continue; if (strcmp(name, DLM_CONTROL_NAME)) continue; found = 1; break; } fclose(f); if (found) { *minor = m; return 0; } return -1; } static int open_control_device(void) { struct stat st; int i, rv, minor, found = 0; if (control_fd > -1) goto out; rv = find_control_minor(&minor); if (rv < 0) return -1; /* wait for udev to create the device */ for (i = 0; i < 10; i++) { if (stat(DLM_CONTROL_PATH, &st) == 0 && minor(st.st_rdev) == minor) { found = 1; break; } sleep(1); continue; } if (!found) return -1; control_fd = open(DLM_CONTROL_PATH, O_RDWR); if (control_fd == -1) return -1; out: fcntl(control_fd, F_SETFD, 1); if (!kernel_version_detected) detect_kernel_version(); return 0; } /* the max number of characters in a sysfs device name, not including \0 */ #define MAX_SYSFS_NAME 19 static int find_udev_device(const char *lockspace, int minor, char *udev_path) { char bname[PATH_MAX]; char tmp_path[PATH_MAX]; DIR *d; struct dirent *de; struct stat st; size_t basename_len; int i; ls_dev_name(lockspace, udev_path, PATH_MAX); snprintf(bname, PATH_MAX, DLM_PREFIX "%s", lockspace); basename_len = strlen(bname); for (i = 0; i < 10; i++) { /* look for a device with the full name */ if (stat(udev_path, &st) == 0 && minor(st.st_rdev) == minor) return 0; if (basename_len < MAX_SYSFS_NAME) { sleep(1); continue; } /* look for a device with a truncated name */ d = opendir(MISC_PREFIX); while ((de = readdir(d))) { if (de->d_name[0] == '.') continue; if (strlen(de->d_name) < MAX_SYSFS_NAME) continue; if (strncmp(de->d_name, bname, MAX_SYSFS_NAME)) continue; snprintf(tmp_path, PATH_MAX, MISC_PREFIX "%s", de->d_name); if (stat(tmp_path, &st)) continue; if (minor(st.st_rdev) != minor) continue; /* truncated name */ strncpy(udev_path, tmp_path, PATH_MAX); closedir(d); return 0; } closedir(d); sleep(1); } return -1; } /* * do_dlm_dispatch() * Read an ast from the kernel. */ static int do_dlm_dispatch_v5(int fd) { char resultbuf[sizeof(struct dlm_lock_result_v5) + DLM_USER_LVB_LEN]; struct dlm_lock_result_v5 *result = (struct dlm_lock_result_v5 *)resultbuf; char *fullresult = NULL; int status; void (*astaddr)(void *astarg); status = read(fd, result, sizeof(resultbuf)); if (status <= 0) return -1; /* This shouldn't happen any more, can probably be removed */ if (result->length != status) { int newstat; fullresult = malloc(result->length); if (!fullresult) return -1; newstat = read(fd, (struct dlm_lock_result_v5 *)fullresult, result->length); /* If it read OK then use the new data. 
otherwise we can still deliver the AST, it just might not have all the info in it...hmmm */ if (newstat == result->length) result = (struct dlm_lock_result_v5 *)fullresult; } else { fullresult = resultbuf; } /* Copy lksb to user's buffer - except the LVB ptr */ memcpy(result->user_lksb, &result->lksb, sizeof(struct dlm_lksb) - sizeof(char*)); /* Flip the status. Kernel space likes negative return codes, userspace positive ones */ result->user_lksb->sb_status = -result->user_lksb->sb_status; /* Copy optional items */ if (result->lvb_offset) memcpy(result->user_lksb->sb_lvbptr, fullresult + result->lvb_offset, DLM_LVB_LEN); /* Call AST */ if (result->user_astaddr) { astaddr = result->user_astaddr; astaddr(result->user_astparam); } if (fullresult != resultbuf) free(fullresult); return 0; } static int do_dlm_dispatch_v6(int fd) { char resultbuf[sizeof(struct dlm_lock_result) + DLM_USER_LVB_LEN]; struct dlm_lock_result *result = (struct dlm_lock_result *)resultbuf; int status; void (*astaddr)(void *astarg); status = read(fd, result, sizeof(resultbuf)); if (status <= 0) return -1; /* Copy lksb to user's buffer - except the LVB ptr */ memcpy(result->user_lksb, &result->lksb, sizeof(struct dlm_lksb) - sizeof(char*)); /* Copy lvb to user's buffer */ if (result->lvb_offset) memcpy(result->user_lksb->sb_lvbptr, (char *)result + result->lvb_offset, DLM_LVB_LEN); result->user_lksb->sb_status = -result->user_lksb->sb_status; if (result->user_astaddr) { astaddr = result->user_astaddr; astaddr(result->user_astparam); } return 0; } static int do_dlm_dispatch(int fd) { if (kernel_version.version[0] == 5) return do_dlm_dispatch_v5(fd); else return do_dlm_dispatch_v6(fd); } /* * sync_write() * Helper routine which supports the synchronous DLM calls. This * writes a parameter block down to the DLM and waits for the * operation to complete. This hides the different completion mechanism * used when called from the main thread or the DLM 'AST' thread. 
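* (When these routines are invoked from the AST thread itself, blocking on a * condition variable would deadlock the thread that must deliver the wakeup, * so the pthread_self() check below instead completes the request by * dispatching ASTs inline.)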
*/ #ifdef _REENTRANT static int sync_write_v5(struct dlm_ls_info *lsinfo, struct dlm_write_request_v5 *req, int len) { struct lock_wait lwait; int status; if (pthread_self() == lsinfo->tid) { /* This is the DLM worker thread, don't use lwait to sync */ req->i.lock.castaddr = dummy_ast_routine; req->i.lock.castparam = NULL; status = write(lsinfo->fd, req, len); if (status < 0) return -1; while (req->i.lock.lksb->sb_status == EINPROG) { do_dlm_dispatch_v5(lsinfo->fd); } } else { pthread_cond_init(&lwait.cond, NULL); pthread_mutex_init(&lwait.mutex, NULL); pthread_mutex_lock(&lwait.mutex); req->i.lock.castaddr = sync_ast_routine; req->i.lock.castparam = &lwait; status = write(lsinfo->fd, req, len); if (status < 0) return -1; pthread_cond_wait(&lwait.cond, &lwait.mutex); pthread_mutex_unlock(&lwait.mutex); } return status; /* lock status is in the lksb */ } static int sync_write_v6(struct dlm_ls_info *lsinfo, struct dlm_write_request *req, int len) { struct lock_wait lwait; int status; if (pthread_self() == lsinfo->tid) { /* This is the DLM worker thread, don't use lwait to sync */ req->i.lock.castaddr = dummy_ast_routine; req->i.lock.castparam = NULL; status = write(lsinfo->fd, req, len); if (status < 0) return -1; while (req->i.lock.lksb->sb_status == EINPROG) { do_dlm_dispatch_v6(lsinfo->fd); } } else { pthread_cond_init(&lwait.cond, NULL); pthread_mutex_init(&lwait.mutex, NULL); pthread_mutex_lock(&lwait.mutex); req->i.lock.castaddr = sync_ast_routine; req->i.lock.castparam = &lwait; status = write(lsinfo->fd, req, len); if (status < 0) return -1; pthread_cond_wait(&lwait.cond, &lwait.mutex); pthread_mutex_unlock(&lwait.mutex); } return status; /* lock status is in the lksb */ } #else /* _REENTRANT */ static int sync_write_v5(struct dlm_ls_info *lsinfo, struct dlm_write_request_v5 *req, int len) { int status; req->i.lock.castaddr = dummy_ast_routine; req->i.lock.castparam = NULL; status = write(lsinfo->fd, req, len); if (status < 0) return -1; while (req->i.lock.lksb->sb_status == EINPROG) { do_dlm_dispatch_v5(lsinfo->fd); } errno = req->i.lock.lksb->sb_status; if (errno && errno != EUNLOCK) return -1; return 0; } static int sync_write_v6(struct dlm_ls_info *lsinfo, struct dlm_write_request *req, int len) { int status; req->i.lock.castaddr = dummy_ast_routine; req->i.lock.castparam = NULL; status = write(lsinfo->fd, req, len); if (status < 0) return -1; while (req->i.lock.lksb->sb_status == EINPROG) { do_dlm_dispatch_v6(lsinfo->fd); } errno = req->i.lock.lksb->sb_status; if (errno && errno != EUNLOCK) return -1; return 0; } #endif /* _REENTRANT */ /* * Lock * All the ways to request/convert a lock */ static int ls_lock_v5(dlm_lshandle_t ls, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg)) { char parambuf[sizeof(struct dlm_write_request_v5) + DLM_RESNAME_MAXLEN]; struct dlm_write_request_v5 *req = (struct dlm_write_request_v5 *)parambuf; struct dlm_ls_info *lsinfo = (struct dlm_ls_info *)ls; int status; int len; memset(req, 0, sizeof(*req)); set_version_v5(req); req->cmd = DLM_USER_LOCK; req->i.lock.mode = mode; req->i.lock.flags = (flags & ~LKF_WAIT); req->i.lock.lkid = lksb->sb_lkid; req->i.lock.parent = parent; req->i.lock.lksb = lksb; req->i.lock.castaddr = astaddr; req->i.lock.bastaddr = bastaddr; req->i.lock.castparam = astarg; /* same comp and blocking ast arg */ req->i.lock.bastparam = astarg; if (flags & LKF_CONVERT) { req->i.lock.namelen = 0; 
} else { if (namelen > DLM_RESNAME_MAXLEN) { errno = EINVAL; return -1; } req->i.lock.namelen = namelen; memcpy(req->i.lock.name, name, namelen); } if (flags & LKF_VALBLK) { memcpy(req->i.lock.lvb, lksb->sb_lvbptr, DLM_LVB_LEN); } len = sizeof(struct dlm_write_request_v5) + namelen; lksb->sb_status = EINPROG; if (flags & LKF_WAIT) status = sync_write_v5(lsinfo, req, len); else status = write(lsinfo->fd, req, len); if (status < 0) return -1; /* * the lock id is the return value from the write on the device */ if (status > 0) lksb->sb_lkid = status; return 0; } static int ls_lock_v6(dlm_lshandle_t ls, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), uint64_t *xid, uint64_t *timeout) { char parambuf[sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN]; struct dlm_write_request *req = (struct dlm_write_request *)parambuf; struct dlm_ls_info *lsinfo = (struct dlm_ls_info *)ls; int status; int len; memset(req, 0, sizeof(*req)); set_version_v6(req); req->cmd = DLM_USER_LOCK; req->i.lock.mode = mode; req->i.lock.flags = (flags & ~LKF_WAIT); req->i.lock.lkid = lksb->sb_lkid; req->i.lock.parent = parent; req->i.lock.lksb = lksb; req->i.lock.castaddr = astaddr; req->i.lock.bastaddr = bastaddr; req->i.lock.castparam = astarg; /* same comp and blocking ast arg */ req->i.lock.bastparam = astarg; if (xid) req->i.lock.xid = *xid; if (flags & LKF_CONVERT) { req->i.lock.namelen = 0; } else { if (namelen > DLM_RESNAME_MAXLEN) { errno = EINVAL; return -1; } req->i.lock.namelen = namelen; memcpy(req->i.lock.name, name, namelen); } if (flags & LKF_VALBLK) { memcpy(req->i.lock.lvb, lksb->sb_lvbptr, DLM_LVB_LEN); } len = sizeof(struct dlm_write_request) + namelen; lksb->sb_status = EINPROG; if (flags & LKF_WAIT) status = sync_write_v6(lsinfo, req, len); else status = write(lsinfo->fd, req, len); if (status < 0) return -1; /* * the lock id is the return value from the write on the device */ if (status > 0) lksb->sb_lkid = status; return 0; } static int ls_lock(dlm_lshandle_t ls, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), void *range) { /* no support for range locks */ if (range) { errno = ENOSYS; return -1; } if (flags & LKF_VALBLK && !lksb->sb_lvbptr) { errno = EINVAL; return -1; } if (kernel_version.version[0] == 5) return ls_lock_v5(ls, mode, lksb, flags, name, namelen, parent, astaddr, astarg, bastaddr); else return ls_lock_v6(ls, mode, lksb, flags, name, namelen, parent, astaddr, astarg, bastaddr, NULL, NULL); } /* * Extended async locking in own lockspace */ int dlm_ls_lockx(dlm_lshandle_t ls, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), uint64_t *xid, uint64_t *timeout) { if (kernel_version.version[0] < 6) { errno = ENOSYS; return -1; } return ls_lock_v6(ls, mode, lksb, flags, name, namelen, parent, astaddr, astarg, bastaddr, xid, timeout); } /* * Async locking in own lockspace */ int dlm_ls_lock(dlm_lshandle_t ls, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), void *range) { return ls_lock(ls, mode, lksb, flags, name, namelen, parent, 
astaddr, astarg, bastaddr, range); } /* * Sync locking in own lockspace */ int dlm_ls_lock_wait(dlm_lshandle_t ls, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, void *bastarg, void (*bastaddr) (void *bastarg), void *range) { return ls_lock(ls, mode, lksb, flags | LKF_WAIT, name, namelen, parent, NULL, bastarg, bastaddr, range); } /* * Async locking in the default lockspace */ int dlm_lock(uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), void *range) { if (open_default_lockspace()) return -1; return ls_lock(default_ls, mode, lksb, flags, name, namelen, parent, astaddr, astarg, bastaddr, range); } /* * Sync locking in the default lockspace */ int dlm_lock_wait(uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, void *bastarg, void (*bastaddr) (void *bastarg), void *range) { if (open_default_lockspace()) return -1; return ls_lock(default_ls, mode, lksb, flags | LKF_WAIT, name, namelen, parent, NULL, bastarg, bastaddr, range); } /* * Unlock * All the ways to unlock/cancel a lock */ static int ls_unlock_v5(struct dlm_ls_info *lsinfo, uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb, void *astarg) { struct dlm_write_request_v5 req; set_version_v5(&req); req.cmd = DLM_USER_UNLOCK; req.i.lock.lkid = lkid; req.i.lock.flags = (flags & ~LKF_WAIT); req.i.lock.lksb = lksb; req.i.lock.castparam = astarg; /* DLM_USER_UNLOCK will default to existing completion AST */ req.i.lock.castaddr = 0; lksb->sb_status = EINPROG; if (flags & LKF_WAIT) return sync_write_v5(lsinfo, &req, sizeof(req)); else return write(lsinfo->fd, &req, sizeof(req)); } static int ls_unlock_v6(struct dlm_ls_info *lsinfo, uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb, void *astarg) { struct dlm_write_request req; set_version_v6(&req); req.cmd = DLM_USER_UNLOCK; req.i.lock.lkid = lkid; req.i.lock.flags = (flags & ~LKF_WAIT); req.i.lock.lksb = lksb; req.i.lock.namelen = 0; req.i.lock.castparam = astarg; /* DLM_USER_UNLOCK will default to existing completion AST */ req.i.lock.castaddr = 0; lksb->sb_status = EINPROG; if (flags & LKF_WAIT) return sync_write_v6(lsinfo, &req, sizeof(req)); else return write(lsinfo->fd, &req, sizeof(req)); } int dlm_ls_unlock(dlm_lshandle_t ls, uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb, void *astarg) { struct dlm_ls_info *lsinfo = (struct dlm_ls_info *)ls; int status; if (ls == NULL) { errno = ENOTCONN; return -1; } if (!lkid) { errno = EINVAL; return -1; } if (kernel_version.version[0] == 5) status = ls_unlock_v5(lsinfo, lkid, flags, lksb, astarg); else status = ls_unlock_v6(lsinfo, lkid, flags, lksb, astarg); if (status < 0) return -1; return 0; } int dlm_ls_unlock_wait(dlm_lshandle_t ls, uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb) { return dlm_ls_unlock(ls, lkid, flags | LKF_WAIT, lksb, NULL); } int dlm_unlock_wait(uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb) { return dlm_ls_unlock_wait(default_ls, lkid, flags | LKF_WAIT, lksb); } int dlm_unlock(uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb, void *astarg) { return dlm_ls_unlock(default_ls, lkid, flags, lksb, astarg); } int dlm_ls_deadlock_cancel(dlm_lshandle_t ls, uint32_t lkid, uint32_t flags) { struct dlm_ls_info *lsinfo = (struct dlm_ls_info *)ls; struct dlm_write_request req; if (kernel_version.version[0] < 6) { errno = ENOSYS; return -1; } if (ls 
== NULL) { errno = ENOTCONN; return -1; } if (!lkid) { errno = EINVAL; return -1; } set_version_v6(&req); req.cmd = DLM_USER_DEADLOCK; req.i.lock.lkid = lkid; req.i.lock.flags = flags; return write(lsinfo->fd, &req, sizeof(req)); } /* * Purge * Clear away orphan locks */ int dlm_ls_purge(dlm_lshandle_t ls, int nodeid, int pid) { struct dlm_write_request req; struct dlm_ls_info *lsinfo = (struct dlm_ls_info *)ls; int status; if (kernel_version.version[0] < 6) { errno = ENOSYS; return -1; } if (ls == NULL) { errno = ENOTCONN; return -1; } set_version_v6(&req); req.cmd = DLM_USER_PURGE; req.i.purge.nodeid = nodeid; req.i.purge.pid = pid; status = write(lsinfo->fd, &req, sizeof(req)); if (status < 0) return -1; return 0; } /* These two routines are for users that want to * do their own fd handling. * This allows a non-threaded app to use the DLM. */ int dlm_get_fd(void) { if (default_ls) { return default_ls->fd; } else { if (open_default_lockspace()) return -1; else return default_ls->fd; } } int dlm_dispatch(int fd) { int status; int fdflags; fdflags = fcntl(fd, F_GETFL, 0); fcntl(fd, F_SETFL, fdflags | O_NONBLOCK); do { status = do_dlm_dispatch(fd); } while (status == 0); /* EAGAIN is not an error */ if (status < 0 && errno == EAGAIN) status = 0; fcntl(fd, F_SETFL, fdflags); return status; } /* Converts a lockspace handle into a file descriptor */ int dlm_ls_get_fd(dlm_lshandle_t lockspace) { struct dlm_ls_info *lsinfo = (struct dlm_ls_info *)lockspace; return lsinfo->fd; } #ifdef _REENTRANT static void *dlm_recv_thread(void *lsinfo) { struct dlm_ls_info *lsi = lsinfo; for (;;) do_dlm_dispatch(lsi->fd); return NULL; } /* Multi-threaded callers normally use this */ int dlm_pthread_init(void) { if (open_default_lockspace()) return -1; if (default_ls->tid) { errno = EEXIST; return -1; } if (pthread_create(&default_ls->tid, NULL, dlm_recv_thread, default_ls)) { int saved_errno = errno; close(default_ls->fd); free(default_ls); default_ls = NULL; errno = saved_errno; return -1; } return 0; } /* And same, for those with their own lockspace */ int dlm_ls_pthread_init(dlm_lshandle_t ls) { struct dlm_ls_info *lsinfo = (struct dlm_ls_info *)ls; if (lsinfo->tid) { errno = EEXIST; return -1; } return pthread_create(&lsinfo->tid, NULL, dlm_recv_thread, (void *)ls); } #endif /* * Lockspace manipulation functions * Privileged users (checked by the kernel) can create/release lockspaces */ static int create_lockspace_v5(const char *name, uint32_t flags) { char reqbuf[sizeof(struct dlm_write_request_v5) + DLM_LOCKSPACE_LEN]; struct dlm_write_request_v5 *req = (struct dlm_write_request_v5 *)reqbuf; int namelen = strlen(name); int minor; memset(reqbuf, 0, sizeof(reqbuf)); set_version_v5(req); req->cmd = DLM_USER_CREATE_LOCKSPACE; req->i.lspace.flags = flags; if (namelen > DLM_LOCKSPACE_LEN) { errno = EINVAL; return -1; } memcpy(req->i.lspace.name, name, namelen); minor = write(control_fd, req, sizeof(*req) + namelen); return minor; } static int create_lockspace_v6(const char *name, uint32_t flags) { char reqbuf[sizeof(struct dlm_write_request) + DLM_LOCKSPACE_LEN]; struct dlm_write_request *req = (struct dlm_write_request *)reqbuf; int namelen = strlen(name); int minor; memset(reqbuf, 0, sizeof(reqbuf)); set_version_v6(req); req->cmd = DLM_USER_CREATE_LOCKSPACE; req->i.lspace.flags = flags; if (namelen > DLM_LOCKSPACE_LEN) { errno = EINVAL; return -1; } memcpy(req->i.lspace.name, name, namelen); minor = write(control_fd, req, sizeof(*req) + namelen); return minor; } static dlm_lshandle_t create_lockspace(const
char *name, mode_t mode, uint32_t flags) { char dev_path[PATH_MAX]; char udev_path[PATH_MAX]; struct dlm_ls_info *newls; int error, saved_errno, minor; /* We use the control device for creating lockspaces. */ if (open_control_device()) return NULL; newls = malloc(sizeof(struct dlm_ls_info)); if (!newls) return NULL; ls_dev_name(name, dev_path, sizeof(dev_path)); if (kernel_version.version[0] == 5) minor = create_lockspace_v5(name, flags); else minor = create_lockspace_v6(name, flags); if (minor < 0) goto fail; /* Wait for udev to create the device; the device it creates may have a truncated name due to the sysfs device name limit. */ error = find_udev_device(name, minor, udev_path); if (error) goto fail; /* If the symlink already exists, find_udev_device() will return it and we'll skip this. */ if (strcmp(dev_path, udev_path)) { error = symlink(udev_path, dev_path); if (error) goto fail; } /* Open it and return the struct as a handle */ newls->fd = open(dev_path, O_RDWR); if (newls->fd == -1) goto fail; if (mode) fchmod(newls->fd, mode); newls->tid = 0; fcntl(newls->fd, F_SETFD, 1); return (dlm_lshandle_t)newls; fail: saved_errno = errno; free(newls); errno = saved_errno; return NULL; } dlm_lshandle_t dlm_new_lockspace(const char *name, mode_t mode, uint32_t flags) { flags &= ~DLM_LSFL_TIMEWARN; return create_lockspace(name, mode, flags); } dlm_lshandle_t dlm_create_lockspace(const char *name, mode_t mode) { return create_lockspace(name, mode, 0); } static int release_lockspace_v5(uint32_t minor, uint32_t flags) { struct dlm_write_request_v5 req; set_version_v5(&req); req.cmd = DLM_USER_REMOVE_LOCKSPACE; req.i.lspace.minor = minor; req.i.lspace.flags = flags; return write(control_fd, &req, sizeof(req)); } static int release_lockspace_v6(uint32_t minor, uint32_t flags) { struct dlm_write_request req; set_version_v6(&req); req.cmd = DLM_USER_REMOVE_LOCKSPACE; req.i.lspace.minor = minor; req.i.lspace.flags = flags; return write(control_fd, &req, sizeof(req)); } static int release_lockspace(uint32_t minor, uint32_t flags) { if (kernel_version.version[0] == 5) return release_lockspace_v5(minor, flags); else return release_lockspace_v6(minor, flags); } int dlm_release_lockspace(const char *name, dlm_lshandle_t ls, int force) { char dev_path[PATH_MAX]; struct stat st; struct dlm_ls_info *lsinfo = (struct dlm_ls_info *)ls; uint32_t flags = 0; int fd, is_symlink = 0; ls_dev_name(name, dev_path, sizeof(dev_path)); if (!lstat(dev_path, &st) && S_ISLNK(st.st_mode)) is_symlink = 1; /* We need the minor number */ if (fstat(lsinfo->fd, &st)) return -1; /* Close the lockspace first if it's in use */ ls_pthread_cleanup(lsinfo); if (open_control_device()) return -1; if (force) flags = DLM_USER_LSFLG_FORCEFREE; release_lockspace(minor(st.st_rdev), flags); if (!is_symlink) return 0; /* The following open is used to detect if our release was the last. It will fail if our release was the last, because either: . udev has already removed the truncated sysfs device name (ENOENT) . the misc device has been deregistered in the kernel (ENODEV) (the deregister completes before release returns) So, if the open fails, we know that our release was the last, udev will be removing the device with the truncated name (if it hasn't already), and we should remove the symlink. 
*/ fd = open(dev_path, O_RDWR); if (fd < 0) unlink(dev_path); else close(fd); /* our release was not the last */ return 0; } /* * Normal users just open/close lockspaces */ dlm_lshandle_t dlm_open_lockspace(const char *name) { char dev_name[PATH_MAX]; struct dlm_ls_info *newls; int saved_errno; /* Need to detect kernel version */ if (open_control_device()) return NULL; newls = malloc(sizeof(struct dlm_ls_info)); if (!newls) return NULL; newls->tid = 0; ls_dev_name(name, dev_name, sizeof(dev_name)); newls->fd = open(dev_name, O_RDWR); saved_errno = errno; if (newls->fd == -1) { free(newls); errno = saved_errno; return NULL; } fcntl(newls->fd, F_SETFD, 1); return (dlm_lshandle_t)newls; } int dlm_close_lockspace(dlm_lshandle_t ls) { struct dlm_ls_info *lsinfo = (struct dlm_ls_info *)ls; ls_pthread_cleanup(lsinfo); return 0; } int dlm_kernel_version(uint32_t *major, uint32_t *minor, uint32_t *patch) { if (open_control_device()) return -1; *major = kernel_version.version[0]; *minor = kernel_version.version[1]; *patch = kernel_version.version[2]; return 0; } void dlm_library_version(uint32_t *major, uint32_t *minor, uint32_t *patch) { *major = DLM_DEVICE_VERSION_MAJOR; *minor = DLM_DEVICE_VERSION_MINOR; *patch = DLM_DEVICE_VERSION_PATCH; } dlm-4.3.0/libdlm/libdlm.h000066400000000000000000000167351461150666200151740ustar00rootroot00000000000000/* * Copyright 2004-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifndef __LIBDLM_H #define __LIBDLM_H #include /* * Typedefs for things that are compatible with the kernel but replicated here * so that users only need the libdlm include file. libdlm itself needs the * full kernel file so shouldn't use these. */ #define DLM_LVB_LEN 32 #ifndef BUILDING_LIBDLM /* * These two lengths are copied from linux/dlmconstants.h * They are the max length of a lockspace name and the max length of a * resource name. 
*/ #define DLM_LOCKSPACE_LEN 64 #define DLM_RESNAME_MAXLEN 64 struct dlm_lksb { int sb_status; uint32_t sb_lkid; char sb_flags; char *sb_lvbptr; }; /* lksb flags */ #define DLM_SBF_DEMOTED 0x01 #define DLM_SBF_VALNOTVALID 0x02 #define DLM_SBF_ALTMODE 0x04 /* dlm_new_lockspace flags */ #define DLM_LSFL_TIMEWARN 0x00000002 #define DLM_LSFL_FS 0x00000004 #define DLM_LSFL_NEWEXCL 0x00000008 #endif #if 0 /* Dummy definition to keep linkages */ struct dlm_queryinfo; #endif extern int dlm_kernel_version(uint32_t *maj, uint32_t *min, uint32_t *patch); extern void dlm_library_version(uint32_t *maj, uint32_t *min, uint32_t *patch); /* * Using the default lockspace * * lock_resource() - simple sync request or convert (requires pthreads) * unlock_resource() - simple sync unlock (requires pthreads) * dlm_lock() - async request or convert * dlm_unlock() - async unlock or cancel * dlm_lock_wait() - sync request or convert * dlm_unlock_wait() - sync unlock or cancel */ #ifdef _REENTRANT extern int lock_resource(const char *resource, int mode, int flags, int *lockid); extern int unlock_resource(int lockid); #endif extern int dlm_lock(uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, /* unused */ void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), void *range); /* unused */ extern int dlm_unlock(uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb, void *astarg); extern int dlm_lock_wait(uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, /* unused */ void *bastarg, void (*bastaddr) (void *bastarg), void *range); /* unused */ extern int dlm_unlock_wait(uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb); /* * These two are for users that want to do their own FD handling * * dlm_get_fd() - returns fd for the default lockspace for polling and dispatch * dlm_dispatch() - dispatches pending asts and basts */ extern int dlm_get_fd(void); extern int dlm_dispatch(int fd); /* * Creating your own lockspace * * dlm_create_lockspace() - create and open a lockspace and return a handle * to it. Privileges are required to create/release. * dlm_new_lockspace() - same as create but allows flags * dlm_open_lockspace() - simply returns a handle for an existing lockspace and * may be called by ordinary users. * dlm_release_lockspace() * dlm_close_lockspace() * dlm_ls_get_fd() * * NOTE: if you dlm_create_lockspace() then dlm_open_lockspace() you will * have two open files on the same device. Hardly a major problem but I thought * it worth pointing out.
*/ typedef void *dlm_lshandle_t; extern dlm_lshandle_t dlm_create_lockspace(const char *name, mode_t mode); extern int dlm_release_lockspace(const char *name, dlm_lshandle_t ls, int force); extern dlm_lshandle_t dlm_open_lockspace(const char *name); extern int dlm_close_lockspace(dlm_lshandle_t ls); extern int dlm_ls_get_fd(dlm_lshandle_t ls); extern dlm_lshandle_t dlm_new_lockspace(const char *name, mode_t mode, uint32_t flags); /* * Using your own lockspace * * dlm_ls_lock() * dlm_ls_lockx() * dlm_ls_unlock() * dlm_ls_lock_wait() * dlm_ls_unlock_wait() * dlm_ls_deadlock_cancel() * dlm_ls_purge() */ extern int dlm_ls_lock(dlm_lshandle_t lockspace, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, /* unused */ void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), void *range); /* unused */ extern int dlm_ls_lockx(dlm_lshandle_t lockspace, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, /* unused */ void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), uint64_t *xid, uint64_t *timeout); extern int dlm_ls_unlock(dlm_lshandle_t lockspace, uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb, void *astarg); extern int dlm_ls_lock_wait(dlm_lshandle_t lockspace, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, /* unused */ void *bastarg, void (*bastaddr) (void *bastarg), void *range); /* unused */ extern int dlm_ls_unlock_wait(dlm_lshandle_t lockspace, uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb); extern int dlm_ls_deadlock_cancel(dlm_lshandle_t ls, uint32_t lkid, uint32_t flags); extern int dlm_ls_purge(dlm_lshandle_t lockspace, int nodeid, int pid); /* * For threaded applications * * dlm_pthread_init() * dlm_ls_pthread_init() - call this before any locking operations and the ASTs * will be delivered in their own thread. * dlm_pthread_cleanup() - call the cleanup routine at application exit * (optional) or, if the locking functions are in a * shared library that is to be unloaded. 
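* * A minimal sketch of threaded use (illustrative only; error handling is * abbreviated and the resource name "example" is just an example): * * struct dlm_lksb lksb; * memset(&lksb, 0, sizeof(lksb)); * if (dlm_pthread_init()) * return -1; * if (dlm_lock_wait(LKM_EXMODE, &lksb, LKF_NOQUEUE, * "example", 7, 0, NULL, NULL, NULL) || lksb.sb_status) * return -1; * dlm_unlock_wait(lksb.sb_lkid, 0, &lksb); * dlm_pthread_cleanup(); * * The sync request returns 0 once the operation completes; the final lock * status is then found in lksb.sb_status and the lock id in lksb.sb_lkid.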
* * dlm_close/release_lockspace() will tidy the threads for a non-default * lockspace */ #ifdef _REENTRANT extern int dlm_pthread_init(void); extern int dlm_ls_pthread_init(dlm_lshandle_t lockspace); extern int dlm_pthread_cleanup(void); #endif /* * Lock modes */ #define LKM_NLMODE 0 /* null lock */ #define LKM_CRMODE 1 /* concurrent read */ #define LKM_CWMODE 2 /* concurrent write */ #define LKM_PRMODE 3 /* protected read */ #define LKM_PWMODE 4 /* protected write */ #define LKM_EXMODE 5 /* exclusive */ /* * Locking flags - these match the ones in dlm.h */ #define LKF_NOQUEUE 0x00000001 #define LKF_CANCEL 0x00000002 #define LKF_CONVERT 0x00000004 #define LKF_VALBLK 0x00000008 #define LKF_QUECVT 0x00000010 #define LKF_IVVALBLK 0x00000020 #define LKF_CONVDEADLK 0x00000040 #define LKF_PERSISTENT 0x00000080 #define LKF_NODLCKWT 0x00000100 #define LKF_NODLCKBLK 0x00000200 #define LKF_EXPEDITE 0x00000400 #define LKF_NOQUEUEBAST 0x00000800 #define LKF_HEADQUE 0x00001000 #define LKF_NOORDER 0x00002000 #define LKF_ORPHAN 0x00004000 #define LKF_ALTPR 0x00008000 #define LKF_ALTCW 0x00010000 #define LKF_FORCEUNLOCK 0x00020000 #define LKF_TIMEOUT 0x00040000 #define LKF_WAIT 0x80000000 /* Userspace only, for sync API calls */ /* * Extra return codes used by the DLM */ #define ECANCEL 0x10001 #define EUNLOCK 0x10002 #define EINPROG 0x10003 /* lock operation is in progress */ #endif dlm-4.3.0/libdlm/libdlm.pc.in000066400000000000000000000003071461150666200157400ustar00rootroot00000000000000prefix=@PREFIX@ exec_prefix=${prefix} includedir=${prefix}/include libdir=@LIBDIR@ Name: libdlm Description: The dlm library Version: 4.0.0 Cflags: -I${includedir} Libs: -L${libdir} -ldlm -lpthread dlm-4.3.0/libdlm/libdlm_internal.h000066400000000000000000000007341461150666200170600ustar00rootroot00000000000000/* * Copyright 2004-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
*/ /* Needed before we include the kernel libdlm header */ #define __user typedef uint8_t __u8; typedef uint16_t __u16; typedef uint32_t __u32; #define BUILDING_LIBDLM dlm-4.3.0/libdlm/libdlm_lt.pc.in000066400000000000000000000003201461150666200164320ustar00rootroot00000000000000prefix=@PREFIX@ exec_prefix=${prefix} includedir=${prefix}/include libdir=@LIBDIR@ Name: libdlm_lt Description: The non-threaded dlm library Version: 4.0.0 Cflags: -I${includedir} Libs: -L${libdir} -ldlm_lt dlm-4.3.0/libdlm/man/000077500000000000000000000000001461150666200143175ustar00rootroot00000000000000dlm-4.3.0/libdlm/man/dlm_cleanup.3000066400000000000000000000000221461150666200166600ustar00rootroot00000000000000.so man3/libdlm.3 dlm-4.3.0/libdlm/man/dlm_close_lockspace.3000066400000000000000000000000401461150666200203620ustar00rootroot00000000000000.so man3/dlm_create_lockspace.3 dlm-4.3.0/libdlm/man/dlm_create_lockspace.3000066400000000000000000000110641461150666200205300ustar00rootroot00000000000000.TH DLM_CREATE_LOCKSPACE 3 "July 5, 2007" "libdlm functions" .SH NAME dlm_create_lockspace, dlm_open_lockspace, dlm_close_lockspace, dlm_release_lockspace \- manipulate DLM lockspaces .SH SYNOPSIS .nf #include <libdlm.h> dlm_lshandle_t dlm_create_lockspace(const char *name, mode_t mode); dlm_lshandle_t dlm_new_lockspace(const char *name, mode_t mode, uint32_t flags); dlm_lshandle_t dlm_open_lockspace(const char *name); int dlm_close_lockspace(dlm_lshandle_t ls); int dlm_release_lockspace(const char *name, dlm_lshandle_t ls, int force); .fi .SH DESCRIPTION The DLM allows locks to be partitioned into "lockspaces", and these can be manipulated by userspace calls. It is possible (though not recommended) for an application to have multiple lockspaces open at one time. Many of the DLM calls work on the "default" lockspace, which should be fine for most users. The calls with _ls_ in them allow you to isolate your application from all others running in the cluster. Remember, lockspaces are a cluster-wide resource, so if you create a lockspace called "myls" it will share locks with a lockspace called "myls" on all nodes. These calls allow users to create & remove lockspaces, and to connect to an existing lockspace to store their locks there. .PP .SS dlm_lshandle_t dlm_create_lockspace(const char *name, mode_t mode); .br This creates a lockspace called <name>, and the mode of the file used to access it will be <mode> (subject to umask as usual). The lockspace must not already exist on this node; if it does, NULL will be returned and errno will be set to EEXIST. If you really want to use this lockspace you can then use dlm_open_lockspace() below. The name is the name of a misc device that will be created in /dev/misc. .br On success a handle to the lockspace is returned, which can be passed to subsequent dlm_ls_lock/unlock calls. Make no assumptions as to the content of this handle as its content may change in future. .br The caller must have CAP_SYS_ADMIN privileges to do this operation. .PP Return codes: a lockspace handle is returned if the call completed successfully.
If not, NULL is returned and errno is set to one of the following: .nf EINVAL An invalid parameter was passed to the call ENOMEM A (kernel) memory allocation failed EEXIST The lockspace already exists EPERM Process does not have capability to create lockspaces ENOSYS A fatal error occurred initializing the DLM Any error returned by the open() system call .fi .SS int dlm_new_lockspace(const char *name, mode_t mode, uint32_t flags) .PP Performs the same function as .B dlm_create_lockspace() above, but passes some creation flags to the call that affect the lockspace being created. .SS int dlm_release_lockspace(const char *name, dlm_lshandle_t ls, int force) .PP Deletes a lockspace. If the lockspace still has active locks then -1 will be returned and errno set to EBUSY. Both the lockspace handle /and/ the name must be specified. This call also closes the lockspace and stops the thread associated with the lockspace, if any. .br Note that other nodes in the cluster may still have locks open on this lockspace. This call only removes the lockspace from the current node. If the force flag is set then the lockspace will be removed even if another user on this node has active locks in it. Existing users will NOT be notified if you do this, so be careful. .br The caller must have CAP_SYS_ADMIN privileges to do this operation. .PP Return codes: 0 is returned if the call completed successfully. If not, -1 is returned and errno is set to one of the following: .nf EINVAL An invalid parameter was passed to the call EPERM Process does not have capability to release lockspaces EBUSY The lockspace could not be freed because it still contains locks and force was not set. .fi .SS dlm_lshandle_t dlm_open_lockspace(const char *name) .PP Opens an already existing lockspace and returns a handle to it. .PP Return codes: a lockspace handle is returned if the call completed successfully. If not, NULL is returned and errno is set to an error returned by the open() system call. .SS int dlm_close_lockspace(dlm_lshandle_t ls) .br Close the lockspace. Any locks held by this process will be freed. If a thread is associated with this lockspace then it will be stopped. .PP Return codes: 0 is returned if the call completed successfully.
If not, -1 is returned and errno is set to one of the following: .nf EINVAL lockspace was not a valid lockspace handle .fi .SH SEE ALSO .BR libdlm (3), .BR dlm_unlock (3), .BR dlm_lock (3), dlm-4.3.0/libdlm/man/dlm_dispatch.3000066400000000000000000000000221461150666200170300ustar00rootroot00000000000000.so man3/libdlm.3 dlm-4.3.0/libdlm/man/dlm_get_fd.3000066400000000000000000000000221461150666200164610ustar00rootroot00000000000000.so man3/libdlm.3 dlm-4.3.0/libdlm/man/dlm_lock.3000066400000000000000000000200361461150666200161700ustar00rootroot00000000000000.TH DLM_LOCK 3 "July 5, 2007" "libdlm functions" .SH NAME dlm_lock \- acquire or convert a DLM lock .SH SYNOPSIS .nf #include <libdlm.h> int dlm_lock(uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, /* unused */ void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), void *range); /* unused */ int dlm_lock_wait(uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, /* unused */ void *bastarg, void (*bastaddr) (void *bastarg), void *range); /* unused */ int dlm_ls_lock(dlm_lshandle_t lockspace, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, /* unused */ void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), void *range); /* unused */ int dlm_ls_lock_wait(dlm_lshandle_t lockspace, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, /* unused */ void *bastarg, void (*bastaddr) (void *bastarg), void *range); /* unused */ int dlm_ls_lockx(dlm_lshandle_t lockspace, uint32_t mode, struct dlm_lksb *lksb, uint32_t flags, const void *name, unsigned int namelen, uint32_t parent, /* unused */ void (*astaddr) (void *astarg), void *astarg, void (*bastaddr) (void *astarg), uint64_t *xid, uint64_t *timeout); .fi .SH DESCRIPTION dlm_lock and its variants acquire and convert locks in the DLM. .PP dlm_lock() operations are asynchronous. If the call to dlm_lock returns an error then the operation has failed and the AST routine will not be called. If dlm_lock returns 0 it is still possible that the lock operation will fail. The AST routine will be called when the locking is complete or has failed, and the status is returned in the lksb. .B dlm_lock_wait() will wait until the lock operation has completed and returns the final completion status. .B dlm_ls_lock() is the same as .B dlm_lock() but takes a lockspace argument. This lockspace must have been previously opened by .B dlm_open_lockspace() or .B dlm_create_lockspace(). .PP For conversion operations the name and namelen are ignored and the lock ID in the LKSB is used to identify the lock to be converted. .PP If a lock value block is specified then in general, a grant or a conversion to an equal-level or higher-level lock mode reads the lock value from the resource into the caller's lock value block. When a lock conversion from EX or PW to an equal-level or lower-level lock mode occurs, the contents of the caller's lock value block are written into the resource. If the LVB is invalidated the lksb.sb_flags member will be set to DLM_SBF_VALNOTVALID. Lock value blocks are always 32 bytes long. .PP If AST routines or arguments are passed to a conversion operation then they will overwrite the values that were passed to the previous dlm_lock call. .PP .B mode Lock mode to acquire or convert to.
.nf LKM_NLMODE NULL Lock LKM_CRMODE Concurrent read LKM_CWMODE Concurrent write LKM_PRMODE Protected read LKM_PWMODE Protected write LKM_EXMODE Exclusive .fi .PP .B flags Affect the operation of the lock call: .nf LKF_NOQUEUE Don't queue the lock. If it cannot be granted, the operation fails with EAGAIN LKF_CONVERT Convert an existing lock LKF_VALBLK Lock has a value block LKF_QUECVT Put the conversion at the back of the convert queue LKF_EXPEDITE Grant a NL lock immediately regardless of other locks on the conversion queue LKF_PERSISTENT Specifies a lock that will not be unlocked when the process exits; it will become an orphan lock. LKF_CONVDEADLK Enable internal conversion deadlock resolution where the lock's granted mode may be set to NL and DLM_SBF_DEMOTED is returned in lksb.sb_flags. LKF_NODLCKWT Do not consider this lock when trying to detect deadlock conditions. LKF_NODLCKBLK Not implemented LKF_NOQUEUEBAST Send blocking ASTs even for NOQUEUE operations LKF_HEADQUE Add locks to the head of the convert or waiting queue LKF_NOORDER Avoid the VMS rules on grant order LKF_ALTPR If the requested mode can't be granted (generally CW), try to grant in PR and return DLM_SBF_ALTMODE. LKF_ALTCW If the requested mode can't be granted (generally PR), try to grant in CW and return DLM_SBF_ALTMODE. LKF_TIMEOUT The lock will time out per the timeout arg. .fi .PP .B lksb Lock status block. .br This structure contains the returned lock ID, the actual status of the lock operation (all lock ops are asynchronous) and the value block if LKF_VALBLK is set. .PP .B name .br Name of the lock. Can be binary, max 64 bytes. Ignored for lock conversions. (Should be a string to work with debugging tools.) .PP .B namelen .br Length of the above name. Ignored for lock conversions. .PP .B parent .br ID of parent lock or NULL if this is a top-level lock. This is currently unused. .PP .B ast .br Address of the AST routine to be called when the lock operation completes. The final completion status of the lock will be in the lksb. The AST routine must not be NULL. .PP .B astarg .br Argument to pass to the AST routine (most people pass the lksb in here, but it can be anything you like). .PP .B bast .br Blocking AST routine: the address of a function to call if this lock is blocking another. The function will be called with astarg. .PP .B range .br This is unused. .PP .B xid .br Optional transaction ID for deadlock detection. .PP .B timeout .br Timeout in centiseconds. If it takes longer than this to acquire the lock (usually because it is already blocked by another lock), then the AST will trigger with ETIMEDOUT as the status. If the lock operation is a conversion then the lock will remain at its current status. If this is a new lock then the lock will not exist and the lock ID in the lksb will be invalid. This is ignored without the LKF_TIMEOUT flag. .PP .SS Return values 0 is returned if the call completed successfully. If not, -1 is returned and errno is set to one of the following: .PP .nf EINVAL An invalid parameter was passed to the call (e.g. a bad lock mode or flag) ENOMEM A (kernel) memory allocation failed EAGAIN LKF_NOQUEUE was requested and the lock could not be granted EBUSY The lock is currently being locked or converted EFAULT The userland buffer could not be read/written by the kernel (this indicates a library problem) EDEADLOCK The lock operation is causing a deadlock and has been cancelled. If this was a conversion then the lock is reverted to its previously granted state. If it was a new lock then it has not been granted.
(NB Only conversion deadlocks are currently detected) .PP If an error is returned in the AST, then lksb.sb_status is set to one of the above values instead of zero. .SS Structures .nf struct dlm_lksb { int sb_status; /* Final status of lock operation */ uint32_t sb_lkid; /* ID of lock. Returned from dlm_lock() on first use. Used as input to dlm_lock() for a conversion operation */ char sb_flags; /* Completion flags, see above */ char *sb_lvbptr; /* Optional pointer to lock value block */ }; .fi .SH EXAMPLE .nf int status; struct dlm_lksb lksb; status = dlm_lock_wait(LKM_EXMODE, &lksb, LKF_NOQUEUE, "MyLock", strlen("MyLock"), 0, // Parent, NULL, // bast arg NULL, // bast routine, NULL); // Range if (status == 0) dlm_unlock_wait(lksb.sb_lkid, 0, &lksb); .fi .SH SEE ALSO .BR libdlm (3), .BR dlm_unlock (3), .BR dlm_open_lockspace (3), .BR dlm_create_lockspace (3), .BR dlm_close_lockspace (3), .BR dlm_release_lockspace (3) dlm-4.3.0/libdlm/man/dlm_lock_wait.3000066400000000000000000000000241461150666200172070ustar00rootroot00000000000000.so man3/dlm_lock.3 dlm-4.3.0/libdlm/man/dlm_ls_lock.3000066400000000000000000000000241461150666200166610ustar00rootroot00000000000000.so man3/dlm_lock.3 dlm-4.3.0/libdlm/man/dlm_ls_lock_wait.3000066400000000000000000000000241461150666200177050ustar00rootroot00000000000000.so man3/dlm_lock.3 dlm-4.3.0/libdlm/man/dlm_ls_lockx.3000066400000000000000000000000241461150666200170510ustar00rootroot00000000000000.so man3/dlm_lock.3 dlm-4.3.0/libdlm/man/dlm_ls_pthread_init.3000066400000000000000000000000221461150666200204010ustar00rootroot00000000000000.so man3/libdlm.3 dlm-4.3.0/libdlm/man/dlm_ls_unlock.3000066400000000000000000000000261461150666200172260ustar00rootroot00000000000000.so man3/dlm_unlock.3 dlm-4.3.0/libdlm/man/dlm_ls_unlock_wait.3000066400000000000000000000000261461150666200202520ustar00rootroot00000000000000.so man3/dlm_unlock.3 dlm-4.3.0/libdlm/man/dlm_new_lockspace.3000066400000000000000000000000401461150666200200460ustar00rootroot00000000000000.so man3/dlm_create_lockspace.3 dlm-4.3.0/libdlm/man/dlm_open_lockspace.3000066400000000000000000000000401461150666200202160ustar00rootroot00000000000000.so man3/dlm_create_lockspace.3 dlm-4.3.0/libdlm/man/dlm_pthread_init.3000066400000000000000000000000221461150666200177030ustar00rootroot00000000000000.so man3/libdlm.3 dlm-4.3.0/libdlm/man/dlm_release_lockspace.3000066400000000000000000000000401461150666200206750ustar00rootroot00000000000000.so man3/dlm_create_lockspace.3 dlm-4.3.0/libdlm/man/dlm_unlock.3000066400000000000000000000055441461150666200165420ustar00rootroot00000000000000.TH DLM_UNLOCK 3 "July 5, 2007" "libdlm functions" .SH NAME dlm_unlock \- unlock a DLM lock .SH SYNOPSIS .nf #include <libdlm.h> int dlm_unlock(uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb, void *astarg); int dlm_unlock_wait(uint32_t lkid, uint32_t flags, struct dlm_lksb *lksb); .fi .SH DESCRIPTION .B dlm_unlock() unlocks a lock previously acquired by dlm_lock and its variants. .PP Unless .B dlm_unlock_wait() is used, unlocks are also asynchronous. The AST routine is called when the resource is successfully unlocked (see below). .PP .B lkid Lock ID as returned in the lksb. .PP .B flags Flags affecting the unlock operation: .nf LKF_CANCEL Cancel a pending lock or conversion. This returns the lock to its previously granted mode (in case of a conversion) or unlocks it (in case of a waiting lock). LKF_IVVALBLK Invalidate value block LKF_FORCEUNLOCK Unlock the lock even if it's waiting.
.fi .PP .B lksb LKSB to return status and value block information. .PP .B astarg New parameter to be passed to the completion AST. The completion AST routine is the last completion AST routine specified in a dlm_lock call. If dlm_lock_wait() was the last routine to issue a lock, dlm_unlock_wait() must be used to release the lock. If dlm_lock() was the last routine to issue a lock then either dlm_unlock() or dlm_unlock_wait() may be called. .PP .SS Return values 0 is returned if the call completed successfully. If not, -1 is returned and errno is set to one of the following: .PP .nf EINVAL An invalid parameter was passed to the call (e.g. a bad lock mode or flag) EINPROGRESS The lock is already being unlocked EBUSY The lock is currently being locked or converted ENOTEMPTY An attempt was made to unlock a parent lock that still has child locks. ECANCEL A lock conversion was successfully cancelled EUNLOCK An unlock operation completed successfully (sb_status only) EFAULT The userland buffer could not be read/written by the kernel .fi If an error is returned in the AST, then lksb.sb_status is set to one of the above values instead of zero. .SH EXAMPLE .nf int status; struct dlm_lksb lksb; status = dlm_lock_wait(LKM_EXMODE, &lksb, LKF_NOQUEUE, "MyLock", strlen("MyLock"), 0, // Parent, NULL, // bast arg NULL, // bast routine, NULL); // Range if (status == 0) dlm_unlock_wait(lksb.sb_lkid, 0, &lksb); .fi .SH SEE ALSO .BR libdlm (3), .BR dlm_lock (3), .BR dlm_open_lockspace (3), .BR dlm_create_lockspace (3), .BR dlm_close_lockspace (3), .BR dlm_release_lockspace (3) dlm-4.3.0/libdlm/man/dlm_unlock_wait.3000066400000000000000000000000261461150666200175540ustar00rootroot00000000000000.so man3/dlm_unlock.3 dlm-4.3.0/libdlm/man/libdlm.3000066400000000000000000000070721461150666200156540ustar00rootroot00000000000000.TH LIBDLM 3 "July 5, 2007" "libdlm functions" .SH NAME libdlm \- dlm_get_fd, dlm_dispatch, dlm_pthread_init, dlm_ls_pthread_init, dlm_cleanup .SH SYNOPSIS .nf #include <libdlm.h> int dlm_pthread_init(); int dlm_ls_pthread_init(dlm_lshandle_t lockspace); int dlm_pthread_cleanup(); int dlm_get_fd(void); int dlm_dispatch(int fd); link with -ldlm .fi .SH DESCRIPTION libdlm provides the programmatic userspace interface to the Distributed Lock Manager. It provides all the calls you need to manipulate locks and lockspaces. .br libdlm can be used in pthread or non-pthread applications. For pthread applications simply call the following function before doing any lock operations. If you're using pthreads, remember to define _REENTRANT at the top of the program or use -D_REENTRANT on the compile line. .br pthreads is the normal way of using the DLM. This way you simply initialize the DLM's thread and all the AST routines will be delivered in that thread. You just call the dlm_lock() etc routines in the main line of your program. .br If you don't want to use pthreads or you want to handle the dlm callback ASTs yourself then you can get an FD handle to the DLM device and call .B dlm_dispatch() on it whenever it becomes active. That way ASTs will be delivered in the context of the thread/process that called .B dlm_dispatch(). .SS int dlm_pthread_init() .br Creates a thread to receive all lock ASTs. The AST callback function for lock operations will be called in the context of this thread. If there is a potential for local resource access conflicts you must provide your own pthread-based locking in the AST routine.
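.PP
For example, a minimal sketch of the pthread pattern on the default lockspace (error handling omitted; here the AST argument is simply the lksb, but it can be anything):
.nf
static struct dlm_lksb lksb;

static void ast_routine(void *arg)
{
    struct dlm_lksb *done = arg;
    /* done->sb_status holds the final status of the request */
}

...
int status;

dlm_pthread_init();
status = dlm_lock(LKM_EXMODE, &lksb, LKF_NOQUEUE,
                  "MyLock", strlen("MyLock"), 0,
                  ast_routine, &lksb, NULL, NULL);
.fi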
.PP .SS int dlm_ls_pthread_init(dlm_lshandle_t lockspace) .br As dlm_pthread_init() but initializes a thread for the specified lockspace. .PP .SS int dlm_pthread_cleanup() .br Cleans up the default lockspace threads after use. Normally you don't need to call this, but if the locking code is in a dynamically loadable shared library this will probably be necessary. .br For non-pthread based applications the DLM provides a file descriptor that the program can feed into poll/select. If activity is detected on that FD then a dispatch function should be called: .PP .SS int dlm_get_fd() Returns a file descriptor for the DLM suitable for passing into poll() or select(). .PP .SS int dlm_dispatch(int fd) .br Reads from the DLM and calls any AST routines that may be needed. This routine runs in the context of the caller so no extra locking is needed to protect local resources. .PP .SH libdlm_lt There also exists a "light" version of the libdlm library called libdlm_lt. This is provided for those applications that do not want to use pthread functions. If you use this library it is important that your application is NOT compiled with -D_REENTRANT or linked with libpthread. .SH EXAMPLES Create a lockspace and start a thread to deliver its callbacks: .nf dlm_lshandle_t ls; ls = dlm_create_lockspace("myLS", 0660); dlm_ls_pthread_init(ls); ... status = dlm_ls_lock(ls, ... ); .fi .PP Using poll(2) to wait for and dispatch ASTs .nf static int poll_for_ast(dlm_lshandle_t ls) { struct pollfd pfd; pfd.fd = dlm_ls_get_fd(ls); pfd.events = POLLIN; while (!ast_called) { if (poll(&pfd, 1, -1) < 0) { perror("poll"); return -1; } dlm_dispatch(pfd.fd); } ast_called = 0; return 0; } .fi .SH SEE ALSO .BR libdlm (3), .BR dlm_lock (3), .BR dlm_unlock (3), .BR dlm_open_lockspace (3), .BR dlm_create_lockspace (3), .BR dlm_close_lockspace (3), .BR dlm_release_lockspace (3) dlm-4.3.0/python/000077500000000000000000000000001461150666200136225ustar00rootroot00000000000000dlm-4.3.0/python/README000066400000000000000000000004161461150666200145030ustar00rootroot00000000000000This directory contains Python bindings to libdlm and short tests written in Python to exercise dlm functionality. NOTE: the bindings are still experimental and are currently only used for testing.
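A minimal usage sketch of the bindings (assuming dlm_controld is running and
the process has sufficient privileges; the lockspace and lock names here are
arbitrary examples):

    from dlm import Lockspace, LockMode

    ls = Lockspace("example1")
    lock = ls.create_lock("fooobaar")   # note: 8 byte aligned name
    lock.lock_wait(mode=LockMode.EX)
    # ... the resource is now held exclusively cluster-wide ...
    lock.unlock_wait()
    del lock
    ls.release()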
Future work: - look into pytest - add ebpf examples - look into MPI for barrier() dlm-4.3.0/python/bindings/000077500000000000000000000000001461150666200154175ustar00rootroot00000000000000dlm-4.3.0/python/bindings/__init__.py000066400000000000000000000000001461150666200175160ustar00rootroot00000000000000dlm-4.3.0/python/bindings/dlm.py000066400000000000000000000127451461150666200165540ustar00rootroot00000000000000from enum import IntEnum, IntFlag import sys, os, errno, platform import ctypes, ctypes.util #bindings if platform.system() != "Linux": sys.exit("Not supported") c_path_libdlm = ctypes.util.find_library("dlm_lt") if not c_path_libdlm: print("Unable to find the dlm_lt library.") sys.exit() try: c_libdlm = ctypes.CDLL(c_path_libdlm) except OSError: print("Unable to load the dlm_lt library") sys.exit() class C_DLM_LSHANDLE(ctypes.c_void_p): pass class C_DLM_LKSB(ctypes.Structure): _fields_ = [('sb_status', ctypes.c_int), ('sb_lkid', ctypes.c_uint32), ('sb_flags', ctypes.c_char), ('sb_lvbptr', ctypes.c_char_p)] C_BAST_CB = ctypes.CFUNCTYPE(None, ctypes.py_object) #dlm_create_lockspace c_dlm_create_lockspace = c_libdlm.dlm_create_lockspace c_dlm_create_lockspace.argtypes = [ctypes.c_char_p, #name ctypes.c_uint, #mode ] c_dlm_create_lockspace.restype = C_DLM_LSHANDLE #dlm_release_lockspace c_dlm_release_lockspace = c_libdlm.dlm_release_lockspace c_dlm_release_lockspace.argtypes = [ctypes.c_char_p, #name C_DLM_LSHANDLE, #ls ctypes.c_int, #force ] c_dlm_release_lockspace.restype = ctypes.c_int #dlm_ls_lock_wait c_dlm_ls_lock_wait = c_libdlm.dlm_ls_lock_wait c_dlm_ls_lock_wait.argtypes = [C_DLM_LSHANDLE, #ls ctypes.c_uint32, #mode ctypes.POINTER(C_DLM_LKSB), #lksb ctypes.c_uint32, #flags ctypes.c_char_p, #name ctypes.c_uint, #namelen ctypes.c_uint32, #parent ctypes.py_object, #bastarg C_BAST_CB, #bastaddr ctypes.c_void_p, #range ] c_dlm_ls_lock_wait.restype = ctypes.c_int #dlm_ls_unlock_wait c_dlm_ls_unlock_wait = c_libdlm.dlm_ls_unlock_wait c_dlm_ls_unlock_wait.argtypes = [C_DLM_LSHANDLE, #ls ctypes.c_uint32, #lkid ctypes.c_uint32, #flags ctypes.POINTER(C_DLM_LKSB), #lksb ] c_dlm_ls_unlock_wait.restype = ctypes.c_int #classes class LockMode(IntEnum): IV = -1 NL = 0 CR = 1 CW = 2 PR = 3 PW = 4 EX = 5 class LockFlag(IntFlag): NOQUEUE = 0x00000001 CANCEL = 0x00000002 CONVERT = 0x00000004 VALBLK = 0x00000008 QUECVT = 0x00000010 IVVALBLK = 0x00000020 CONVDEADLK = 0x00000040 PERSISTENT = 0x00000080 NODLCKWT = 0x00000100 NODLCKBLK = 0x00000200 EXPEDITE = 0x00000400 NOQUEUEBAST = 0x00000800 HEADQUE = 0x00001000 NOORDER = 0x00002000 ORPHAN = 0x00004000 ALTPR = 0x00008000 ALTCW = 0x00010000 FORCEUNLOCK = 0x00020000 TIMEOUT = 0x00040000 class LockSBFlag(IntFlag): DEMOTED = 0x01 VALNOTVALID = 0x02 ALTMODE = 0x04 class DLMError(OSError): def __init__(self, errno): if not errno < 0: raise ValueError() errno = abs(errno) super().__init__(errno, os.strerror(errno)) DLM_LOCK_TO_STR_FORMAT = \ """name: {} last_mode: {} last_flags: {} local_locked: {} last_sb: status: {}, lkid: {}, flags: {}, lvb: {}""" class Lockspace: def __init__(self, name="default", mode=0o600): self.__lsname = name self.__ls = c_dlm_create_lockspace(self.__lsname.encode(), mode) if not self.__ls: raise DLMError(-errno.ENOMEM) def release(self, force=2): if not self.__ls: return rc = c_dlm_release_lockspace(self.__lsname.encode(), self.__ls, force) if rc: raise DLMError(rc) self.__ls = None def __del__(self): self.release() def __str__(self): return "Lockspace: {}".format(self.__lsname) def get_name(self): return self.__lsname #
lockspace lock factory def create_lock(self, name): class Lock: #note name should be 8 byte aligned for now def __init__(self, ls, c_ls, name): self.__local_locked = False self.__last_mode = LockMode.IV self.__last_flags = LockFlag(0) self.__lk = C_DLM_LKSB() self.__lk.sb_status = 0 self.__lk.sb_lkid = 0 self.__lk.sb_flags = LockSBFlag(0) self.__lk.sb_lvbptr = None self.__ls = ls self.__c_ls = c_ls self.__name = name def __del__(self): if self.__local_locked: self.unlock_wait() def __str__(self): sb = self.get_sb() return DLM_LOCK_TO_STR_FORMAT.format( self.__name, str(self.__last_mode), str(self.__last_flags), self.__local_locked, str(sb.status), sb.lkid, str(sb.flags), str(sb.lvb)) def get_name(self): return self.__name def get_ls(self): return self.__ls # get a copy of current sb state in high-level python def get_sb(self): class LockSB: def __init__(self, status, lkid, flags, lvb): self.status = status self.lkid = lkid self.flags = LockSBFlag(flags[0]) self.lvb = lvb return LockSB(self.__lk.sb_status, self.__lk.sb_lkid, self.__lk.sb_flags, self.__lk.sb_lvbptr) def lock_wait(self, mode=LockMode.EX, flags=LockFlag(0), bast=None, bastarg=None): if bast: bast = C_BAST_CB(bast) else: bast = ctypes.cast(None, C_BAST_CB) rc = c_dlm_ls_lock_wait(self.__c_ls, mode, ctypes.byref(self.__lk), flags, self.__name.encode(), len(self.__name), 0, bastarg, bast, None) if rc: raise DLMError(rc) self.__last_mode = mode self.__last_flags = flags self.__local_locked = True def unlock_wait(self, flags=0): rc = c_dlm_ls_unlock_wait(self.__c_ls, self.__lk.sb_lkid, flags, ctypes.byref(self.__lk)) if rc: raise DLMError(rc) self.__last_flags = flags self.__local_locked = False lock = Lock(self, self.__ls, name) return lock # vim: tabstop=8 softtabstop=8 shiftwidth=8 noexpandtab dlm-4.3.0/python/ebpf/000077500000000000000000000000001461150666200145365ustar00rootroot00000000000000dlm-4.3.0/python/ebpf/README000066400000000000000000000004501461150666200154150ustar00rootroot00000000000000In this directory are some ebpf tracing examples. They require the bcc toolchain (usually packaged as bcc) and the Python bcc module (usually packaged as python3-bcc). The current kernel headers also need to be available, or CONFIG_IKHEADERS must be enabled in your kernel configuration. dlm-4.3.0/python/ebpf/dlmhist.py000077500000000000000000000031551461150666200165630ustar00rootroot00000000000000#!/usr/bin/python # # This example shows how to capture the latency between a dlm_lock() kernel # call for DLM_LOCK_EX requests with flag DLM_LKF_NOQUEUE and the AST # response. # # You will probably see two peaks: one for the case where the current # node is the lock master, and another one which requires network # communication. There is currently no way to filter them apart, so the # second peak is interesting to get an idea of how long it takes to # call dlm_lock() and get a response back.
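#
# A rough way to try it (assuming the bcc prerequisites from ../README are
# installed): run this script as root while something generates EX/NOQUEUE
# dlm_lock() traffic, e.g. the tests in ../tests, then hit Ctrl-C to print
# the log2 latency histogram.
#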
from bcc import BPF import threading b = BPF(text=""" #include <uapi/linux/ptrace.h> #include <uapi/linux/dlm.h> BPF_HASH(start, u32); BPF_HISTOGRAM(dist, u64); #define DLM_HASH(args) (args->ls_id ^ args->lkb_id) TRACEPOINT_PROBE(dlm, dlm_lock_start) { u64 ts = bpf_ktime_get_ns(); u32 hash = DLM_HASH(args); if (args->flags & DLM_LKF_NOQUEUE && args->mode == DLM_LOCK_EX) start.update(&hash, &ts); return 0; } TRACEPOINT_PROBE(dlm, dlm_lock_end) { u32 hash = DLM_HASH(args); if (args->error != 0) start.delete(&hash); return 0; } TRACEPOINT_PROBE(dlm, dlm_ast) { u32 hash = DLM_HASH(args); u64 *tsp, delta; tsp = start.lookup(&hash); if (tsp != 0) { start.delete(&hash); delta = bpf_ktime_get_ns() - *tsp; if (args->sb_status != 0) return 0; dist.increment(bpf_log2l(delta)); } return 0; } """) print("Tracing... Hit Ctrl-C anytime to end.") forever = threading.Event() try: forever.wait() except KeyboardInterrupt: print() print("log2 histogram") print("--------------") b["dist"].print_log2_hist("ns") dlm-4.3.0/python/tests/000077500000000000000000000000001461150666200147645ustar00rootroot00000000000000dlm-4.3.0/python/tests/dlm.py000077700000000000000000000000001461150666200212432../bindings/dlm.pyustar00rootroot00000000000000dlm-4.3.0/python/tests/recovery_interrupt000077500000000000000000000027761461150666200207000ustar00rootroot00000000000000#!/usr/bin/env python3 from signal import signal, SIGINT from dlm import Lockspace import argparse import logging import time parser = argparse.ArgumentParser() parser.add_argument('-l', '--lock', action='store_true', help='do lock activity between ls start/stop') parser.add_argument('-w', '--wait', help='wait time for contention', type=int, default=1) parser.add_argument("-d", "--debug", default="info", help=("logging debug level"), choices=["debug", "info", "warning"]) args = parser.parse_args() debug_levels = {"debug": logging.DEBUG, "info": logging.INFO, "warning": logging.WARNING} debug_level = debug_levels[args.debug.lower()] logging.basicConfig(level=debug_level, format="%(asctime)s:%(levelname)s: %(message)s") def handler(signal, frame): global end end = True signal(SIGINT, handler) end = False while not end: ls = Lockspace() lsname = ls.get_name() logging.info("lockspace {} created".format(lsname)) if args.lock: lock = ls.create_lock("fooobaar") lockname = lock.get_name() logging.info("lock {} created".format(lockname)) lock.lock_wait() logging.info("lock {} lock()".format(lockname)) #contention logging.info("lock {} wait for {} secs".format(lockname, args.wait)) time.sleep(args.wait) lock.unlock_wait() logging.info("lock {} unlock()".format(lockname)) del lock logging.info("lock {} removed".format(lockname)) del ls logging.info("lockspace {} removed".format(lsname)) # vim: tabstop=8 softtabstop=8 shiftwidth=8 noexpandtab
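# Example invocation (hypothetical): stress lockspace create/release cycles
# on one or more cluster nodes, with lock contention and a 5 second hold:
#   ./recovery_interrupt --lock --wait 5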