sanlock-2.2/0000755000175100017510000000000011751766670012023 5ustar weberwebersanlock-2.2/wdmd/0000755000175100017510000000000011751766670012756 5ustar weberwebersanlock-2.2/wdmd/wdmd.80000644000175100017510000000444111751766670014005 0ustar weberweber.TH WDMD 8 2011-08-01 .SH NAME wdmd \- watchdog multiplexing daemon .SH SYNOPSIS .B wdmd [OPTIONS] .SH DESCRIPTION This daemon opens /dev/watchdog and allows multiple independent sources to detmermine whether each KEEPALIVE is done. Every test interval (10 seconds), the daemon tests each source. If any test fails, the KEEPALIVE is not done. In a standard configuration, the watchdog timer will reset the system if no KEEPALIVE is done for 60 seconds ("fire timeout"). This means that if single test fails 5-6 times in row, the watchdog will fire and reset the system. With multiple test sources, fewer separate failures back to back can also cause a reset, e.g. T seconds, P pass, F fail .br T00: test1 P, test2 P, test3 P: KEEPALIVE done .br T10: test1 F, test2 F, test3 P: KEEPALIVE skipped .br T20: test1 F, test2 P, test3 P: KEEPALIVE skipped .br T30: test1 P, test2 F, test3 P: KEEPALIVE skipped .br T40: test1 P, test2 P, test3 F: KEEPALIVE skipped .br T50: test1 F, test2 F, test3 P: KEEPALIVE skipped .br T60: test1 P, test2 F, test3 P: KEEPALIVE skipped .br T60: watchdog fires, system resets (Depending on timings, the system may be reset sometime shortly before T60, and the tests at T60 would not be run.) A crucial aspect to the design and function of wdmd is that if any single source does not pass tests for the fire timeout, the watchdog is guaranteed to fire, regardless of whether other sources on the system have passed or failed. A spurious reset due to the combined effects of multiple failing tests as shown above, is an accepted side effect. wdmd will exit if a watchdog driver is not loaded. wdmd cannot be used on the system with any other program that needs to open /dev/watchdog, e.g. watchdog(8). 
.SS Test Source: clients Using libwdmd, programs connect to wdmd via a unix socket, and send regular messages to wdmd to update an expiry time for their connection. Every test interval, wdmd will check if the expiry time for a connection has been reached. If so, the test for that client fails. (Other test sources such as scripts executed each test interval may be added in the future.) .SH OPTIONS .TP .B \-D Enable debugging to stderr and don't fork. .TP .BI \-H " num" Enable (1) or disable (0) high priority features such as realtime scheduling priority and mlockall. .br Default 1. sanlock-2.2/wdmd/wdmd_sock.h0000644000175100017510000000147611751766670015111 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifndef __WDMD_SOCK_H__ #define __WDMD_SOCK_H__ #define WDMD_RUN_DIR "/var/run/wdmd" #define WDMD_SOCKET_NAME "wdmd.sock" enum { CMD_REGISTER = 1, CMD_REFCOUNT_SET, CMD_REFCOUNT_CLEAR, CMD_TEST_LIVE, CMD_STATUS, }; struct wdmd_header { uint32_t magic; uint32_t cmd; uint32_t len; uint32_t flags; uint32_t test_interval; uint32_t fire_timeout; uint64_t last_keepalive; uint64_t renewal_time; uint64_t expire_time; char name[WDMD_NAME_SIZE]; }; int wdmd_socket_address(struct sockaddr_un *addr); #endif sanlock-2.2/wdmd/wdmd_client.c0000644000175100017510000000277411751766670015425 0ustar weberweber/* * Copyright 2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include "wdmd.h" int main(int argc, char *argv[]) { char name[WDMD_NAME_SIZE]; uint64_t t, last_keepalive; int test_interval, fire_timeout; int con, rv; int i = 0; int iter = 10; if (argc > 1) iter = atoi(argv[1]); memset(name, 0, sizeof(name)); sprintf(name, "%s", "wdmd_client"); con = wdmd_connect(); printf("wdmd_connect %d\n", con); if (con < 0) return con; rv = wdmd_register(con, name); printf("wdmd_register %d\n", rv); if (rv < 0) return rv; rv = wdmd_status(con, &test_interval, &fire_timeout, &last_keepalive); printf("wdmd_status %d test_interval %d fire_timeout %d last_keepalive %llu\n", rv, test_interval, fire_timeout, (unsigned long long)last_keepalive); if (rv < 0) return rv; while (1) { sleep(10); t = time(NULL); rv = wdmd_test_live(con, t, t + 40); printf("wdmd_test_live %d %llu %llu\n", rv, (unsigned long long)t, (unsigned long long)(t + 40)); if (i++ > iter) break; } rv = wdmd_test_live(con, t, 0); printf("wdmd_test_live 0 %d\n", rv); return 0; } sanlock-2.2/wdmd/Makefile0000644000175100017510000000437011751766670014422 0ustar weberweber# Copyright (C) 2011 Red Hat, Inc. All rights reserved. # # This copyrighted material is made available to anyone wishing to use, # modify, copy, or redistribute it subject to the terms and conditions # of the GNU General Public License v2 or (at your option) any later version. 
CMD_TARGET = wdmd LIB_TARGET = libwdmd HEADER_TARGET = wdmd.h MAN_TARGET = wdmd.8 TEST_TARGET = wdmd_client SOMAJOR=1 SOMINOR=0 SHLIB_TARGET = $(LIB_TARGET).so.$(SOMAJOR).$(SOMINOR) CMD_SOURCE = main.c wdmd_sock.c LIB_SOURCE = client.c wdmd_sock.c TEST_SOURCE = wdmd_client.c CFLAGS += -D_GNU_SOURCE -g \ -Wall \ -Wformat \ -Wformat-security \ -Wmissing-prototypes \ -Wnested-externs \ -Wpointer-arith \ -Wextra -Wshadow \ -Wcast-align \ -Wwrite-strings \ -Waggregate-return \ -Wstrict-prototypes \ -Winline \ -Wredundant-decls \ -Wno-sign-compare \ -Wp,-D_FORTIFY_SOURCE=2 \ -fexceptions \ -fasynchronous-unwind-tables \ -fdiagnostics-show-option \ -fPIE -DPIE CMD_LDFLAGS += -Wl,-z,now -Wl,-z,relro -pie CMD_LDADD += -lwdmd -lrt LIB_LDFLAGS += -Wl,-z,relro -pie TEST_LDFLAGS = -lwdmd all: $(SHLIB_TARGET) $(CMD_TARGET) $(TEST_TARGET) $(SHLIB_TARGET): $(LIB_SOURCE) $(CC) $(CFLAGS) $(LIB_LDFLAGS) -shared -fPIC -o $@ -Wl,-soname=$(LIB_TARGET).so.$(SOMAJOR) $^ ln -sf $(SHLIB_TARGET) $(LIB_TARGET).so ln -sf $(SHLIB_TARGET) $(LIB_TARGET).so.$(SOMAJOR) $(CMD_TARGET): $(SHLIB_TARGET) $(CMD_SOURCE) $(CC) $(CFLAGS) $(CMD_LDFLAGS) $(CMD_SOURCE) $(CMD_LDADD) -o $@ -L. $(TEST_TARGET): $(SHLIB_TARGET) $(TEST_SOURCE) $(CC) $(CFLAGS) $(TEST_LDFLAGS) $(TEST_SOURCE) $(CMD_LDADD) -o $@ -L. 
clean: rm -f *.o *.so *.so.* $(CMD_TARGET) $(TEST_TARGET) INSTALL=$(shell which install) DESTDIR= BINDIR=/usr/sbin LIBDIR=/usr/lib64 HEADIR=/usr/include MANDIR=/usr/share/man .PHONY: install install: all $(INSTALL) -d $(DESTDIR)/$(BINDIR) $(INSTALL) -d $(DESTDIR)/$(LIBDIR) $(INSTALL) -d $(DESTDIR)/$(HEADIR) $(INSTALL) -d $(DESTDIR)/$(MANDIR)/man8 $(INSTALL) -c -m 755 $(CMD_TARGET) $(DESTDIR)/$(BINDIR) $(INSTALL) -c -m 755 $(SHLIB_TARGET) $(DESTDIR)/$(LIBDIR) cp -a $(LIB_TARGET).so $(DESTDIR)/$(LIBDIR) cp -a $(LIB_TARGET).so.$(SOMAJOR) $(DESTDIR)/$(LIBDIR) $(INSTALL) -c -m 644 $(HEADER_TARGET) $(DESTDIR)/$(HEADIR) $(INSTALL) -m 644 $(MAN_TARGET) $(DESTDIR)/$(MANDIR)/man8 sanlock-2.2/wdmd/wdmd_sock.c0000644000175100017510000000134211751766670015074 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #include #include #include #include #include #include #include #include "wdmd.h" #include "wdmd_sock.h" int wdmd_socket_address(struct sockaddr_un *addr) { memset(addr, 0, sizeof(struct sockaddr_un)); addr->sun_family = AF_LOCAL; snprintf(addr->sun_path, sizeof(addr->sun_path) - 1, "%s/%s", WDMD_RUN_DIR, WDMD_SOCKET_NAME); return 0; } sanlock-2.2/wdmd/client.c0000644000175100017510000000447611751766670014413 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include "wdmd.h" #include "wdmd_sock.h" int wdmd_connect(void) { int rv, s; struct sockaddr_un addr; s = socket(AF_LOCAL, SOCK_STREAM, 0); if (s < 0) return -errno; rv = wdmd_socket_address(&addr); if (rv < 0) return rv; rv = connect(s, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); if (rv < 0) { rv = -errno; close(s); return rv; } return s; } int wdmd_register(int con, char *name) { struct wdmd_header h; int rv; if (strlen(name) > WDMD_NAME_SIZE) return -ENAMETOOLONG; memset(&h, 0, sizeof(h)); h.cmd = CMD_REGISTER; strncpy(h.name, name, WDMD_NAME_SIZE); rv = send(con, (void *)&h, sizeof(struct wdmd_header), 0); if (rv < 0) return -errno; return 0; } static int send_header(int con, int cmd) { struct wdmd_header h; int rv; memset(&h, 0, sizeof(h)); h.cmd = cmd; rv = send(con, (void *)&h, sizeof(struct wdmd_header), 0); if (rv < 0) return -errno; return 0; } int wdmd_refcount_set(int con) { return send_header(con, CMD_REFCOUNT_SET); } int wdmd_refcount_clear(int con) { return send_header(con, CMD_REFCOUNT_CLEAR); } int wdmd_test_live(int con, uint64_t renewal_time, uint64_t expire_time) { struct wdmd_header h; int rv; memset(&h, 0, sizeof(h)); h.cmd = CMD_TEST_LIVE; h.renewal_time = renewal_time; h.expire_time = expire_time; rv = send(con, (void *)&h, sizeof(struct wdmd_header), 0); if (rv < 0) return -errno; return 0; } int wdmd_status(int con, int *test_interval, int *fire_timeout, uint64_t *last_keepalive) { struct wdmd_header h; int rv; rv = send_header(con, CMD_STATUS); if (rv < 0) return rv; rv = recv(con, &h, sizeof(h), MSG_WAITALL); if (rv < 0) return -errno; *test_interval = h.test_interval; *fire_timeout = h.fire_timeout; *last_keepalive = h.last_keepalive; return 0; } sanlock-2.2/wdmd/main.c0000644000175100017510000004727211751766670014062 0ustar weberweber/* * Copyright 2011 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "wdmd.h" #include "wdmd_sock.h" #ifndef GNUC_UNUSED #define GNUC_UNUSED __attribute__((__unused__)) #endif #define RELEASE_VERSION "2.2" #define DEFAULT_TEST_INTERVAL 10 #define DEFAULT_FIRE_TIMEOUT 60 #define DEFAULT_HIGH_PRIORITY 1 #define DEFAULT_SOCKET_GID 0 #define DEFAULT_SOCKET_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) static int test_interval = DEFAULT_TEST_INTERVAL; static int fire_timeout = DEFAULT_FIRE_TIMEOUT; static int high_priority = DEFAULT_HIGH_PRIORITY; static int daemon_quit; static int daemon_debug; static int socket_gid; static time_t last_keepalive; static char lockfile_path[PATH_MAX]; static int dev_fd; struct script_status { int pid; char path[PATH_MAX]; }; /* The relationship between SCRIPT_WAIT_SECONDS/MAX_SCRIPTS/test_interval is not very sophisticated, but it's simple. If we wait up to 2 seconds for each script to exit, and have 5 scripts, that's up to 10 seconds we spend in test_scripts, and it's simplest if the max time in test_scripts does not excede the test_interval (10). */ #define SCRIPT_WAIT_SECONDS 2 #define MAX_SCRIPTS 4 struct script_status scripts[MAX_SCRIPTS]; struct client { int used; int fd; int pid; int pid_dead; int refcount; uint64_t renewal; uint64_t expire; void *workfn; void *deadfn; char name[WDMD_NAME_SIZE]; }; #define CLIENT_NALLOC 16 static int client_maxi; static int client_size = 0; static struct client *client = NULL; static struct pollfd *pollfd = NULL; const char *client_built = " client"; #define log_debug(fmt, args...) 
\ do { \ if (daemon_debug) \ fprintf(stderr, "%llu " fmt "\n", (unsigned long long)time(NULL), ##args); \ } while (0) #define log_error(fmt, args...) \ do { \ log_debug(fmt, ##args); \ syslog(LOG_ERR, fmt, ##args); \ } while (0) static uint64_t monotime(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec; } /* * test clients */ static void client_alloc(void) { int i; if (!client) { client = malloc(CLIENT_NALLOC * sizeof(struct client)); pollfd = malloc(CLIENT_NALLOC * sizeof(struct pollfd)); } else { client = realloc(client, (client_size + CLIENT_NALLOC) * sizeof(struct client)); pollfd = realloc(pollfd, (client_size + CLIENT_NALLOC) * sizeof(struct pollfd)); if (!pollfd) log_error("can't alloc for pollfd"); } if (!client || !pollfd) log_error("can't alloc for client array"); for (i = client_size; i < client_size + CLIENT_NALLOC; i++) { memset(&client[i], 0, sizeof(struct client)); client[i].fd = -1; pollfd[i].fd = -1; pollfd[i].revents = 0; } client_size += CLIENT_NALLOC; } static int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci)) { int i; if (!client) client_alloc(); again: for (i = 0; i < client_size; i++) { if (!client[i].used) { client[i].used = 1; client[i].workfn = workfn; client[i].deadfn = deadfn; client[i].fd = fd; pollfd[i].fd = fd; pollfd[i].events = POLLIN; if (i > client_maxi) client_maxi = i; return i; } } client_alloc(); goto again; } static void client_pid_dead(int ci) { if (!client[ci].expire) { log_debug("client_pid_dead ci %d", ci); close(client[ci].fd); client[ci].used = 0; memset(&client[ci], 0, sizeof(struct client)); client[ci].fd = -1; pollfd[ci].fd = -1; pollfd[ci].events = 0; } else { /* test_clients() needs to continue watching this ci so it can expire */ log_debug("client_pid_dead ci %d expire %llu", ci, (unsigned long long)client[ci].expire); close(client[ci].fd); client[ci].pid_dead = 1; client[ci].refcount = 0; client[ci].fd = -1; pollfd[ci].fd = -1; pollfd[ci].events = 0; } } static 
int get_peer_pid(int fd, int *pid) { struct ucred cred; unsigned int cl = sizeof(cred); if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &cl) != 0) return -1; *pid = cred.pid; return 0; } static void process_connection(int ci) { struct wdmd_header h; struct wdmd_header h_ret; void (*deadfn)(int ci); int rv, pid; memset(&h, 0, sizeof(h)); rv = recv(client[ci].fd, &h, sizeof(h), MSG_WAITALL); if (!rv) return; if (rv < 0) { log_error("ci %d recv error %d", ci, errno); goto dead; } if (rv != sizeof(h)) { log_error("ci %d recv size %d", ci, rv); goto dead; } switch(h.cmd) { case CMD_REGISTER: /* TODO: allow client to reconnect, search clients for h.name and copy the renewal and expire times, then clear the old client entry */ rv = get_peer_pid(client[ci].fd, &pid); if (rv < 0) goto dead; client[ci].pid = pid; memcpy(client[ci].name, h.name, WDMD_NAME_SIZE); log_debug("register ci %d fd %d pid %d %s", ci, client[ci].fd, pid, client[ci].name); break; case CMD_REFCOUNT_SET: client[ci].refcount = 1; break; case CMD_REFCOUNT_CLEAR: client[ci].refcount = 0; break; case CMD_TEST_LIVE: client[ci].renewal = h.renewal_time; client[ci].expire = h.expire_time; log_debug("test_live ci %d renewal %llu expire %llu", ci, (unsigned long long)client[ci].renewal, (unsigned long long)client[ci].expire); break; case CMD_STATUS: memcpy(&h_ret, &h, sizeof(h)); h_ret.test_interval = test_interval; h_ret.fire_timeout = fire_timeout; h_ret.last_keepalive = last_keepalive; send(client[ci].fd, &h_ret, sizeof(h_ret), MSG_NOSIGNAL); break; }; return; dead: deadfn = client[ci].deadfn; if (deadfn) deadfn(ci); } static void process_listener(int ci) { int fd; int on = 1; fd = accept(client[ci].fd, NULL, NULL); if (fd < 0) return; setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on)); client_add(fd, process_connection, client_pid_dead); } static void close_clients(void) { } static int setup_listener_socket(int *listener_socket) { int rv, s; struct sockaddr_un addr; s = socket(AF_LOCAL, SOCK_STREAM, 
0); if (s < 0) return -errno; rv = wdmd_socket_address(&addr); if (rv < 0) return rv; unlink(addr.sun_path); rv = bind(s, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); if (rv < 0) { rv = -errno; close(s); return rv; } rv = listen(s, 5); if (rv < 0) { rv = -errno; close(s); return rv; } rv = chmod(addr.sun_path, DEFAULT_SOCKET_MODE); if (rv < 0) { rv = -errno; close(s); return rv; } rv = chown(addr.sun_path, -1, socket_gid); if (rv < 0) { rv = -errno; close(s); return rv; } fcntl(s, F_SETFL, fcntl(s, F_GETFL, 0) | O_NONBLOCK); *listener_socket = s; return 0; } static int setup_clients(void) { int rv, fd = -1, ci; rv = setup_listener_socket(&fd); if (rv < 0) return rv; ci = client_add(fd, process_listener, client_pid_dead); return 0; } static int test_clients(void) { uint64_t t; int fail_count = 0; int i; t = monotime(); for (i = 0; i < client_size; i++) { if (!client[i].used) continue; if (!client[i].expire) continue; if (t >= client[i].expire) { log_error("test failed pid %d renewal %llu expire %llu", client[i].pid, (unsigned long long)client[i].renewal, (unsigned long long)client[i].expire); fail_count++; } } return fail_count; } static int active_clients(void) { int i; for (i = 0; i < client_size; i++) { if (client[i].refcount) return 1; } return 0; } #ifdef TEST_FILES #define FILES_DIR "/var/run/wdmd/test_files" const char *files_built = " files"; static DIR *files_dir; static void close_files(void) { closedir(files_dir); } static int setup_files(void) { mode_t old_umask; int rv; old_umask = umask(0022); rv = mkdir(FILES_DIR, 0777); if (rv < 0 && errno != EEXIST) goto out; files_dir = opendir(FILES_DIR); if (!files_dir) rv = -errno; else rv = 0; out: umask(old_umask); return rv; } static int read_file(char *name, uint64_t *renewal, uint64_t *expire) { FILE *file; char path[PATH_MAX]; snprintf(path, PATH_MAX-1, "%s/%s", FILES_DIR, name); file = fopen(path, "r"); if (!file) return -1; fscanf(file, "renewal %llu expire %llu", renewal, expire); 
fclose(file); return 0; } static int test_files(void) { struct dirent *de; uint64_t t, renewal, expire; int fail_count = 0; int rv; while ((de = readdir(files_dir))) { if (de->d_name[0] == '.') continue; rv = read_file(de->d_name, &renewal, &expire); if (rv < 0) continue; t = monotime(); if (t >= expire) { log_error("test failed file %s renewal %llu expire %llu ", de->d_name, (unsigned long long)renewal, (unsigned long long)expire); fail_count++; } } return fail_count; } #else const char *files_built = NULL; static void close_files(void) { } static int setup_files(void) { return 0; } static int test_files(void) { return 0; } #endif /* TEST_FILES */ #ifdef TEST_SCRIPTS #define SCRIPTS_DIR "/etc/wdmd/test_scripts" static DIR *scripts_dir; const char *scripts_built = " scripts"; static void close_scripts(void) { closedir(scripts_dir); } static int setup_scripts(void) { mode_t old_umask; int rv; old_umask = umask(0022); rv = mkdir(SCRIPTS_DIR, 0777); if (rv < 0 && errno != EEXIST) goto out; scripts_dir = opendir(SCRIPTS_DIR); if (!scripts_dir) rv = -errno; else rv = 0; out: umask(old_umask); return rv; } static int run_script(char *name, int i) { int pid; if (i >= MAX_SCRIPTS) { log_error("max scripts %d, ignore %s", MAX_SCRIPTS, name); return -1; } snprintf(scripts[i].path, PATH_MAX-1, "%s/%s", SCRIPTS_DIR, name); pid = fork(); if (pid < 0) return -errno; if (pid) { log_debug("run_script %d %s", pid, name); scripts[i].pid = pid; return 0; } else { execlp(scripts[i].path, scripts[i].path, NULL); exit(EXIT_FAILURE); } } static int check_script(int i) { time_t begin; int status; int rv; if (!scripts[i].pid) return 0; begin = monotime(); while (1) { rv = waitpid(scripts[i].pid, &status, WNOHANG); if (rv < 0) { goto out; } else if (!rv) { /* pid still running */ if (monotime() - begin >= SCRIPT_WAIT_SECONDS) { rv = -ETIMEDOUT; goto out; } sleep(1); } else if (WIFEXITED(status)) { /* pid exited */ if (!WEXITSTATUS(status)) rv = 0; else rv = -1; goto out; } else { /* pid 
state changed but still running */ if (monotime() - begin >= 2) { rv = -ETIMEDOUT; goto out; } sleep(1); } } out: log_debug("check_script %d rv %d begin %llu", scripts[i].pid, rv, (unsigned long long)begin); scripts[i].pid = 0; return rv; } static int test_scripts(void) { struct dirent *de; int fail_count = 0; int run_count = 0; int i, rv; memset(scripts, 0, sizeof(scripts)); rewinddir(scripts_dir); while ((de = readdir(scripts_dir))) { if (de->d_name[0] == '.') continue; rv = run_script(de->d_name, run_count); if (!rv) run_count++; } for (i = 0; i < run_count; i++) { rv = check_script(i); if (rv < 0) { log_error("test failed script %s", scripts[i].path); fail_count++; } } return fail_count; } #else const char *scripts_built = NULL; static void close_scripts(void) { } static int setup_scripts(void) { return 0; } static int test_scripts(void) { return 0; } #endif /* TEST_SCRIPTS */ static void close_watchdog(void) { int rv; rv = write(dev_fd, "V", 1); if (rv < 0) log_error("/dev/watchdog disarm write error %d", errno); else log_error("/dev/watchdog disarmed"); close(dev_fd); } static int setup_watchdog(void) { int rv, timeout; dev_fd = open("/dev/watchdog", O_WRONLY | O_CLOEXEC); if (dev_fd < 0) { log_error("no /dev/watchdog, load a watchdog driver"); return dev_fd; } timeout = 0; rv = ioctl(dev_fd, WDIOC_GETTIMEOUT, &timeout); if (rv < 0) { log_error("/dev/watchdog failed to report timeout"); close_watchdog(); return -1; } if (timeout == fire_timeout) goto out; timeout = fire_timeout; rv = ioctl(dev_fd, WDIOC_SETTIMEOUT, &timeout); if (rv < 0) { log_error("/dev/watchdog failed to set timeout"); close_watchdog(); return -1; } if (timeout != fire_timeout) { log_error("/dev/watchdog failed to set new timeout"); close_watchdog(); return -1; } out: log_error("/dev/watchdog armed with fire_timeout %d", fire_timeout); return 0; } static void pet_watchdog(void) { int rv, unused; rv = ioctl(dev_fd, WDIOC_KEEPALIVE, &unused); last_keepalive = monotime(); log_debug("keepalive 
%d", rv); } static void process_signals(int ci) { struct signalfd_siginfo fdsi; ssize_t rv; int fd = client[ci].fd; rv = read(fd, &fdsi, sizeof(struct signalfd_siginfo)); if (rv != sizeof(struct signalfd_siginfo)) { return; } if (fdsi.ssi_signo == SIGTERM) { if (!active_clients()) daemon_quit = 1; } } static int setup_signals(void) { sigset_t mask; int fd, rv; sigemptyset(&mask); sigaddset(&mask, SIGTERM); rv = sigprocmask(SIG_BLOCK, &mask, NULL); if (rv < 0) return rv; fd = signalfd(-1, &mask, 0); if (fd < 0) return -errno; client_add(fd, process_signals, client_pid_dead); return 0; } static int test_loop(void) { void (*workfn) (int ci); void (*deadfn) (int ci); uint64_t test_time; int poll_timeout; int sleep_seconds; int fail_count; int rv, i; pet_watchdog(); test_time = 0; poll_timeout = test_interval * 1000; while (1) { rv = poll(pollfd, client_maxi + 1, poll_timeout); if (rv == -1 && errno == EINTR) continue; if (rv < 0) { /* not sure */ } for (i = 0; i <= client_maxi; i++) { if (client[i].fd < 0) continue; if (pollfd[i].revents & POLLIN) { workfn = client[i].workfn; if (workfn) workfn(i); } if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) { deadfn = client[i].deadfn; if (deadfn) deadfn(i); } } if (daemon_quit && !active_clients()) break; if (monotime() - test_time >= test_interval) { test_time = monotime(); log_debug("test_time %llu", (unsigned long long)test_time); fail_count = 0; fail_count += test_files(); fail_count += test_scripts(); fail_count += test_clients(); if (!fail_count) pet_watchdog(); } sleep_seconds = test_time + test_interval - monotime(); poll_timeout = (sleep_seconds > 0) ? 
sleep_seconds * 1000 : 1; log_debug("sleep_seconds %d", sleep_seconds); } return 0; } static int lockfile(void) { char buf[16]; struct flock lock; mode_t old_umask; int fd, rv; old_umask = umask(0022); rv = mkdir(WDMD_RUN_DIR, 0777); if (rv < 0 && errno != EEXIST) { umask(old_umask); return rv; } umask(old_umask); sprintf(lockfile_path, "%s/wdmd.pid", WDMD_RUN_DIR); fd = open(lockfile_path, O_CREAT|O_WRONLY|O_CLOEXEC, 0666); if (fd < 0) { log_error("lockfile open error %s: %s", lockfile_path, strerror(errno)); return -1; } lock.l_type = F_WRLCK; lock.l_start = 0; lock.l_whence = SEEK_SET; lock.l_len = 0; rv = fcntl(fd, F_SETLK, &lock); if (rv < 0) { log_error("lockfile setlk error %s: %s", lockfile_path, strerror(errno)); goto fail; } rv = ftruncate(fd, 0); if (rv < 0) { log_error("lockfile truncate error %s: %s", lockfile_path, strerror(errno)); goto fail; } memset(buf, 0, sizeof(buf)); snprintf(buf, sizeof(buf), "%d\n", getpid()); rv = write(fd, buf, strlen(buf)); if (rv <= 0) { log_error("lockfile write error %s: %s", lockfile_path, strerror(errno)); goto fail; } return fd; fail: close(fd); return -1; } static void setup_priority(void) { struct sched_param sched_param; int rv; if (!high_priority) return; rv = mlockall(MCL_CURRENT | MCL_FUTURE); if (rv < 0) { log_error("mlockall failed"); } rv = sched_get_priority_max(SCHED_RR); if (rv < 0) { log_error("could not get max scheduler priority err %d", errno); return; } sched_param.sched_priority = rv; rv = sched_setscheduler(0, SCHED_RR|SCHED_RESET_ON_FORK, &sched_param); if (rv < 0) { log_error("could not set RR|RESET_ON_FORK priority %d err %d", sched_param.sched_priority, errno); } } static int group_to_gid(char *arg) { struct group *gr; gr = getgrnam(arg); if (gr == NULL) { log_error("group '%s' not found, " "using uid: %i", arg, DEFAULT_SOCKET_GID); return DEFAULT_SOCKET_GID; } return gr->gr_gid; } static void print_usage_and_exit(int status) { printf("Usage:\n"); printf("wdmd [options]\n\n"); 
printf("--version, -V print version\n"); printf("--help, -h print usage\n"); printf("-D debug: no fork and print all logging to stderr\n"); printf("-H use high priority features (1 yes, 0 no, default %d)\n", DEFAULT_HIGH_PRIORITY); printf("-G group ownership for the socket\n"); exit(status); } static void print_version_and_exit(void) { printf("wdmd version %s tests_built%s%s%s\n", RELEASE_VERSION, scripts_built ? scripts_built : "", client_built ? client_built : "", files_built ? files_built : ""); exit(0); } /* If wdmd exits abnormally, /dev/watchdog will eventually fire, and clients can detect wdmd is gone and begin to shut down cleanly ahead of the reset. But what if wdmd is restarted before the wd fires? It will begin petting /dev/watchdog again, leaving the previous clients unprotected. I don't know if this situation is important enough to try to prevent. One way would be for wdmd to fail starting if it found a pid file left over from its previous run. */ int main(int argc, char *argv[]) { int rv; /* * TODO: * -c enable test clients (1 yes, 0 no, default ...) * -s enable test scripts (1 yes, 0 no, default ...) * -f enable test files (1 yes, 0 no, default ...) */ while (1) { int c; int option_index = 0; static struct option long_options[] = { {"help", no_argument, 0, 'h' }, {"version", no_argument, 0, 'V' }, {0, 0, 0, 0 } }; c = getopt_long(argc, argv, "hVDH:G:", long_options, &option_index); if (c == -1) break; switch (c) { case 'h': print_usage_and_exit(0); break; case 'V': print_version_and_exit(); break; case 'D': daemon_debug = 1; break; case 'G': socket_gid = group_to_gid(optarg); break; case 'H': high_priority = atoi(optarg); break; } } if (!daemon_debug) { if (daemon(0, 0) < 0) { fprintf(stderr, "cannot fork daemon\n"); exit(EXIT_FAILURE); } umask(0); } openlog("wdmd", LOG_CONS | LOG_PID, LOG_DAEMON); log_error("wdmd started tests_built%s%s%s\n", scripts_built ? scripts_built : "", client_built ? client_built : "", files_built ? 
files_built : ""); setup_priority(); rv = lockfile(); if (rv < 0) goto out; rv = setup_signals(); if (rv < 0) goto out_lockfile; rv = setup_scripts(); if (rv < 0) goto out_lockfile; rv = setup_files(); if (rv < 0) goto out_scripts; rv = setup_clients(); if (rv < 0) goto out_files; rv = setup_watchdog(); if (rv < 0) goto out_clients; rv = test_loop(); close_watchdog(); out_clients: close_clients(); out_files: close_files(); out_scripts: close_scripts(); out_lockfile: unlink(lockfile_path); out: return rv; } sanlock-2.2/wdmd/wdmd.h0000644000175100017510000000124711751766670014066 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifndef __WDMD_H__ #define __WDMD_H__ #define WDMD_NAME_SIZE 128 int wdmd_connect(void); int wdmd_register(int con, char *name); int wdmd_refcount_set(int con); int wdmd_refcount_clear(int con); int wdmd_test_live(int con, uint64_t renewal_time, uint64_t expire_time); int wdmd_status(int con, int *test_interval, int *fire_timeout, uint64_t *last_keepalive); #endif sanlock-2.2/tests/0000755000175100017510000000000011751766670013165 5ustar weberwebersanlock-2.2/tests/ruth.py0000755000175100017510000002706011751766670014531 0ustar weberweber#!/usr/bin/python # Copyright 2009 Red Hat, Inc. and/or its affiliates. # # Licensed to you under the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. See the files README and # LICENSE_GPL_v2 which accompany this distribution. 
# import sys from optparse import OptionParser import os from ConfigParser import ConfigParser import logging from copy import deepcopy import unittest as ut import confUtils from confUtils import Validate from testRunner import TestRunner #To use the same instance as everyone else import ruth LOG_LEVELS = {'d': logging.DEBUG, 'i': logging.INFO, 'w': logging.WARNING, 'e': logging.ERROR, 'c': logging.CRITICAL, 'debug': logging.DEBUG, 'info': logging.INFO, 'warning': logging.WARNING, 'error': logging.ERROR, 'critical': logging.CRITICAL} DEFAULT_CONFIG_PATH = "~/.ruthrc" USAGE = '''usage: %%prog [options] [conf1 [conf2 [...] ] ] Loads the configuration from '%s', unless other 'config' files are specified in command line. For more help read \"README.1st\".''' % (DEFAULT_CONFIG_PATH) CONFIG_TEMPLATE = { "global" : { "verbose" : {"default" : 1, "validator" : Validate.int}, "modules" : {"validator" : Validate.list}, } } def generateTemplateFromSuite(suite): """ Generate a config template from a test suit. """ templates = [] for testCase in suite: if hasattr(testCase, "getConfigTemplate"): templates.append(testCase.getConfigTemplate()) return confUtils.mergeTemplates(templates) def validateSuiteConfig(suite, cfg): """ Validate a config against a suite. Validates that all the test cases in the suite have all the options they need in the configuration file. To be used by most ruth modules. :returns: a touple of ``(result, message)``. """ masterTemplate = generateTemplateFromSuite(suite) cfg = _expandGlobalCfg(masterTemplate, cfg) return confUtils.validateConfigFile(masterTemplate, cfg) def _expandGlobalCfg(template, cfg): """ Distribute the options defined in 'global' to all the sections. :returns: a new config file with the global section distributed to all the sections defined in the template. 
""" #Work on a copy cfg = deepcopy(cfg) #Do we even need to work if not cfg.has_section("global"): return cfg for section in template: if not cfg.has_section(section): cfg.add_section(section) for option in template[section]: if not cfg.has_option("global", option): continue globalValue = cfg.get("global", option) cfg.set(section, option, globalValue) return cfg class RuthTestCase(ut.TestCase): mycfg = property(lambda self: self.cfg[self.__class__.__name__]) def _getConfig(self): """ Manages the configuration wrapper of a test case """ a = self.__class__ if not hasattr(self, "_confDict"): template = self.getConfigTemplate() expandedCfg = _expandGlobalCfg(template, self._cfg) confDict = confUtils.conf2dict(template, expandedCfg) setattr(self, "_confDict", confDict) return self._confDict def _getLog(self): if (not hasattr(self, "_log")) or self._log == None: self._log = logging.getLogger("test." + self.id()) return self._log def _setLog(self, value): self._log = value log = property(lambda self: self._getLog(), lambda self, value: self._setLog(value)) #Dynamic because the base class get the conf directly from Ruth. #If it were static everyone would have the basic classes wrapper. cfg = property(lambda self : self._getConfig()) @classmethod def getConfigTemplate(cls): """ Returns a config template that announces what the test case expects from the config file. .. note:: Should be overrided by subclasses. """ return {} @classmethod def setConfig(cls, cfg): cls._cfg = cfg def setUp(self): pass def tearDown(self): pass def parseArguments(): """ Prepares the options parser and parses the cmd line args. """ usage = USAGE parser = OptionParser(usage=usage) #Prepare generation configuration option parser.add_option("-g", "--generate-configuration", action="store", dest="moduleToGenerate", type="string", metavar="MODULE", default=None, help="Instead of running the suite. 
Creates a sample configuration from MODULE)") #prepare quiet option parser.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False, help="Should I bother you with unnecessary niceties. (Hello message and end quote).") #prepare verbose option parser.add_option("-v", action="count", dest="verbosity", default=0, help="Override configurations' verbose level.") #prepare filter option parser.add_option("-f", "--filter-tests", action="store", dest="filter", default="*", metavar="GLOB", help="Only tests that match this glob filter will run." + \ "Using '^' in the beginning of the glob means: Match opposite of GLOB.") #prepare debug option parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False, help="Should I print a lot of output in case of internal errors.") #prepare log option parser.add_option("-l", "--logging", action="store", dest="logLevel", default=None, metavar="LEVEL", help="Turn on test logging of the level LEVEL") #parse args options, args = parser.parse_args() if len(args) == 0: args = [DEFAULT_CONFIG_PATH] return (options, args) def generateSampleConfigFile(defaultModule, suite, targetFile): #Generate template template = generateTemplateFromSuite(suite) #Add default module if not "global" in template: template["global"] = {} globalSection = template["global"] if not "modules" in globalSection: globalSection["modules"] = {} template["global"]["modules"]["default"] = defaultModule #Write it all to disk confUtils.generateSampleConfigFile(template, targetFile) def handleSampleConfigFileGeneration(moduleToGenerate, targetFile): """ Takes care of sample config generation. :param moduleToGenerate: The name of the python module. Should be the same as in and :keyword:`import` statement. :param targetFile: The path to where to sample config file will be generated. :returns: **0** if successful, **400** on import error or **500** on config generation error. """ #Import module try: print "Importing module '%s'..." 
% (moduleToGenerate) suiteModule = __import__(moduleToGenerate) except Exception, ex: print "Could not import module '%s'. (%s)" % (moduleToGenerate, ex) return 400 #Get suite and generate config file try: print "Generating sample config file at '%s'..." % (targetFile) generateSampleConfigFile(moduleToGenerate, suiteModule.suite(), targetFile) except Exception, ex: print "Could not generate sample config file from module '%s'. (%s: %s)" % (moduleToGenerate, ex.__class__.__name__, ex) return 500 return 0 def _printHeader(header, marker="="): sep = marker * len(header) print sep print header print sep def runBatch(confFile, options): """ Run a batch test as stated in a config file. """ # Try to load config file mycfg = {} batchcfg = {} output = sys.stdout try: output.write("Validating configuration file '%s'.\n" % (os.path.split(confFile)[1])) output.flush() confUtils.validateConfigFile(CONFIG_TEMPLATE, confFile) output.write("Loading RUTH configuration.\n") batchcfg = ConfigParser() batchcfg.read(confFile) mycfg = confUtils.conf2dict(CONFIG_TEMPLATE, batchcfg) except Exception, ex: raise Exception("Could not load config file '%s'. Bailing out from batch. 
(%s: %s)" % (confFile, ex.__class__.__name__, ex)) #Get modules to test modules = mycfg["global"]["modules"] output.write("Running tests from modules: %s.\n" % (", ".join(modules))) output.flush() #test modules batch = {} for mod in modules[:]: #import module imported_module = __import__(mod) try: if hasattr(imported_module, "validateConfig"): imported_module.validateConfig(batchcfg) else: validateSuiteConfig(imported_module.suite(), batchcfg) batch[mod] = imported_module output.write("Module '%s' is READY\n" % (mod)) except Exception, ex: output.write("Module '%s' is NOT READY (%s: %s)\n" % (mod, ex.__class__.__name__, ex)) modules.remove(mod) #set configuration ruth.RuthTestCase.setConfig(batchcfg) results = [] #run tests for mod in batch: output.write("Exercising module '%s'\n" % mod) output.flush() suite = batch[mod].suite() verbose = mycfg["global"]["verbose"] if options.verbosity > 0: verbose = options.verbosity logging = True if options.logLevel is None: logging = False results.append(TestRunner(verbosity=verbose, stream=output, filter=options.filter, logging=logging).run(suite)) return results def main(): hello = """ Hello, nice to meet you. I am RUTH - "Regression and Unit Test Harness". I am going to run a comprehensive test suite in order to validate vdsm functionality. However, I may require some assistance from you in order to correctly bootstrap the whole procedure. Use --help to see what you can do with me. """ options, args = parseArguments() if options.logLevel is None: logging.basicConfig(filename='/dev/null') else: if not options.logLevel in LOG_LEVELS: print "Invalid logging level, possible values are %s." 
% ", ".join(options.keys()) return logging.basicConfig(filename='/dev/stdout', filemode='w+',level=LOG_LEVELS[options.logLevel], format="\t\t%(asctime)s %(levelname)-8s%(message)s", datefmt='%H:%M:%S') if not options.quiet: print hello if options.moduleToGenerate: return handleSampleConfigFileGeneration(options.moduleToGenerate, args[0]) #iterate config files and run their tests configFiles = args i = 0 results = [] isMultipleConfigMode = len(configFiles) > 1 for confFile in configFiles: i += 1 if isMultipleConfigMode: _printHeader("Processing batch %d of %d. Configuration is '%s'." % (i, len(configFiles), os.path.split(confFile)[1])) try: results.extend(runBatch(os.path.expanduser(confFile), options)) except Exception, ex: if options.debug: import traceback print traceback.format_exc() print ex if isMultipleConfigMode: totalFailures = sum([len(result.failures) for result in results]) totalErrors = sum([len(result.errors) for result in results]) _printHeader("Totals: Failures %d, Errors %d." % (totalFailures, totalErrors)) if not options.quiet: print 'All Done!\nremember:\n\t"To Err is Human, To Test is Divine!"' if __name__ == '__main__': main() sanlock-2.2/tests/syncManagerTests.py0000644000175100017510000001577411751766670017047 0ustar weberweberimport unittest as ut import time from ruth import RuthTestCase import syncManager from syncManager import SyncManager from confUtils import Validate from testUtils import LeaderRecord, readState, nullTerminated, leasesValidator, getResources from testUtils import Dummy DEFAULT_NUMBER_OF_HOSTS = 10 MAXIMUM_NUMBER_OF_HOSTS = 10 #2000 DEFAULT_NAME = "RUTH" DEFAULT_LEASES = "::[:], ..." 
LEASES_CONFIG_DEFINITION = {"validator": leasesValidator, "default" : DEFAULT_LEASES} SYNCMANAGER_PATH="../sync_manager" syncManager.SYNCMANAGER_PATH = SYNCMANAGER_PATH class DriveInitialization(RuthTestCase): @classmethod def getConfigTemplate(cls): return { cls.__name__ : { "Leases" : LEASES_CONFIG_DEFINITION, "NumberOfHosts" : {"validator" : Validate.int, "default" : DEFAULT_NUMBER_OF_HOSTS} } } def test(self): mgr = SyncManager(DEFAULT_NAME) leases = self.mycfg["Leases"] mgr.initStorage(leases, self.mycfg["NumberOfHosts"], MAXIMUM_NUMBER_OF_HOSTS) for lease, drives in leases: for drive, offset in drives: with open(drive, "rb") as f: f.seek(offset) leader, blocks = readState(f, MAXIMUM_NUMBER_OF_HOSTS) self.assertEquals(nullTerminated(leader.resourceID), lease) self.assertEquals(leader.numHosts, self.mycfg["NumberOfHosts"]) self.assertEquals(leader.maxHosts, MAXIMUM_NUMBER_OF_HOSTS) for block in blocks: self.assertEquals(block.bal, 0) self.assertEquals(block.mbal, 0) self.assertEquals(block.inp, 0) self.assertEquals(block.lver, 0) class InitPerformanceTest(RuthTestCase): @classmethod def getConfigTemplate(cls): return { cls.__name__ : { "AcceptableTimeSpan" : {"validator" : Validate.float, "default" : 60.0}, "Leases" : LEASES_CONFIG_DEFINITION, "NumberOfHosts" : {"validator" : Validate.int, "default" : DEFAULT_NUMBER_OF_HOSTS} } } def test(self): mgr = SyncManager(DEFAULT_NAME) start = time.time() mgr.initStorage(self.mycfg["Leases"], self.mycfg["NumberOfHosts"], MAXIMUM_NUMBER_OF_HOSTS) end = time.time() self.assertTrue((end - start) <= self.mycfg["AcceptableTimeSpan"]) class AcquireLease(RuthTestCase): @classmethod def getConfigTemplate(cls): return { cls.__name__ : { "Leases" : LEASES_CONFIG_DEFINITION, "NumberOfHosts" : {"validator" : Validate.int, "default" : DEFAULT_NUMBER_OF_HOSTS} } } def setUp(self): self.mgr = SyncManager(DEFAULT_NAME) self.log.debug("Initializing disks") self.mgr.initStorage(self.mycfg["Leases"], self.mycfg["NumberOfHosts"], 
MAXIMUM_NUMBER_OF_HOSTS) self.log.debug("Starting Dummy Process") self.dummy = Dummy(DEFAULT_NAME, 1) def testGood(self): self.log.debug("Acquiring leases") self.mgr.acquireLeases(self.mycfg["Leases"]) self.mgr.releaseLeases(getResources(self.mycfg["Leases"])) def testWithBadDrive(self): self.log.debug("Acquiring leases") # Adding fake lease leases = list(self.mycfg["Leases"]) + [("Sense-Sphere", [("./disk.fake", 0)])] self.assertRaises(Exception, self.mgr.acquireLeases, leases); def tearDown(self): self.dummy.stop() class ReleaseLease(RuthTestCase): @classmethod def getConfigTemplate(cls): return { cls.__name__ : { "Leases" : LEASES_CONFIG_DEFINITION, "NumberOfHosts" : {"validator" : Validate.int, "default" : DEFAULT_NUMBER_OF_HOSTS} } } def setUp(self): self.mgr = SyncManager(DEFAULT_NAME) self.log.debug("Initializing disks") self.mgr.initStorage(self.mycfg["Leases"], self.mycfg["NumberOfHosts"], MAXIMUM_NUMBER_OF_HOSTS) self.log.debug("Starting Dummy Process") self.dummy = Dummy(DEFAULT_NAME, 1) self.log.debug("Acquiring leases") self.mgr.acquireLeases(self.mycfg["Leases"]) def testGood(self): self.mgr.releaseLeases(getResources(self.mycfg["Leases"])) def testUnacquired(self): resources = getResources(self.mycfg["Leases"]) self.assertRaises(Exception, self.mgr.releaseLeases, resources + ["Sense-Sphere"]) self.mgr.releaseLeases(resources) def tearDown(self): self.dummy.stop() class InitialLeasesTests(RuthTestCase): @classmethod def getConfigTemplate(cls): return { cls.__name__ : { "Leases" : LEASES_CONFIG_DEFINITION, "NumberOfHosts" : {"validator" : Validate.int, "default" : DEFAULT_NUMBER_OF_HOSTS} } } def setUp(self): self.mgr = SyncManager(DEFAULT_NAME) self.log.debug("Initializing disks") self.mgr.initStorage(self.mycfg["Leases"], self.mycfg["NumberOfHosts"], MAXIMUM_NUMBER_OF_HOSTS) def acquireInitialLeases(self): self.dummy = Dummy(DEFAULT_NAME, 1, self.mycfg["Leases"]) self.mgr.releaseLeases(getResources(self.mycfg["Leases"])) def 
acquireInitialLeasesWithoutHostID(self): try: self.dummy = Dummy(DEFAULT_NAME, -1, self.mycfg["Leases"]) except: return self.fail("Managed to start sync_manager daemon without a host ID") def acquireLeasesFromDaemonizedSyncManagerWithoutSettingHostID(self): self.dummy = Dummy(DEFAULT_NAME) self.assertRaises(Exception, self.mgr.acquireLeases, self.mycfg["Leases"]) def acquireLeasesFromDaemonizedSyncManagerAfterSettingHostID(self): self.dummy = Dummy(DEFAULT_NAME) self.mgr.setHostID(1); self.mgr.acquireLeases(self.mycfg["Leases"]) def resetHostID(self): self.dummy = Dummy(DEFAULT_NAME) self.mgr.setHostID(1); self.assertRaises(Exception, self.mgr.setHostID, 2); self.mgr.acquireLeases(self.mycfg["Leases"]) def tearDown(self): if hasattr(self, "dummy"): self.dummy.stop() def suite(): tests = { DriveInitialization : ["test"], InitPerformanceTest : ["test"], AcquireLease : ["testGood", "testWithBadDrive"], ReleaseLease : ["testGood", "testUnacquired"], InitialLeasesTests : ["acquireInitialLeases", "acquireInitialLeasesWithoutHostID", "acquireLeasesFromDaemonizedSyncManagerWithoutSettingHostID", "acquireLeasesFromDaemonizedSyncManagerAfterSettingHostID", "resetHostID"] } resSuite = ut.TestSuite() for testcase, methods in tests.iteritems(): resSuite.addTests(map(testcase, methods)) return resSuite sanlock-2.2/tests/sanlk_string.c0000644000175100017510000000634011751766670016032 0ustar weberweber#include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_resource.h" void print_res(struct sanlk_resource *res) { int i; printf("struct fields: \"%s\" \"%s\"", res->lockspace_name, res->name); for (i = 0; i < res->num_disks; i++) { printf(" \"%s\" %llu", res->disks[i].path, (unsigned long long)res->disks[i].offset); } printf(" flags %x", res->flags); printf(" lver %llu\n", (unsigned long long)res->lver); } int main(int argc, char *argv[]) { struct sanlk_lockspace ls; struct sanlk_resource *res; struct sanlk_resource **res_args = NULL; char 
*state; int res_count; int rv, i; if (argc < 2) { printf("%s RESOURCE RESOURCE ...\n", argv[0]); printf("%s -s LOCKSPACE\n", argv[0]); return 0; } if (!strcmp(argv[1], "-s")) { memset(&ls, 0, sizeof(ls)); rv = sanlock_str_to_lockspace(argv[2], &ls); printf("struct fields: \"%s\" %llu %u \"%s\" %llu\n", ls.name, (unsigned long long)ls.host_id, ls.flags, ls.host_id_disk.path, (unsigned long long)ls.host_id_disk.offset); return rv; } state = malloc(1024 * 1024); memset(state, 0, 1024 * 1024); printf("\n"); printf("sanlock_str_to_res for each argv\n", rv); printf("--------------------------------------------------------------------------------\n"); for (i = 1; i < argc; i++) { rv = sanlock_str_to_res(argv[i], &res); print_res(res); free(res); res = NULL; if (i > 1) strcat(state, " "); strcat(state, argv[i]); } printf("\n"); printf("combined argv input for state_to_args\n"); printf("--------------------------------------------------------------------------------\n"); printf("\"%s\"\n", state); rv = sanlock_state_to_args(state, &res_count, &res_args); printf("\n"); printf("sanlock_state_to_args %d res_count %d\n", rv, res_count); printf("--------------------------------------------------------------------------------\n"); for (i = 0; i < res_count; i++) { res = res_args[i]; print_res(res); } free(state); state = NULL; rv = sanlock_args_to_state(res_count, res_args, &state); printf("\n"); printf("sanlock_args_to_state %d\n", rv); printf("--------------------------------------------------------------------------------\n"); printf("\"%s\"\n", state); return 0; } #if 0 [root@bull-02 tests]# ./res_string 'LA:R1:/dev/foo1\:xx:0:/dev/foo2\:yy:0' 'LB:R2:/dev/bar:11' sanlock_str_to_res for each argv -------------------------------------------------------------------------------- struct fields: "LA" "R1" "/dev/foo1:xx" 0 "/dev/foo2:yy" 0 0 struct fields: "LB" "R2" "/dev/bar" 11 0 combined argv input for state_to_args 
-------------------------------------------------------------------------------- "LA:R1:/dev/foo1\:xx:0:/dev/foo2\:yy:0 LB:R2:/dev/bar:11" sanlock_state_to_args 0 res_count 2 -------------------------------------------------------------------------------- struct fields: "LA" "R1" "/dev/foo1:xx" 0 "/dev/foo2:yy" 0 0 struct fields: "LB" "R2" "/dev/bar" 11 0 sanlock_args_to_state 0 -------------------------------------------------------------------------------- "LA:R1:/dev/foo1\:xx:0:/dev/foo2\:yy:0:0 LB:R2:/dev/bar:11:0" #endif sanlock-2.2/tests/Makefile0000644000175100017510000000133311751766670014625 0ustar weberweberTARGET = devcount TARGET2 = sanlk_load SOURCE = devcount.c SOURCE2 = sanlk_load.c CFLAGS += -D_GNU_SOURCE -g \ -Wall \ -Wformat \ -Wformat-security \ -Wnested-externs \ -Wpointer-arith \ -Wextra -Wshadow \ -Wcast-align \ -Wwrite-strings \ -Waggregate-return \ -Wstrict-prototypes \ -Winline \ -Wredundant-decls \ -Wno-sign-compare \ -Wp,-D_FORTIFY_SOURCE=2 \ -fexceptions \ -fasynchronous-unwind-tables \ -fdiagnostics-show-option LDFLAGS = -lrt -laio -lblkid -lsanlock all: $(TARGET) $(TARGET2) $(TARGET): $(SOURCE) $(CC) $(CFLAGS) $(LDFLAGS) $(SOURCE) -o $@ -L. -L../src $(TARGET2): $(SOURCE2) $(CC) $(CFLAGS) $(LDFLAGS) $(SOURCE2) -o $@ -L. -L../src clean: rm -f *.o *.so *.so.* $(TARGET) $(TARGET2) sanlock-2.2/tests/confUtils.py0000644000175100017510000002557411751766670015522 0ustar weberweber""" ConfUtils is a general purpose configuration infrastructure. This module contains various classes and functions to help with the creation of a structured and robust configuration for your application. ConfUtils extends the general functionality of python's idea of a configuration file and uses the same file format for saving configurations. Thus making it's configuration 100% compatible with python's simpler configuration parsers and utilities. 
The main difference is that ConfUtils treats sections and options as **case sensitive** while python's own config parsers are generally **case insensitive**. Configuration Templates ======================= ConfigUtils uses Configuration Templates as the basis of all of it's advanced functions. Configuration Template is a way of representing what you expect the configuration to look like and how you want to use it. A template is a specially crafted mishmash of python dictionaries. This is an example of a configuration template: #The template is a dict configurationTemplate = { #Each key is a section with another dict as the value "Section1" : { #Each key in the sub dict is an option, and a value is a dict containing the option's metadata. "Option1" : {"default" : "Default Value", "comment" : "Comment", "validator" : Validate.int} "Option2" : {} # Note that if you don't want to set any restrictions you still need to supply an empty dict. } "Section2" : { "Option3" : {"default" : "Bob"} # You can optionally fill in only a subset of the metadata. } } This template validates this config: [Section1] Option1 = Bill Option2 = 3 [Section2] Option3 = Ted Option Meta Data ---------------- Every option can have added attributes that define it. * default - The default value of this option. If the config is missing this option this value will be used. * comment - Used when generating a sample configuration. If this exists a comment above the option will be written. * validator - A method that validates that the value in the configuration is correct. This can be any method that: 1. Accepts 1 argument. 2. Raises an exception in case validation fails. 3. Return the value as a python native type """ from ConfigParser import ConfigParser, RawConfigParser import os class AdvancedConfigParser(RawConfigParser): """ A configuration parser that supports the advance features of ConfUtils. Specifically case sensitivity and writing comments. 
""" def __init__(self): RawConfigParser.__init__(self) self._comments = {} def set_option_comment(self, section, option, comment): """ Set the comment that will appear if the config is written to a file. """ if not self.has_option(section, option): raise KeyError("No such option '%s.%s'." %(section, option)) if not section in self._comments: self._comments[section] = {} self._comments[section][option] = comment def optionxform(self, option): """ Changes the behaviour so that it keeps the case of the option. """ return option def write(self, fileobject): """ Write the config file to an object **including** comments """ comments = self._comments for section in self.sections(): #write section fileobject.write("[%s]\n" % section) for option in self.options(section): hasComment = (section in comments and option in comments[section] and comments[section][option] != None) if hasComment: comment = comments[section][option] comment = "#" + "\n#".join(comment.splitlines()) fileobject.write(comment + "\n") value = str(self.get(section, option)) # If option contains multiple lines if "\n" in value: value = "\n\t".join(value.splitlines()) fileobject.write("%s: %s\n" % (option, value)) else: fileobject.write("%s = %s\n" % (option, value)) #pad section fileobject.write("\n\n") class TemplateMergeError(RuntimeError) : pass class ConfigurateionValidationError(RuntimeError) : pass def mergeTemplates(templates): """ A logical way to merege template. .. note:: Templates a merged in the way they were recieved. .. warning:: In any option arg conflict the new will override the old. :param templates: a list of templates to merge. 
""" finalTemplate = {} for template in templates: for section, options in template.iteritems(): if not section in finalTemplate: finalTemplate[section] = {} for option, args in options.iteritems(): if not option in finalTemplate[section]: finalTemplate[section][option] = args elif finalTemplate[section][option] != args: raise TemplateMergeError("Option '%s.%s' exists in two templates but doesn't have the same definition." % (section, option)) return finalTemplate class Validate(object): """ A class with common validators. """ #TBD: make thread safe? _innerConfig = ConfigParser() @classmethod def _genericGetValue(cls, methodName, value): innerConfig = cls._innerConfig if not innerConfig.has_section("tmp"): innerConfig.add_section("tmp") innerConfig.set("tmp", "tmp", value) validationMethod = getattr(innerConfig, methodName) return validationMethod("tmp", "tmp") @classmethod def int(cls, value): if isinstance(value, int): return value return cls._genericGetValue("getint", value) @classmethod def bool(cls, value): if isinstance(value, bool): return value return cls._genericGetValue("getboolean", value) @classmethod def float(cls, value): if isinstance(value, float): return value return cls._genericGetValue("getfloat", value) @classmethod def list(cls, value): if isinstance(value, list): return value return [i.strip() for i in value.split(",")] @classmethod def dict(cls, value): if isinstance(value, dict): return value value = value.strip() if not (value.startswith("{") and value.endswith("}")): raise ValueError("String doesn't represent a dict.") res = eval(value) if not isinstance(res, dict): raise ValueError("String doesn't represent a dict.") return res @classmethod def pathExists(cls, value): if os.path.exists(value): return value raise ValueError("Path doesn't exist.") def generateSampleConfigFile(template, targetFile): """ Generates a sample config file from a template. :param template: A config template. 
:param tergetfile: A file path or a writable file-like object. """ cfg = AdvancedConfigParser() if not isinstance(template, dict): raise TypeError("Template must be a dict") for section, options in template.iteritems(): #Create the section cfg.add_section(section) #Compile the options if not isinstance(options, dict): raise TypeError("Template options must be a dict") for option, args in options.iteritems(): if not isinstance(args, dict): raise TypeError("Options metadata must be a dict") defaultValue = "" if args.has_key("default"): defaultValue = args["default"] cfg.set(section, option, defaultValue) if "comment" in args: cfg.set_option_comment(section, option, args["comment"]) # Write the generated config file if type(targetFile) in (str, unicode): cfg.write(open(targetFile, "w")) elif hasattr(targetFile, "write"): targetFile.write(targetFile) else: raise TypeError("targetFile: Expected a path or a file-like object") def validateConfigFile(template, cfg): """ Validate that config file conforms with template. :param cfg: The path to the config file or a :class:`~ConfigParser.ConfigParser` instance. :param template: A config template. :returns: A touple in the format of ``(result, message)``. *result* will be :keyword:`True` if validation was seccessful. """ #Make sure cfg is a config object. if type(cfg) in (str, unicode): if not os.path.exists(cfg): raise ConfigurateionValidationError("File '%s' doesn't exist." % cfg) path = cfg cfg = ConfigParser() cfg.read(path) elif not isinstance(cfg, RawConfigParser): raise TypeError("Parameter 'cfg' must be a path or a config object") #Test if sections exist for section, options in template.iteritems(): if not cfg.has_section(section): raise ConfigurateionValidationError("Section %s is missing." % section) #Validate that options exist and are valid. 
for option, args in options.iteritems(): hasDefaultValue = ("default" in args) exists = cfg.has_option(section, option) if not exists and not hasDefaultValue: raise ConfigurateionValidationError("Option %s.%s is missing." % (section, option)) if exists: optionValue = cfg.get(section, option) else: optionValue = args["default"] if args.has_key("validator"): try: args["validator"](optionValue) except Exception, ex: raise ConfigurateionValidationError("Parsing of option %s.%s with the value '%s' failed (%s: %s)." % (section, option, optionValue, ex.__class__.__name__, ex)) return True def conf2dict(template, cfg): """ Converts a config file to a dict using the template to convert types from strings to native data types. .. note:: * Assumes template is validated. * Extracts only the field declared in the templates. """ outputDict = {} for section, options in template.iteritems(): outputDict[section] = {} for option, args in options.iteritems(): if cfg.has_option(section, option): rawOptionValue = cfg.get(section, option) elif "default" in args: rawOptionValue = args["default"] hasValidator = ("validator" in args) if hasValidator: outputDict[section][option] = args["validator"](rawOptionValue) else: outputDict[section][option] = rawOptionValue return outputDict sanlock-2.2/tests/testUtils.py0000644000175100017510000001233211751766670015540 0ustar weberweberfrom struct import Struct from functools import partial from collections import namedtuple from confUtils import Validate import new import signal import subprocess import logging from select import select from threading import Thread, Event import re import os import pwd import time def _makeFromStream(ntClass, struct, cls, stream): size = struct.size buf = stream.read(size) if len(buf) < size: raise RuntimeError("Stream is not long enough") return _makeFromBuffer(ntClass, struct, cls, buf) def _makeFromBuffer(ntClass, struct, cls, buffer): return ntClass._make(struct.unpack(buffer)) def aligneStruct(struct, 
blockSize=512): return Struct("%s%dx" % (struct.format, (blockSize - (struct.size % blockSize)))) dblockStruct = aligneStruct(Struct("QQQQ")) DBlock = namedtuple("DBlock", "mbal bal inp lver") DBlock.fromStream = new.instancemethod(partial(_makeFromStream, DBlock, dblockStruct), DBlock, DBlock.__class__) DBlock.fromBuffer = new.instancemethod(partial(_makeFromBuffer, DBlock, dblockStruct), DBlock, DBlock.__class__) leaderRecordStruct = aligneStruct(Struct("III4xQQQQ32sQI4x")) LeaderRecord = namedtuple('LeaderRecord', 'magic version clusterMode numHosts maxHosts ownerID lver resourceID timestamp checksum') LeaderRecord.fromStream = new.instancemethod(partial(_makeFromStream, LeaderRecord, leaderRecordStruct), LeaderRecord, LeaderRecord.__class__) LeaderRecord.fromBuffer = new.instancemethod(partial(_makeFromBuffer, LeaderRecord, leaderRecordStruct), LeaderRecord, LeaderRecord.__class__) def leasesValidator(value): rawLeases = Validate.list(value) leases = [] for lease in rawLeases: parts = lease.split(":") resourceID = parts[0] disks = [] for i in range(1, len(parts), 2): disks.append((parts[i], int(parts[i + 1]))) leases.append((resourceID, tuple(disks))) return tuple(leases) getResources = lambda leases : [resource for resource, disks in leases] nullTerminated = lambda str : str[:str.find("\0")] def readState(stream, numOfHosts = 0): lrSize = leaderRecordStruct.size leader = LeaderRecord.fromStream(stream) if numOfHosts < 1: numOfHosts = leader.numHosts dblockSize = dblockStruct.size totalSize = dblockSize * numOfHosts buf = stream.read(totalSize) if len(buf) < totalSize: raise RuntimeError("Stream is not long enough") dblocks = [] for start in range(0, totalSize, dblockSize): minibuf = buf[start: (start + dblockSize)] dblocks.append(DBlock.fromBuffer(minibuf)) return (leader, tuple(dblocks)) #DUMMY_CMD = ["/usr/bin/sudo", "-u", pwd.getpwuid(os.geteuid())[0], os.path.abspath("./dummy.py")] DUMMY_CMD = [os.path.abspath("./dummy.py")] class Dummy(object): _log = 
logging.getLogger("Dummy"); _pidRegex = re.compile(r".*supervise_pid\s+(\d+).*") def __init__(self, name, hostID = -1, leases = []): cmd = ["sudo", "-n", "../sync_manager", "daemon", "-D", "-n", name, "-i", str(hostID)] cmd.extend(self._compileLeaseArgs(leases)) cmd.append("-c") cmd.extend(DUMMY_CMD) self._log.debug("CMD: %s" % subprocess.list2cmdline(cmd)) self.process = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE ) self._wrappedPid = 0 self._pidStarted = Event() self._logThread = Thread(target = self._logOutputThread) self._logThread.start() self._pidStarted.wait() #Wait for dummy to set up time.sleep(1) if self._wrappedPid == 0: raise Exception("Probelm running dummy") def _logOutputThread(self): while self.process.poll() is None: readyObjects = select([self.process.stdout, self.process.stderr], [], [], 1)[0] for obj in readyObjects: line = obj.readline().replace("\n", "") if line == "": continue if self._wrappedPid == 0: m = self._pidRegex.match(line) if m: self._wrappedPid = int(m.groups()[0]) self._pidStarted.set() self._log.debug("Daemon - %s" % line) self._pidStarted.set() def _compileLeaseArgs(self, leases): args = [] for lease, disks in leases: mangledDisks = ["%s:%d" % (os.path.abspath(disk), offset) for (disk, offset) in disks] args.extend(["-l", "%s:%s" % (lease, ":".join(mangledDisks))]) return args def stop(self): if not self.process.poll() is None: return self._log.debug("Stopping dummy") os.kill(self._wrappedPid, signal.SIGUSR1) try: self.process.wait() except OSError, ex: if ex.errno != 10: raise self._logThread.join() def __del__(self): self.stop() if __name__ == "__main__": with open("drive.img" , "rb") as f: t = LeaderRecord.fromStream(f) print t.tokenName with open("drive.img" , "rb") as f: l = readState(f, 200) print len(l) sanlock-2.2/tests/devcount.c0000644000175100017510000010454011751766670015164 0ustar weberweber#include #include #include #include #include #include #include #include 
#include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_admin.h" #include "sanlock_resource.h" #include "sanlock_direct.h" #define ONEMB 1048576 #define LEASE_SIZE ONEMB FILE *turn_file; char count_path[PATH_MAX]; char lock_path[PATH_MAX]; int count_offset; int lock_offset; int our_hostid; int max_hostid; struct sanlk_lockspace lockspace; struct entry { uint32_t turn; uint32_t hostid; uint64_t pid; uint64_t time; uint64_t count; uint32_t last_turn; uint32_t last_hostid; uint64_t last_pid; uint64_t last_time; uint64_t last_count; }; #define log_debug(fmt, args...) \ do { \ printf("%llu " fmt "\n", (unsigned long long)time(NULL), ##args); \ } while (0) #define log_error(fmt, args...) \ do { \ printf("ERROR %llu " fmt "\n", (unsigned long long)time(NULL), ##args); \ } while (0) static void sigterm_handler(int sig) { log_debug("sigterm_handler %d", sig); } static void setup_sigterm(void) { struct sigaction act; memset(&act, 0, sizeof(act)); act.sa_handler = sigterm_handler; sigaction(SIGTERM, &act, NULL); } static int kill_pid(int pid) { int rv, status; kill(pid, SIGKILL); while (1) { rv = waitpid(pid, &status, 0); if (rv < 0) return -1; if (rv != pid) return -2; if (WIFEXITED(status)) return 0; } } /* kill(pid, SIGSTOP) would be nice, but that won't guarantee the pid has finished all i/o when it returns. 
Instead, we send SIGUSR1, which child sees after it's done with a (synchronous) write, and calls SIGSTOP on itself */ static void pause_pid(int pid, int child_stderr) { char buf[64]; int rv; kill(pid, SIGUSR1); /* child prints "we_are_paused" to stderr before stopping */ memset(buf, 0, sizeof(buf)); rv = read(child_stderr, buf, sizeof(buf)); if (strstr(buf, "we_are_paused")) return; while (1) { log_error("pause_pid %d read %s", pid, buf); sleep(2); } } static void resume_pid(int pid) { kill(pid, SIGCONT); } static int check_pause(int fd) { struct signalfd_siginfo fdsi; ssize_t rv; rv = read(fd, &fdsi, sizeof(struct signalfd_siginfo)); if (rv != sizeof(struct signalfd_siginfo)) { return 0; } if (fdsi.ssi_signo == SIGUSR1) { return 1; } return 0; } static int setup_pause(void) { sigset_t mask; int fd, rv; sigemptyset(&mask); sigaddset(&mask, SIGUSR1); rv = sigprocmask(SIG_BLOCK, &mask, NULL); if (rv < 0) return rv; fd = signalfd(-1, &mask, SFD_NONBLOCK); if (fd < 0) return -errno; return fd; } static int rand_int(int a, int b) { return a + (int) (((float)(b - a + 1)) * random() / (RAND_MAX+1.0)); } /* 64 byte entry: can fit up to 8 nodes in a 512 byte block */ void print_entries(char *path, int pid, char *buf) { struct entry *e = (struct entry *)buf; int i; for (i = 0; i < (512 / sizeof(struct entry)); i++) { log_error("%s c %d index %d turn %u time %llu %u:%llu:%llu " "last %u %llu %u:%llu:%llu", path, pid, i, e->turn, (unsigned long long)e->time, e->hostid, (unsigned long long)e->pid, (unsigned long long)e->count, e->last_turn, (unsigned long long)e->last_time, e->last_hostid, (unsigned long long)e->last_pid, (unsigned long long)e->last_count); e++; } } void print_our_we(char *path, int pid, int writes, struct entry *our_we, const char *stage) { log_debug("%s c %d %s w %d index %d turn %u time %llu %u:%llu:%llu " "last %u %llu %u:%llu:%llu", path, pid, stage, writes, our_hostid - 1, our_we->turn, (unsigned long long)our_we->time, our_we->hostid, (unsigned long 
long)our_we->pid, (unsigned long long)our_we->count, our_we->last_turn, (unsigned long long)our_we->last_time, our_we->last_hostid, (unsigned long long)our_we->last_pid, (unsigned long long)our_we->last_count); } #define COUNT_ARGS 6 #define LOCK_ARGS 8 #define MIGRATE_ARGS 9 /* * devcount rw|wr */ static int do_count(int argc, char *argv[]) { char *rbuf, **p_rbuf, *wbuf, **p_wbuf, *vbuf, **p_vbuf; struct entry *re, *max_re, *our_we; int i, fd, rv, error, max_i; int pause_fd; time_t start; uint32_t our_pid = getpid(); uint32_t max_turn; int sec1, sec2; int read_seconds, write_seconds; uint32_t writes = 0; if (argc < COUNT_ARGS) return -1; pause_fd = setup_pause(); strcpy(count_path, argv[2]); sec1 = atoi(argv[3]); sec2 = atoi(argv[4]); our_hostid = atoi(argv[5]); if (!strcmp(argv[1], "rw")) { read_seconds = sec1; write_seconds = sec2; } else { write_seconds = sec1; read_seconds = sec2; } /* printf("%d %s count_disk %s sec1 %d sec2 %d our_hostid %d\n", our_pid, argv[1], count_path, sec1, sec2, our_hostid); */ fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { perror("open failed"); error = 1; goto fail; } rv = ioctl(fd, BLKFLSBUF); if (rv) { perror("BLKFLSBUF failed"); error = 2; goto fail; } p_rbuf = &rbuf; p_wbuf = &wbuf; p_vbuf = &vbuf; rv = posix_memalign((void *)p_rbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); error = 3; goto fail; } rv = posix_memalign((void *)p_wbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); error = 4; goto fail; } rv = posix_memalign((void *)p_vbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); error = 5; goto fail; } lseek(fd, count_offset, SEEK_SET); rv = read(fd, rbuf, 512); if (rv != 512) { perror("read failed"); error = 6; goto fail; } /* print_entries(our_pid, rbuf); */ /* * reading for "rw" */ if (!strcmp(argv[1], "rw")) { for (i = 0; i < read_seconds; i++) { sleep(1); lseek(fd, count_offset, SEEK_SET); rv = read(fd, vbuf, 512); if (rv != 512) { 
perror("read failed"); error = 7; goto fail; } if (memcmp(rbuf, vbuf, 512)) { log_error("%s c %d rbuf:", count_path, our_pid); print_entries(count_path, our_pid, rbuf); log_error("%s c %d vbuf:", count_path, our_pid); print_entries(count_path, our_pid, vbuf); error = 8; goto fail; } } } /* * writing */ re = (struct entry *)rbuf; max_re = NULL; max_i = 0; max_turn = 0; for (i = 0; i < (512 / sizeof(struct entry)); i++) { if (!max_re || re->count > max_re->count) { max_re = re; max_i = i; } if (!max_turn || re->turn > max_turn) max_turn = re->turn; re++; } if (max_turn != max_re->turn) { log_error("%s c %d max_turn %d max_re->turn %d\n", count_path, our_pid, max_turn, max_re->turn); error = 9; goto fail; } /* printf("%d max index %d turn %d count %llu\n", our_pid, max_i, max_turn, (unsigned long long)max_re->count); */ memcpy(wbuf, rbuf, 512); our_we = (struct entry *)(wbuf + ((our_hostid - 1) * sizeof(struct entry))); our_we->last_turn = max_re->turn; our_we->last_hostid = max_re->hostid; our_we->last_pid = max_re->pid; our_we->last_time = max_re->time; our_we->last_count = max_re->count; our_we->turn = max_re->turn + 1; our_we->hostid = our_hostid; our_we->pid = our_pid; our_we->time = time(NULL); our_we->count = max_re->count + 1; lseek(fd, count_offset, SEEK_SET); rv = write(fd, wbuf, 512); if (rv != 512) { perror("write failed"); error = 10; goto fail; } writes = 1; print_our_we(count_path, our_pid, writes, our_we, "begin"); start = time(NULL); while (1) { our_we->count++; our_we->time = time(NULL); lseek(fd, count_offset, SEEK_SET); rv = write(fd, wbuf, 512); if (rv != 512) { perror("write failed"); error = 11; goto fail; } writes++; if (write_seconds && (our_we->time - start >= write_seconds)) break; if (!(writes % 64) && check_pause(pause_fd)) { print_our_we(count_path, our_pid, writes, our_we, "pause"); fprintf(stderr, "we_are_paused\n"); raise(SIGSTOP); /* this shouldn't appear until parent does kill(SIGCONT) */ print_our_we(count_path, our_pid, writes, 
our_we, "resume"); } } print_our_we(count_path, our_pid, writes, our_we, "end"); if (turn_file) { fprintf(turn_file, "turn %03u start %llu end %llu host %u pid %u\n", our_we->turn, (unsigned long long)(max_re->count + 1), (unsigned long long)our_we->count, our_hostid, our_pid); fflush(turn_file); fclose(turn_file); } /* * reading for "wr" */ if (!strcmp(argv[1], "wr")) { memcpy(rbuf, wbuf, 512); for (i = 0; i < read_seconds; i++) { sleep(1); lseek(fd, count_offset, SEEK_SET); rv = read(fd, vbuf, 512); if (rv != 512) { perror("read failed"); error = 12; goto fail; } if (memcmp(rbuf, vbuf, 512)) { log_error("%s c %d rbuf:", count_path, our_pid); print_entries(count_path, our_pid, rbuf); log_error("%s c %d vbuf:", count_path, our_pid); print_entries(count_path, our_pid, vbuf); error = 13; goto fail; } } } return 0; fail: fprintf(stderr, "error %d\n", error); while (1) { log_error("%s c %d error %d", count_path, our_pid, error); print_entries(count_path, our_pid, rbuf); print_entries(count_path, our_pid, vbuf); sleep(2); } } static int add_lockspace(void) { int rv; strcpy(lockspace.name, "devcount"); strcpy(lockspace.host_id_disk.path, lock_path); lockspace.host_id_disk.offset = lock_offset; lockspace.host_id = our_hostid; rv = sanlock_add_lockspace(&lockspace, 0); log_debug("%s p %d sanlock_add_lockspace %d", lock_path, getpid(), rv); return rv; } /* * Test inquire and acquire with version * * lock: * acquire (no lver) * if fail * goto lock; * else * goto run; * * relock: * acquire with saved lver * if fail (others may acquire in lock:) * sigkill pid; * goto lock; * else * sigcont pid; * goto run; * * run: * run rw for a while * inquire pid * save lver * sigstop pid * release ALL * goto relock * */ static int do_relock(int argc, char *argv[]) { char *av[COUNT_ARGS+1]; struct sanlk_resource *res, *res_inq; int i, j, pid, rv, sock, len, status; int c2p[2]; /* child to parent */ int res_count; uint32_t parent_pid = getpid(); uint64_t lver; char *state; if (argc < 
LOCK_ARGS) return -1; count_offset = 0; strcpy(lock_path, argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); add_lockspace(); len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); res = malloc(len); memset(res, 0, len); strcpy(res->lockspace_name, lockspace.name); snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path); res->name[SANLK_NAME_LEN-1] = '\0'; res->num_disks = 1; strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; res->disks[0].offset = LEASE_SIZE; /* * argv[0] = devcount * argv[1] = relock * argv[2] = * argv[3] = rw * start copying at argv[3] */ j = 0; av[j++] = strdup(argv[0]); for (i = 3; i < LOCK_ARGS; i++) av[j++] = strdup(argv[i]); av[j] = NULL; while (1) { pipe(c2p); pid = fork(); if (!pid) { int child_pid = getpid(); sock = sanlock_register(); if (sock < 0) { log_error("%s c %d sanlock_register error %d", count_path, child_pid, sock); exit(-1); } res->flags = 0; res->lver = 0; rv = sanlock_acquire(sock, -1, 0, 1, &res, NULL); if (rv < 0) { log_debug("%s c %d sanlock_acquire error %d", count_path, child_pid, rv); /* all hosts are trying to acquire so we expect this to acquire only sometimes; TODO: exit with an error for some rv's */ exit(0); } log_debug("%s c %d sanlock_acquire done", count_path, child_pid); rv = sanlock_restrict(sock, SANLK_RESTRICT_ALL); if (rv < 0) { log_error("%s c %d sanlock_restrict error %d", count_path, child_pid, sock); exit(-1); } /* make child's stderr go to parent c2p[0] */ close(2); dup(c2p[1]); close(c2p[0]); close(c2p[1]); execv(av[0], av); perror("execv devcount problem"); exit(EXIT_FAILURE); } run_more: /* let the child run for 30 seconds before stopping it */ for (i = 0; i < 30; i++) { rv = waitpid(pid, &status, WNOHANG); if (rv == pid) break; sleep(1); } /* we expect child to exit when it fails to acquire the lock because it's held by someone else, or rw run time is up */ if (rv == pid) goto dead_child; rv = sanlock_inquire(-1, 
pid, 0, &res_count, &state); if (rv == -EBUSY) { /* pid probably still busy doing acquire */ goto run_more; } if (rv == -ESTALE || rv == -ESRCH) { /* pid has exited */ goto run_more; } if (rv < 0) { log_error("%s p %d sanlock_inquire c %d error %d", count_path, parent_pid, pid, rv); goto run_more; } rv = sanlock_str_to_res(state, &res_inq); if (rv < 0) { log_error("%s p %d sanlock_str_to_res error %d %s", count_path, parent_pid, rv, state); goto fail; } lver = res_inq->lver; log_debug("%s p %d sanlock_inquire c %d lver %llu done", count_path, parent_pid, pid, (unsigned long long)lver); free(res_inq); free(state); pause_pid(pid, c2p[0]); log_debug("%s p %d paused c %d", count_path, parent_pid, pid); rv = sanlock_release(-1, pid, SANLK_REL_ALL, 0, NULL); if (rv < 0) { /* pid may have exited */ log_error("%s p %d sanlock_release c %d error %d", count_path, parent_pid, pid, rv); goto kill_child; } log_debug("%s p %d sanlock_release c %d done", count_path, parent_pid, pid); /* give a chance to someone else to acquire the lock in here */ usleep(1000000); res->flags = SANLK_RES_LVER; res->lver = lver; rv = sanlock_acquire(-1, pid, 0, 1, &res, NULL); if (!rv) { /* we got the lock back in the same version */ log_debug("%s p %d sanlock_acquire c %d lver %llu done", count_path, parent_pid, pid, (unsigned long long)lver); resume_pid(pid); goto run_more; } /* someone got the lock between our release and reacquire */ log_debug("%s p %d sanlock_acquire c %d lver %llu error %d", count_path, parent_pid, pid, (unsigned long long)lver, rv); kill_child: kill_pid(pid); log_debug("%s p %d killed c %d", count_path, parent_pid, pid); dead_child: close(c2p[0]); close(c2p[1]); sleep(rand_int(0, 1)); } fail: printf("test failed...\n"); sleep(1000000); return -1; } /* * devcount lock rw * sanlock add_lockspace -s devcount:::0 * devcount rw */ static int do_lock(int argc, char *argv[]) { char *av[COUNT_ARGS+1]; struct sanlk_resource *res; int i, j, pid, rv, sock, len, status; if (argc < 
LOCK_ARGS) return -1; count_offset = 0; strcpy(lock_path, argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); add_lockspace(); len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); res = malloc(len); memset(res, 0, len); strcpy(res->lockspace_name, lockspace.name); snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path); res->name[SANLK_NAME_LEN-1] = '\0'; res->num_disks = 1; strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; res->disks[0].offset = LEASE_SIZE; /* * argv[0] = devcount * argv[1] = lock * argv[2] = * argv[3] = rw * start copying at argv[3] */ j = 0; av[j++] = strdup(argv[0]); for (i = 3; i < LOCK_ARGS; i++) av[j++] = strdup(argv[i]); av[j] = NULL; while (1) { pid = fork(); if (!pid) { int child_pid = getpid(); sock = sanlock_register(); if (sock < 0) { log_error("%s c %d sanlock_register error %d", count_path, child_pid, sock); exit(-1); } rv = sanlock_acquire(sock, -1, 0, 1, &res, NULL); if (rv < 0) { log_debug("%s c %d sanlock_acquire error %d", count_path, child_pid, rv); /* all hosts are trying to acquire so we expect this to acquire only sometimes; TODO: exit with an error for some rv's */ exit(0); } log_debug("%s c %d sanlock_acquire done", count_path, child_pid); execv(av[0], av); perror("execv devcount problem"); exit(EXIT_FAILURE); } waitpid(pid, &status, 0); /* TODO: goto fail if exit status is an error */ sleep(rand_int(0, 1)); } printf("test failed...\n"); sleep(1000000); return -1; } static int do_wrap(int argc, char *argv[]) { char *av[COUNT_ARGS+1]; struct sanlk_resource *res; int i, j, rv, sock, len; uint32_t pid = getpid(); if (argc < LOCK_ARGS) return -1; count_offset = 0; strcpy(lock_path, argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); add_lockspace(); len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); res = malloc(len); memset(res, 0, len); strcpy(res->lockspace_name, lockspace.name); snprintf(res->name, 
SANLK_NAME_LEN, "resource%s", count_path); res->name[SANLK_NAME_LEN-1] = '\0'; res->num_disks = 1; strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; res->disks[0].offset = LEASE_SIZE; /* * argv[0] = devcount * argv[1] = wrap * argv[2] = * argv[3] = rw * start copying at argv[3] */ j = 0; av[j++] = strdup(argv[0]); for (i = 3; i < LOCK_ARGS; i++) av[j++] = strdup(argv[i]); av[j] = NULL; sock = sanlock_register(); if (sock < 0) { log_error("%s c %d sanlock_register error %d", count_path, pid, sock); exit(-1); } rv = sanlock_restrict(sock, SANLK_RESTRICT_SIGKILL); if (rv < 0) { log_error("%s c %d sanlock_restrict error %d", count_path, pid, sock); exit(-1); } rv = sanlock_acquire(sock, -1, 0, 1, &res, NULL); if (rv < 0) { log_error("%s c %d sanlock_acquire error %d", count_path, pid, rv); /* all hosts are trying to acquire so we expect this to acquire only sometimes; TODO: exit with an error for some rv's */ exit(0); } log_debug("%s c %d sanlock_acquire done", count_path, pid); execv(av[0], av); perror("execv devcount problem"); exit(EXIT_FAILURE); } /* * Test migration sequence (source inquires/releases, dest acquires lver) * * dest forks (e.g. libvirtd creates qemu pid) * dest child does sanlock_register, waits for parent (e.g. qemu incoming paused) * source parent does sanlock_inquire * source parent sigstop child, sanlock_release, writes state to disk * dest parent reads state from disk, sanlock_acquire(child_pid, state.lver) * dest parent tells child to run (e.g. 
qemu incoming resumed) * dest child execs rw * source parent sigkill child */ static void write_migrate_incoming(char *state_in) { char target_str[32]; char state[1024]; char *wbuf, **p_wbuf; int fd, rv; int offset = 4096; int target; target = (our_hostid % max_hostid) + 1; memset(state, 0, sizeof(state)); memset(target_str, 0, sizeof(target_str)); sprintf(target_str, " target=%d", target); strcat(state, state_in); strcat(state, target_str); if (strlen(state) > 512) { printf("state string too long\n"); goto fail; } fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { perror("open failed"); goto fail; } rv = ioctl(fd, BLKFLSBUF); if (rv) { perror("BLKFLSBUF failed"); goto fail; } p_wbuf = &wbuf; rv = posix_memalign((void *)p_wbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); goto fail; } memset(wbuf, 0, 512); memcpy(wbuf, state, strlen(state)); lseek(fd, offset, SEEK_SET); rv = write(fd, wbuf, 512); if (rv != 512) { perror("write failed"); goto fail; } /* printf("write_migrate_incoming \"%s\"\n", wbuf); */ close(fd); return; fail: printf("write_migrate %d failed %s\n", offset, state); sleep(10000000); } /* read incoming block until it's set and our_hostid is next */ static int wait_migrate_incoming(uint64_t *lver) { struct sanlk_resource *res; char *rbuf, **p_rbuf, *wbuf, **p_wbuf; char *target_str, *val_str; int fd, rv, val; int offset = 4096; fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { perror("open failed"); goto fail; } rv = ioctl(fd, BLKFLSBUF); if (rv) { perror("BLKFLSBUF failed"); goto fail; } p_rbuf = &rbuf; p_wbuf = &wbuf; rv = posix_memalign((void *)p_rbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); goto fail; } rv = posix_memalign((void *)p_wbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); goto fail; } retry: lseek(fd, offset, SEEK_SET); rv = read(fd, rbuf, 512); if (rv != 512) { perror("read failed"); goto fail; } rbuf[511] = '\0'; /* init case to get things 
going */ if (!rbuf[0] && our_hostid == 1) { *lver = 0; return 1; } target_str = strstr(rbuf, " target="); if (!target_str) { goto retry; } val_str = strstr(target_str, "=") + 1; if (!val_str) { goto retry; } val = atoi(val_str); if (val != our_hostid) { goto retry; } /* printf("wait_migrate_incoming \"%s\"\n", rbuf); */ *target_str = '\0'; rv = sanlock_str_to_res(rbuf, &res); if (rv < 0) { printf("str_to_res error %d\n", rv); goto fail; } *lver = res->lver; free(res); /* strcpy(state_out, rbuf); */ memset(wbuf, 0, 512); sprintf(wbuf, "%s", "empty"); lseek(fd, offset, SEEK_SET); rv = write(fd, wbuf, 512); if (rv != 512) { perror("write failed"); goto fail; } close(fd); return 0; fail: printf("wait_migrate_incoming failed\n"); sleep(10000000); return -1; } #define MAX_MIGRATE_STATE 512 /* keep in one block for simplicity */ static int do_migrate(int argc, char *argv[]) { char *av[MIGRATE_ARGS+1]; struct sanlk_resource *res; int i, j, pid, rv, sock, len, init; int p2c[2]; /* parent to child */ int c2p[2]; /* child to parent */ int res_count; uint32_t parent_pid = getpid(); uint64_t lver; char *state; if (argc < MIGRATE_ARGS) return -1; count_offset = 0; strcpy(lock_path, argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); max_hostid = atoi(argv[8]); add_lockspace(); len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); res = malloc(len); memset(res, 0, len); strcpy(res->lockspace_name, lockspace.name); snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path); res->name[SANLK_NAME_LEN-1] = '\0'; res->num_disks = 1; strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; res->disks[0].offset = LEASE_SIZE; /* * argv[0] = devcount * argv[1] = migrate * argv[2] = * argv[3] = rw * start copying at argv[3] */ j = 0; av[j++] = strdup(argv[0]); for (i = 3; i < MIGRATE_ARGS; i++) av[j++] = strdup(argv[i]); av[j] = NULL; while (1) { pipe(p2c); pipe(c2p); pid = fork(); if (!pid) { int child_pid = 
getpid(); char junk; sock = sanlock_register(); if (sock < 0) { log_error("%s c %d sanlock_register error %d", count_path, child_pid, sock); exit(-1); } log_debug("%s c %d wait", count_path, child_pid); read(p2c[0], &junk, 1); close(p2c[0]); close(p2c[1]); log_debug("%s c %d begin", count_path, child_pid); /* make child's stderr go to parent c2p[0] */ close(2); dup(c2p[1]); close(c2p[0]); close(c2p[1]); execv(av[0], av); perror("execv devcount problem"); exit(EXIT_FAILURE); } init = wait_migrate_incoming(&lver); /* from source */ if (init) { res->flags = 0; res->lver = 0; } else { res->flags = SANLK_RES_LVER; res->lver = lver; } rv = sanlock_acquire(-1, pid, 0, 1, &res, NULL); if (rv < 0) { log_error("%s p %d sanlock_acquire c %d error %d", count_path, parent_pid, pid, rv); exit(0); } log_debug("%s p %d sanlock_acquire c %d init %d lver %llu done", count_path, parent_pid, pid, init, (unsigned long long)lver); /* tell child to resume */ write(p2c[1], "\n", 1); close(p2c[0]); close(p2c[1]); /* let the child run for 10 seconds before stopping it; if the child exits before the 10 seconds, the sanlock_inquire call should return an error */ sleep(10); rv = sanlock_inquire(-1, pid, 0, &res_count, &state); if (rv < 0) { log_error("%s p %d sanlock_inquire c %d error %d", count_path, parent_pid, pid, rv); goto fail; } log_debug("%s p %d sanlock_inquire c %d done", count_path, parent_pid, pid); pause_pid(pid, c2p[0]); log_debug("%s p %d paused c %d", count_path, parent_pid, pid); rv = sanlock_release(-1, pid, SANLK_REL_ALL, 0, NULL); if (rv < 0) { log_error("%s p %d sanlock_release c %d error %d", count_path, parent_pid, pid, rv); goto fail; } log_debug("%s p %d sanlock_release c %d done", count_path, parent_pid, pid); write_migrate_incoming(state); /* to dest */ kill_pid(pid); log_debug("%s p %d killed c %d", count_path, parent_pid, pid); close(c2p[0]); close(c2p[1]); free(state); } fail: printf("test failed...\n"); sleep(10000000); return -1; } /* * dmsetup table 
/dev/bull/lock1 > /tmp/table-linear.txt * sed "s/linear/error/" /tmp/table-linear.txt > /tmp/table-error.txt * * dmsetup suspend /dev/bull/lock1 * dmsetup load /dev/bull/lock1 /tmp/table-error.txt * dmsetup resume /dev/bull/lock1 * * dmsetup suspend /dev/bull/lock1 * dmsetup load /dev/bull/lock1 /tmp/table-linear.txt * dmsetup resume /dev/bull/lock1 */ static void dmsetup_save_lock_disk(void) { char cmd[128]; sprintf(cmd, "./devcount-dmsetup save %s", lock_path); system(cmd); } static void dmsetup_error_lock_disk(void) { char cmd[128]; sprintf(cmd, "./devcount-dmsetup error %s", lock_path); system(cmd); } static void dmsetup_linear_lock_disk(void) { char cmd[128]; sprintf(cmd, "./devcount-dmsetup linear %s", lock_path); system(cmd); } int do_expire(int argc, char *argv[]) { char *av[COUNT_ARGS+1]; struct sanlk_resource *res; uint32_t parent_pid = getpid(); int i, j, pid, rv, sock, len, status; int c2p[2]; char result[5]; if (argc < LOCK_ARGS) return -1; count_offset = 0; strcpy(lock_path, argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); dmsetup_save_lock_disk(); add_lockspace(); len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); res = malloc(len); memset(res, 0, len); strcpy(res->lockspace_name, lockspace.name); snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path); res->name[SANLK_NAME_LEN-1] = '\0'; res->num_disks = 1; strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; res->disks[0].offset = LEASE_SIZE; /* * argv[0] = devcount * argv[1] = expire * argv[2] = * argv[3] = rw * start copying at argv[3] */ j = 0; av[j++] = strdup(argv[0]); for (i = 3; i < LOCK_ARGS; i++) av[j++] = strdup(argv[i]); av[j] = NULL; while (1) { pipe(c2p); pid = fork(); if (!pid) { int child_pid = getpid(); sock = sanlock_register(); if (sock < 0) { log_error("%s c %d sanlock_register error %d", count_path, child_pid, sock); exit(-1); } /* this acquire can take up to 90 seconds waiting for the host_id 
of the owner to time out */ log_debug("%s c %d sanlock_acquire begin", count_path, child_pid); rv = sanlock_acquire(sock, -1, 0, 1, &res, NULL); if (rv < 0) { log_debug("%s c %d sanlock_acquire error %d", count_path, child_pid, rv); /* all hosts are trying to acquire so we expect this to acquire only sometimes; TODO: exit with an error for some rv's */ write(c2p[1], "fail", 4); close(c2p[0]); close(c2p[1]); exit(0); } log_debug("%s c %d sanlock_acquire done", count_path, child_pid); write(c2p[1], "good", 4); close(c2p[0]); close(c2p[1]); execv(av[0], av); perror("execv devcount problem"); exit(EXIT_FAILURE); } memset(&result, 0, sizeof(result)); read(c2p[0], &result, 4); close(c2p[0]); close(c2p[1]); if (strstr(result, "fail")) { /* we expect child to exit when it fails to acquire the lock because it's held by someone else */ waitpid(pid, &status, 0); goto dead_child; } /* this test should be run with sec2 set to some large value that won't run out before sanlock daemon kills rw */ sleep(rand_int(6, 100)); dmsetup_error_lock_disk(); log_debug("%s p %d disable %s", count_path, parent_pid, lock_path); /* sanlock daemon kills pid when the renewals fail; after the kill it will try to release the resource lease, which will also fail if the resource lease is on the same disk as the host_id lease. Other nodes trying to get pid's resource lease are watching our host_id for 90 seconds, after which they will take pid's resource lease. If the resource lease is on a different disk, the daemon will be able to release it after the kill, and another node will be able to take it immediately after that, without watching our host_id for 90 seconds */ /* other nodes can't rely on the daemon being able to kill rw, so they need to wait 90 seconds to ensure that the watchdog has killed the host before taking pid's resource lease. 
In a different test, have the daemon kill fail, causing rw to continue running until the watchdog fires, after which another host will take pid's resource lease */ waitpid(pid, &status, 0); log_debug("%s p %d waitpid c %d done", count_path, parent_pid, pid); sleep(rand_int(0, 3)); dmsetup_linear_lock_disk(); log_debug("%s p %d enable %s", count_path, parent_pid, lock_path); log_debug("%s p %d sanlock_add_lockspace begin", lock_path, parent_pid); while (1) { sleep(1); rv = add_lockspace(); if (!rv) break; } dead_child: sleep(rand_int(0, 1)); } printf("test failed...\n"); sleep(1000000); return -1; } /* * devcount init * sanlock direct init -n 8 -s devcount:0::0 * sanlock direct init -n 8 -r devcount:resource::LEASE_SIZE * dd if=/dev/zero of= bs=512 count=24 */ #define INIT_NUM_HOSTS 0 int do_init(int argc, char *argv[]) { char resbuf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_disk disk; struct sanlk_resource *res; struct sanlk_lockspace ls; char command[4096]; int rv, align_size; if (argc < 4) return -1; strcpy(count_path, argv[3]); #if 0 /* initialize host_id lease area at offset 0 */ memset(command, 0, sizeof(command)); snprintf(command, sizeof(command), "sanlock direct init -s devcount:0:%s:0", argv[2]); printf("%s\n", command); system(command); /* initialize first resource lease area at offset LEASE_SIZE */ memset(command, 0, sizeof(command)); snprintf(command, sizeof(command), "sanlock direct init -r devcount:resource%s:%s:%d", argv[3], argv[2], LEASE_SIZE); printf("%s\n", command); system(command); #else memset(&disk, 0, sizeof(disk)); strcpy(disk.path, argv[2]); align_size = sanlock_direct_align(&disk); if (align_size != LEASE_SIZE) { printf("sanlock_direct align %s error %d\n", disk.path, align_size); return -1; } memset(&ls, 0, sizeof(ls)); strcpy(ls.name, "devcount"); strcpy(ls.host_id_disk.path, argv[2]); printf("init sync\n"); rv = sanlock_direct_init(&ls, NULL, 0, INIT_NUM_HOSTS, 0); if (rv < 0) { 
printf("sanlock_direct_init lockspace error %d\n", rv); return -1; } printf("init async\n"); rv = sanlock_direct_init(&ls, NULL, 0, INIT_NUM_HOSTS, 1); if (rv < 0) { printf("sanlock_direct_init lockspace error %d\n", rv); return -1; } memset(resbuf, 0, sizeof(resbuf)); res = (struct sanlk_resource *)&resbuf; strcpy(res->lockspace_name, "devcount"); sprintf(res->name, "resource%s", argv[3]); res->num_disks = 1; strcpy(res->disks[0].path, argv[2]); res->disks[0].offset = LEASE_SIZE; rv = sanlock_direct_init(NULL, res, 0, INIT_NUM_HOSTS, 0); if (rv < 0) { printf("sanlock_direct_init resource error %d\n", rv); return -1; } #endif memset(command, 0, sizeof(command)); snprintf(command, sizeof(command), "dd if=/dev/zero of=%s bs=512 count=24", count_path); printf("%s\n", command); system(command); return 0; } int main(int argc, char *argv[]) { int rv; if (argc < 2) goto out; if (!strcmp(argv[1], "init")) rv = do_init(argc, argv); else if (!strcmp(argv[1], "rw") || !strcmp(argv[1], "wr")) rv = do_count(argc, argv); else if (!strcmp(argv[1], "rwsig")) { setup_sigterm(); argv[1] = "rw"; rv = do_count(argc, argv); } else if (!strcmp(argv[1], "lock")) rv = do_lock(argc, argv); else if (!strcmp(argv[1], "wrap")) rv = do_wrap(argc, argv); else if (!strcmp(argv[1], "relock")) rv = do_relock(argc, argv); else if (!strcmp(argv[1], "migrate")) rv = do_migrate(argc, argv); else if (!strcmp(argv[1], "expire")) rv = do_expire(argc, argv); if (!rv) return 0; out: /* * sanlock direct init -s devcount:0:/dev/bull/leases:0 * sanlock direct init -r devcount:resource/dev/bull/count:/dev/bull/leases:LEASE_SIZE * * host_id leases exists at offset 0 * first resource lease exists at offset LEASE_SIZE */ printf("devcount init \n"); printf(" sanlock direct init -s devcount:0::0\n"); printf(" sanlock direct init -r devcount:resource::LEASE_SIZE\n"); printf(" dd if=/dev/zero of= bs=512 count=24\n"); printf("\n"); printf("devcount rw \n"); printf(" rw: read count for sec1, looking for writes, then 
write for sec2\n"); printf(" wr: write count for sec1, then read for sec2, looking for writes\n"); printf("\n"); printf("devcount lock rw \n"); printf(" sanlock add_lockspace -s devcount:::0\n"); printf(" loop around fork, sanlock_acquire, exec devcount rw\n"); printf("\n"); printf("devcount relock rw \n"); printf(" sanlock add_lockspace -s devcount:::0\n"); printf(" loop around fork, sanlock_acquire, exec devcount rw\n"); printf(" sigstop child, inquire, release, re-acquire, sigcont|sigkill\n"); printf("\n"); printf("devcount wrap rw \n"); printf(" sanlock add_lockspace -s devcount:::0\n"); printf(" sanlock_acquire, exec devcount rw\n"); printf("\n"); printf("devcount migrate rw \n"); printf(" sanlock add_lockspace -s devcount:::0\n"); printf(" loop around fork, sanlock_acquire, exec devcount rw\n"); printf("\n"); printf("devcount expire rw \n"); printf("\n"); return -1; } sanlock-2.2/tests/enum.py0000644000175100017510000000236011751766670014504 0ustar weberweberclass Enum(object): """ A nice class to handle Enums gracefullly. """ def __init__(self, **pairs): #Generate reverse dict self._reverse = dict([(b, a) for a, b in pairs.iteritems()]) #Generate attributes for key, value in pairs.iteritems(): setattr(self, key, value) def __getitem__(self, index): return self._reverse[index] def __iter__(self): return self._reverse.itervalues() def parse(self, value): #If value is enum name convert to value if isinstance(value, str): if hasattr(self, value): return getattr(self, value) #If value is a number assume parsing meant converting the value to int #if you can think of a more generic way feel free to change if value.isdigit(): value = int(value) #If not check if value is a value of the enum if value in self._reverse: return value #Enum doesn't know this value raise ValueError("Value '%s' is not in the Enum." 
% value) if __name__ == "__main__": eColors = Enum( Red = 1, Blue = 2 ) print eColors.Red, eColors.Blue, eColors[1] sanlock-2.2/tests/syncManager.py0000777000175100017510000000000011751766670022573 2../../python/syncManager.pyustar weberwebersanlock-2.2/tests/paxosState.py0000755000175100017510000000256711751766670015707 0ustar weberweber#!/usr/bin/python from testUtils import readState, DBlock, nullTerminated from StringIO import StringIO import time import sys USAGE = "usage: paxosState.py : [:]" def formatPaxoState(disk, offset): with open(disk, "rb") as f: f.seek(offset) leader, dblocks = readState(f) res = StringIO() res.write("LEADER\n------\n") for key in leader._fields: val = getattr(leader, key) if key == "timestamp": val = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(val)) elif isinstance(val, str): val = nullTerminated(val) res.write("%s:\t%s%s\n" % (key, '\t' if len(key) < 7 else '', val)) res.write("\nBLOCKS\n------\n") for field in DBlock._fields: res.write("%s:" % field) for dblock in dblocks: res.write("\t%s" % getattr(dblock, field)) res.write("\n") res.seek(0) return res.read() if __name__ == "__main__": if len(sys.argv) < 2: print USAGE sys.exit(1) disks = [] try: for arg in sys.argv[1:]: disk, offset = arg.split(":") offset = int(offset) disks.append((disk, offset)) except: print USAGE sys.exit(1) for disk, offset in disks: print "**** %s:%d ****" % (disk, offset) print formatPaxoState(disk, offset) sanlock-2.2/tests/devcount-dmsetup0000755000175100017510000000077311751766670016430 0ustar weberweber#!/bin/bash if [ $# -le 1 ]; then echo "num $#" echo "" echo "devcount-dmsetup save " echo "" echo "devcount-dmsetup error " echo "" echo "devcount-dmsetup linear " echo "" fi cmd=$1 dev=$2 if [ "$cmd" == "save" ]; then rm -f /tmp/table-linear.txt rm -f /tmp/table-error.txt dmsetup table $dev > /tmp/table-linear.txt sed "s/linear/error/" /tmp/table-linear.txt > /tmp/table-error.txt exit 0 fi dmsetup suspend $dev dmsetup load $dev 
/tmp/table-$cmd.txt dmsetup resume $dev sanlock-2.2/tests/devcountn0000755000175100017510000000437211751766670015126 0ustar weberweber#!/bin/bash if [ $# -le 3 ]; then echo "" echo "Start N devcount commands" echo "" echo "devcountn N init LOCKDEV_BASE COUNTDEV_BASE" echo "devcountn N rw COUNTDEV_BASE SEC1 SEC2 HOSTID" echo "devcountn N lock LOCKDEV_BASE rw COUNTDEV_BASE SEC1 SEC2 HOSTID" echo "devcountn N relock LOCKDEV_BASE rw COUNTDEV_BASE SEC1 SEC2 HOSTID" echo "devcountn N wrap LOCKDEV_BASE rw COUNTDEV_BASE SEC1 SEC2 HOSTID" echo "devcountn N migrate LOCKDEV_BASE rw COUNTDEV_BASE SEC1 SEC2 HOSTID MAXID" echo "devcountn N expire LOCKDEV_BASE rw COUNTDEV_BASE SEC1 SEC2 HOSTID" echo "" echo "devcount LOCKDEV1 rw COUNTDEV1 ..." echo "devcount LOCKDEV2 rw COUNTDEV2 ..." echo "devcount LOCKDEV3 rw COUNTDEV3 ..." echo ... echo "devcount LOCKDEVN rw COUNTDEVN ..." echo "" echo "Examples" echo "" echo "devcountn 3 init /dev/lock /dev/count" echo " devcount init /dev/lock1 /dev/count1" echo " devcount init /dev/lock2 /dev/count2" echo " devcount init /dev/lock3 /dev/count3" echo "" echo "devcountn 3 rw /dev/count 5 5 1" echo " devcount rw /dev/count1 5 5 1" echo " devcount rw /dev/count2 5 5 1" echo " devcount rw /dev/count3 5 5 1" echo "" echo "devcountn 3 lock /dev/lock rw /dev/count 5 5 1" echo " sanlock add_lockspace -s devcount:1:/dev/lock1:0" echo " (the add_lockspace command from each subsequent devcount will fail)" echo " devcount lock /dev/lock1 rw /dev/count1 5 5 1" echo " devcount lock /dev/lock2 rw /dev/count2 5 5 1" echo " devcount lock /dev/lock3 rw /dev/count3 5 5 1" echo "" exit 0 fi num=$1 cmd1=$2 if [ "$cmd1" != "init" ]; then deva=$3 cmd2=$4 devb=$5 sec1=$6 sec2=$7 hostid=$8 maxid=$9 i=1 echo sanlock add_lockspace -s devcount:$hostid:$deva$i:0 sanlock add_lockspace -s devcount:$hostid:$deva$i:0 fi end=`expr $num - 1` for i in `seq 0 $end`; do if [ "$cmd1" == "init" ]; then deva=$3 devb=$4 echo ./devcount init $deva$i $devb$i ./devcount init $deva$i 
$devb$i elif [ "$cmd1" == "rw" ] || [ "$cmd1" == "wr" ]; then echo ./devcount $cmd1 $deva$i $sec1 $sec2 $hostid ./devcount $cmd1 $deva$i $sec1 $sec2 $hostid & else echo ./devcount $cmd1 $deva$i $cmd2 $devb$i $sec1 $sec2 $hostid $maxid ./devcount $cmd1 $deva$i $cmd2 $devb$i $sec1 $sec2 $hostid $maxid & fi done sanlock-2.2/tests/sanlk_load.c0000644000175100017510000004272411751766670015451 0ustar weberweber#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_admin.h" #include "sanlock_resource.h" #include "sanlock_direct.h" #define ONEMB 1048576 #define LEASE_SIZE ONEMB #define MAX_LS_COUNT 64 #define MAX_RES_COUNT 512 #define MAX_PID_COUNT 256 #define DEFAULT_LS_COUNT 4 #define DEFAULT_RES_COUNT 4 #define DEFAULT_PID_COUNT 4 #define MAX_RV 300 #define IV -1 #define UN 0 #define SH 3 #define EX 5 int prog_stop; int debug = 0; int debug_verbose = 0; char error_buf[4096]; char lock_disk_base[PATH_MAX]; int lock_state[MAX_LS_COUNT][MAX_RES_COUNT]; int ls_count = DEFAULT_LS_COUNT; int res_count = DEFAULT_RES_COUNT; int pid_count = DEFAULT_PID_COUNT; int one_mode = 0; int our_hostid; int acquire_rv[MAX_RV]; int release_rv[MAX_RV]; #define log_debug(fmt, args...) \ do { \ if (debug) printf("%lu " fmt "\n", time(NULL), ##args); \ } while (0) #define log_error(fmt, args...) 
\ do { \ memset(error_buf, 0, sizeof(error_buf)); \ snprintf(error_buf, 4095, "%ld " fmt "\n", time(NULL), ##args); \ printf("ERROR: %s\n", error_buf); \ syslog(LOG_ERR, "%s", error_buf); \ } while (0) static void sigterm_handler(int sig) { if (sig == SIGTERM) prog_stop = 1; } static int get_rand(int a, int b) { return a + (int) (((float)(b - a + 1)) * random() / (RAND_MAX+1.0)); } static int get_rand_sh_ex(void) { unsigned int n; if (one_mode == SH) return SH; if (one_mode == EX) return EX; n = (unsigned int)random();; if (n % 2) return SH; return EX; } static void save_rv(int pid, int rv, int acquire) { if (rv > 0) goto fail; if (-rv > MAX_RV) goto fail; if (acquire) { if (!rv) acquire_rv[0]++; else acquire_rv[-rv]++; } else { if (!rv) release_rv[0]++; else release_rv[-rv]++; } return; fail: log_error("%d save_rv %d %d", pid, rv, acquire); while (1) { sleep(10); printf("%lu %d ERROR save_rv %d %d", time(NULL), pid, rv, acquire); } } static void display_rv(int pid) { int i; printf("%lu %d results acquire ", time(NULL), pid); for (i = 0; i < MAX_RV; i++) { if (acquire_rv[i]) printf("%d:%d ", i, acquire_rv[i]); } printf("release "); for (i = 0; i < MAX_RV; i++) { if (release_rv[i]) printf("%d:%d ", i, release_rv[i]); } printf("\n"); } static void dump_lock_state(int pid) { int i, j; for (i = 0; i < ls_count; i++) { for (j = 0; j < res_count; j++) { if (!lock_state[i][j]) continue; log_error("%d lockspace%d:resource%d", pid, i, j); } } } static void dump_inquire_state(int pid, char *state) { char *p = state; int len = strlen(state); int i; if (!len) return; for (i = 0; i < len; i++) { if (state[i] == ' ') { state[i] = '\0'; if (!i) log_debug("%d leading space", pid); else log_debug("%d %s", pid, p); p = state + i + 1; } } log_debug("%d %s", pid, p); } static int check_lock_state(int pid, int result, int count, char *res_state) { char buf[128]; char *found = NULL; int found_count = 0; int none_count = 0; int bad_count = 0; int i, j; memset(buf, 0, sizeof(buf)); if 
(result < 0) goto fail; if (!count) { if (res_state) { log_error("%d check_lock_state zero count res_state %s", pid, res_state); } for (i = 0; i < ls_count; i++) { for (j = 0; j < res_count; j++) { if (lock_state[i][j]) { bad_count++; log_error("%d check_lock_state zero count %d %d lock", pid, i, j); } } } if (bad_count) goto fail; return 0; } for (i = 0; i < ls_count; i++) { for (j = 0; j < res_count; j++) { memset(buf, 0, sizeof(buf)); sprintf(buf, "lockspace%d:resource%d:", i, j); found = strstr(res_state, buf); if (found && lock_state[i][j]) { found_count++; } else if (!found && !lock_state[i][j]) { none_count++; } else { bad_count++; log_error("%d check_lock_state %s lock_state %d res_state %s", pid, buf, lock_state[i][j], res_state); } } } if ((found_count != count) || bad_count) goto fail; return 0; fail: log_error("%d check_lock_state result %d count %d res_state %s", pid, result, count, res_state); log_error("%d check_lock_state found %d none %d bad %d", pid, found_count, none_count, bad_count); dump_lock_state(pid); while (1) { sleep(10); printf("%lu %d ERROR check_lock_state result %d count %d found %d bad %d res_state %s", time(NULL), pid, result, count, found_count, bad_count, res_state); } } #if 0 static int remove_lockspace(int i) { struct sanlk_lockspace ls; int rv; memset(&ls, 0, sizeof(ls)); sprintf(ls.host_id_disk.path, "%s%d", lock_disk_base, i); sprintf(ls.name, "lockspace%d", i); ls.host_id = our_hostid; printf("rem lockspace%d...\n", i); rv = sanlock_rem_lockspace(&ls, 0); if (rv < 0) { log_error("sanlock_rem_lockspace error %d %s", rv, ls.host_id_disk.path); return -1; } printf("rem done\n"); return 0; } #endif static int add_lockspace(int i) { struct sanlk_lockspace ls; int rv; int async = !(i % 2); uint32_t flags = 0; memset(&ls, 0, sizeof(ls)); sprintf(ls.host_id_disk.path, "%s%d", lock_disk_base, i); sprintf(ls.name, "lockspace%d", i); ls.host_id = our_hostid; if (async) flags = SANLK_ADD_ASYNC; printf("add lockspace%d...\n", i); rv = 
sanlock_add_lockspace(&ls, flags); if (rv == -EEXIST) return 0; if (rv < 0) { log_error("sanlock_add_lockspace error %d %s", rv, ls.host_id_disk.path); return -1; } if (!async) goto out; while (1) { rv = sanlock_inq_lockspace(&ls, 0); if (!rv) goto out; if (rv == -EINPROGRESS) { sleep(2); continue; } log_error("sanlock_inq_lockspace error %d", rv); return -1; } out: printf("add done\n"); return 0; } static int add_lockspaces(void) { int i, rv; for (i = 0; i < ls_count; i++) { rv = add_lockspace(i); if (rv < 0) return rv; } return 0; } static const char *mode_str(int n) { if (n == SH) return "sh"; if (n == EX) return "ex"; if (n == UN) return "un"; if (n == IV) return "iv"; return "er"; } static int do_one(int pid, int fd, int _s1, int _r1, int _n1, int *full) { char buf1[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_resource *r1; int acquire = (_n1 != UN); int rv; memset(buf1, 0, sizeof(buf1)); r1 = (struct sanlk_resource *)&buf1; sprintf(r1->lockspace_name, "lockspace%d", _s1); sprintf(r1->name, "resource%d", _r1); sprintf(r1->disks[0].path, "%s%d", lock_disk_base, _s1); r1->disks[0].offset = (_r1+1)*LEASE_SIZE; r1->num_disks = 1; if (_n1 == SH) r1->flags |= SANLK_RES_SHARED; if (acquire) { rv = sanlock_acquire(fd, -1, 0, 1, &r1, NULL); if (rv == -E2BIG || rv == -ENOENT) *full = 1; } else { rv = sanlock_release(fd, -1, 0, 1, &r1); } log_debug("%d %s %d,%d %s = %d", pid, acquire ? 
"acquire" : "release", _s1, _r1, mode_str(_n1), rv); save_rv(pid, rv, acquire); return rv; } static int do_two(int pid, int fd, int _s1, int _r1, int _n1, int _s2, int _r2, int _n2, int *full) { char buf1[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; char buf2[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_resource *r1; struct sanlk_resource *r2; struct sanlk_resource **res_args; int acquire = (_n1 != UN); int rv; res_args = malloc(2 * sizeof(struct sanlk_resource *)); if (!res_args) return -ENOMEM; memset(buf1, 0, sizeof(buf1)); memset(buf2, 0, sizeof(buf2)); r1 = (struct sanlk_resource *)&buf1; r2 = (struct sanlk_resource *)&buf2; res_args[0] = r1; res_args[1] = r2; sprintf(r1->lockspace_name, "lockspace%d", _s1); sprintf(r1->name, "resource%d", _r1); sprintf(r1->disks[0].path, "%s%d", lock_disk_base, _s1); r1->disks[0].offset = (_r1+1)*LEASE_SIZE; r1->num_disks = 1; if (_n1 == SH) r1->flags |= SANLK_RES_SHARED; sprintf(r2->lockspace_name, "lockspace%d", _s2); sprintf(r2->name, "resource%d", _r2); sprintf(r2->disks[0].path, "%s%d", lock_disk_base, _s2); r2->disks[0].offset = (_r2+1)*LEASE_SIZE; r2->num_disks = 1; if (_n2 == SH) r2->flags |= SANLK_RES_SHARED; if (acquire) { rv = sanlock_acquire(fd, -1, 0, 2, res_args, NULL); if (rv == -E2BIG || rv == -ENOENT) *full = 1; } else { rv = sanlock_release(fd, -1, 0, 2, res_args); } log_debug("%d %s %d,%d %s %d,%d %s = %d", pid, acquire ? 
"acquire" : "release", _s1, _r1, mode_str(_n1), _s2, _r2, mode_str(_n2), rv); save_rv(pid, rv, acquire); free(res_args); return rv; } static int acquire_one(int pid, int fd, int s1, int r1, int n1, int *full) { return do_one(pid, fd, s1, r1, n1, full); } static int acquire_two(int pid, int fd, int s1, int r1, int n1, int s2, int r2, int n2, int *full) { return do_two(pid, fd, s1, r1, n1, s2, r2, n2, full); } static int release_one(int pid, int fd, int s1, int r1) { return do_one(pid, fd, s1, r1, UN, NULL); } static int release_two(int pid, int fd, int s1, int r1, int s2, int r2) { return do_two(pid, fd, s1, r1, UN, s2, r2, UN, NULL); } static int release_all(int pid, int fd) { int rv; rv = sanlock_release(fd, -1, SANLK_REL_ALL, 0, NULL); log_debug("%d release all = %d", pid, rv); save_rv(pid, rv, 0); return rv; } static void inquire_all(int pid, int fd) { int rv, count = 0; char *state = NULL; if (prog_stop) return; rv = sanlock_inquire(fd, -1, 0, &count, &state); log_debug("%d inquire all = %d %d", pid, rv, count); if (prog_stop) return; check_lock_state(pid, rv, count, state); if (count && debug_verbose) dump_inquire_state(pid, state); if (state) free(state); } int do_rand_child(void) { int s1, s2, r1, r2, m1, m2, n1, n2, full; int fd, rv; int iter = 1; int pid = getpid(); srandom(pid); memset(lock_state, 0, sizeof(lock_state)); fd = sanlock_register(); if (fd < 0) { log_error("%d sanlock_register error %d", pid, fd); exit(-1); } while (!prog_stop) { s1 = get_rand(0, ls_count-1); r1 = get_rand(0, res_count-1); m1 = lock_state[s1][r1]; s2 = -1; r2 = -1; m2 = IV; if (get_rand(1, 3) == 2) { s2 = get_rand(0, ls_count-1); r2 = get_rand(0, res_count-1); m2 = lock_state[s2][r2]; if (s1 == s2 && r1 == r2) { s2 = -1; r2 = -1; m2 = IV; } } full = 0; if (m1 == UN && m2 == UN) { /* both picks are unlocked, lock both together */ n1 = get_rand_sh_ex(); n2 = get_rand_sh_ex(); rv = acquire_two(pid, fd, s1, r1, n1, s2, r2, n2, &full); if (!rv) { lock_state[s1][r1] = n1; 
lock_state[s2][r2] = n2; } m1 = IV; m2 = IV; } if (m1 > UN && m2 > UN) { /* both picks are locked, unlock both together */ release_two(pid, fd, s1, r1, s2, r2); lock_state[s1][r1] = UN; lock_state[s2][r2] = UN; m1 = IV; m2 = IV; } if (m1 == UN) { n1 = get_rand_sh_ex(); rv = acquire_one(pid, fd, s1, r1, n1, &full); if (!rv) lock_state[s1][r1] = n1; } if (m2 == UN) { n2 = get_rand_sh_ex(); rv = acquire_one(pid, fd, s2, r2, n2, &full); if (!rv) lock_state[s2][r2] = n2; } if (m1 > UN) { release_one(pid, fd, s1, r1); lock_state[s1][r1] = UN; } if (m2 > UN) { release_one(pid, fd, s2, r2); lock_state[s2][r2] = UN; } if (full) { release_all(pid, fd); memset(lock_state, 0, sizeof(lock_state)); } if ((iter % 10) == 0) { display_rv(pid); inquire_all(pid, fd); } iter++; } display_rv(pid); return 0; } /* * sanlk_load rand -i [-D -s -r -p ] */ void get_options(int argc, char *argv[]) { char optchar; char *optionarg; char *p; int i = 3; for (; i < argc; ) { p = argv[i]; if ((p[0] != '-') || (strlen(p) != 2)) { log_error("unknown option %s", p); log_error("space required before option value"); exit(EXIT_FAILURE); } optchar = p[1]; i++; if (optchar == 'D') { debug = 1; continue; } if (optchar == 'V') { debug_verbose = 1; continue; } if (i >= argc) { log_error("option '%c' requires arg", optchar); exit(EXIT_FAILURE); } optionarg = argv[i]; switch (optchar) { case 'i': our_hostid = atoi(optionarg); break; case 's': ls_count = atoi(optionarg); if (ls_count > MAX_LS_COUNT) { log_error("max ls_count %d", MAX_LS_COUNT); exit(-1); } break; case 'r': res_count = atoi(optionarg); if (res_count > MAX_RES_COUNT) { log_error("max res_count %d", MAX_RES_COUNT); exit(-1); } break; case 'p': pid_count = atoi(optionarg); if (pid_count > MAX_PID_COUNT) { log_error("max pid_count %d", MAX_PID_COUNT); exit(-1); } break; case 'm': one_mode = atoi(optionarg); break; default: log_error("unknown option: %c", optchar); exit(EXIT_FAILURE); } i++; } } int find_pid(int *kids, int pid) { int i; for (i = 0; i 
< pid_count; i++) { if (kids[i] == pid) return i; } return -1; } int do_rand(int argc, char *argv[]) { struct sigaction act; int children[MAX_PID_COUNT]; int run_count = 0; int i, rv, pid, status; if (argc < 5) return -1; memset(&act, 0, sizeof(act)); act.sa_handler = sigterm_handler; sigaction(SIGTERM, &act, NULL); strcpy(lock_disk_base, argv[2]); get_options(argc, argv); rv = add_lockspaces(); if (rv < 0) return rv; printf("forking %d pids\n", pid_count); for (i = 0; i < pid_count; i++) { pid = fork(); if (pid < 0) { log_error("fork %d failed %d run_count %d", i, errno, run_count); break; } if (!pid) { do_rand_child(); exit(-1); } children[i] = pid; run_count++; } printf("children running\n"); while (!prog_stop) { /* * kill and replace a random pid */ sleep(get_rand(1, 60)); if (prog_stop) break; i = get_rand(0, pid_count); pid = children[i]; printf("kill pid %d\n", pid); kill(pid, SIGKILL); rv = waitpid(pid, &status, 0); if (rv <= 0) continue; pid = fork(); if (pid < 0) { log_error("fork failed %d", errno); break; } else if (!pid) { do_rand_child(); exit(-1); } else { children[i] = pid; } #if 0 /* * remove a random lockspace, replace any pids that were using * it, replace the lockspace */ sleep(get_rand(1, 60)); if (prog_stop) break; lsi = get_rand(0, ls_count-1); remove_lockspace(lsi); while (1) { rv = waitpid(-1, &status, WNOHANG); if (rv <= 0) break; if (!WIFEXITED(status)) continue; printf("exit pid %d\n", pid); i = find_pid(children, rv); if (i < 0) continue; pid = fork(); if (pid < 0) { log_error("fork failed %d", errno); break; } else if (!pid) { do_rand_child(); exit(-1); } else { children[i] = pid; } } add_lockspace(lsi); #endif } printf("stopping pids"); for (i = 0; i < pid_count; i++) kill(children[i], SIGTERM); while (run_count) { pid = wait(&status); if (pid > 0) { run_count--; printf("."); } } printf("\n"); return 0; } /* * sanlk_load init [ ] * lock_disk_base = /dev/vg/foo * * sanlock direct init -s lockspace0:0:/dev/vg/foo0:0 * sanlock direct 
init -r lockspace0:resource0:/dev/vg/foo0:1M * sanlock direct init -r lockspace0:resource1:/dev/vg/foo0:2M * ... * sanlock direct init -s lockspace1:0:/dev/vg/foo1:0 * sanlock direct init -r lockspace1:resource0:/dev/vg/foo1:1M * sanlock direct init -r lockspace1:resource1:/dev/vg/foo1:2M * ... */ #define INIT_NUM_HOSTS 64 int do_init(int argc, char *argv[]) { char resbuf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_resource *res; struct sanlk_lockspace ls; int i, j, rv; if (argc < 3) return -1; strcpy(lock_disk_base, argv[2]); if (argc > 3) ls_count = atoi(argv[3]); if (argc > 4) res_count = atoi(argv[4]); for (i = 0; i < ls_count; i++) { memset(&ls, 0, sizeof(ls)); sprintf(ls.host_id_disk.path, "%s%d", lock_disk_base, i); sprintf(ls.name, "lockspace%d", i); rv = sanlock_direct_init(&ls, NULL, 0, INIT_NUM_HOSTS, 1); if (rv < 0) { printf("sanlock_direct_init lockspace error %d %s\n", rv, ls.host_id_disk.path); return -1; } for (j = 0; j < res_count; j++) { memset(resbuf, 0, sizeof(resbuf)); res = (struct sanlk_resource *)&resbuf; strcpy(res->lockspace_name, ls.name); sprintf(res->name, "resource%d", j); res->num_disks = 1; strcpy(res->disks[0].path, ls.host_id_disk.path); res->disks[0].offset = (j+1)*LEASE_SIZE; rv = sanlock_direct_init(NULL, res, 0, INIT_NUM_HOSTS, 0); if (rv < 0) { printf("sanlock_direct_init resource error %d\n", rv); return -1; } } } return 0; } int main(int argc, char *argv[]) { int rv = -1; if (argc < 2) goto out; if (!strcmp(argv[1], "init")) rv = do_init(argc, argv); else if (!strcmp(argv[1], "rand")) rv = do_rand(argc, argv); if (!rv) return 0; out: printf("sanlk_load init [ ]\n"); printf(" init ls_count lockspaces, each with res_count resources\n"); printf(" devices for lockspaces 0..N are disk_base0..disk_baseN\n"); printf(" e.g. /dev/lock0, /dev/lock1, ... 
/dev/lockN\n"); printf("\n"); printf("sanlk_load rand -i [options]\n"); printf(" -s number of lockspaces\n"); printf(" -r number of resources per lockspace\n"); printf(" -p number of processes\n"); printf(" -m use one mode for all locks, 3 = SH, 5 = EX\n"); printf(" -D debug output\n"); printf(" -V verbose debug output\n"); printf("\n"); return -1; } sanlock-2.2/tests/testRunner.py0000644000175100017510000001263611751766670015720 0ustar weberweberimport sys from unittest import TestResult from fnmatch import fnmatch import traceback from itertools import chain, ifilter from enum import Enum eColors = Enum( Green = '\033[92m', Yellow = '\033[93m', Red = '\033[91m', ENDC = '\033[0m' ) _faultSeperator = "-" * 80 def _formatTestFault(test, err, faultTypeName): res = "%s\n%s: %s :\n%s\n" % (_faultSeperator, faultTypeName, test.id(), err) return res class _TextTestResult(TestResult): """ A better way to display test results in in the terminal. Assumes correct an linear execution per test. """ def __init__(self, stream, verbosity = 1, logging=False): TestResult.__init__(self) self._stream = stream self._verbosity = verbosity self._logging = logging def _writeToStream(self, msg, color=None): stream = self._stream #Make sure color is a color if color != None: color = eColors.parse(color) writeColor = False try: writeColor = (color != None and stream.isatty()) except AttributeError: #A strem might no implement isatty pass if writeColor: msg = color + msg + eColors.ENDC stream.write(msg) stream.flush() def startTest(self, test): TestResult.startTest(self, test) self._writeToStream("\t%s: " % test.id()) if self._logging: self._writeToStream("\n") def addSuccess(self, test): TestResult.addSuccess(self, test) if self._logging: self._writeToStream("\tResult: ") self._writeToStream("OK", eColors.Green) def addError(self, test, err): testname = test.id().split(".")[-1] tb = err[2] stack = traceback.extract_tb(tb) for frame in stack: fname = frame[2] if fname == testname: if 
self._logging: self._writeToStream("\tResult: ") self._writeToStream("Test ERROR", eColors.Yellow) break if fname == "setUp": if self._logging: self._writeToStream("\tResult: ") self._writeToStream("SetUp ERROR", eColors.Yellow) break if fname == "tearDown": #If test succeded but tear down failed the result should #still be that the test failed. So it's my resposibility #to display thet only the 'test' part of the test passed. (Confused yet?) faults = chain(self.failures, self.errors) testFaults = ifilter(lambda item: item[0] == test, faults) hasFailed = (sum(1 for u in testFaults) > 0) if not hasFailed: if self._logging: self._writeToStream("\tResult: ") self._writeToStream("PASSED", eColors.Green) self._writeToStream(", ") self._writeToStream("Tear Down ERROR", eColors.Yellow) break TestResult.addError(self, test, err) def addFailure(self, test, err): if self._logging: self._writeToStream("\tResult: ") TestResult.addFailure(self, test, err) self._writeToStream("FAIL", eColors.Red) def stopTest(self, test): TestResult.stopTest(self, test) self._writeToStream("\n") self.printTestErrLog(test, 3) def printTestErrLog(self, test, minVerbosity): if self._verbosity < minVerbosity: return for fTest, err in self.failures: if test == fTest: self._writeToStream( _formatTestFault(test, err, "FAILURE")) for eTest, err in self.errors: if test == eTest: self._writeToStream( _formatTestFault(test, err, "ERROR")) class TestRunner(object): """ A test runner that is better then the default :class:`unittest.TextTestRunner`. Gives prettier output. """ def __init__(self, stream = sys.stderr, verbosity=1, filter="*", logging=False): self._verbosity = verbosity self._stream = stream self._filter = filter self._logging = logging def run(self, suite): """ Run a test. 
""" stream = self._stream results = _TextTestResult(stream, self._verbosity, self._logging) #Parse filter filter = self._filter filterIfMatchIs = True if filter.startswith("^"): filterIfMatchIs = False filter = filter[1:] filter = filter.replace("\\^", "^") #So you could escape ^. For completeness. filter = filter.replace("\\\\", "\\") for test in suite: if not (fnmatch(test.id(), filter) == filterIfMatchIs): continue test.run(result = results) if results.wasSuccessful(): msg = "All Good!" else: msg = "Failed (failures=%d, errors=%d)." % (len(results.failures), len(results.errors)) sep = "*" * (len(msg) + 4) + "\n" stream.write(sep) stream.write("* " + msg + " *" + "\n") stream.write(sep) if self._verbosity == 2: for test, err in results.failures: stream.write(_formatTestFault(test, err, "FAILURE")) for test, err in results.errors: stream.write(_formatTestFault(test, err, "ERROR")) stream.flush() return results sanlock-2.2/init.d/0000755000175100017510000000000011751766670013210 5ustar weberwebersanlock-2.2/init.d/wdmd0000644000175100017510000000302711751766670014070 0ustar weberweber#!/bin/sh # # wdmd - watchdog multiplexing daemon # # chkconfig: 2345 97 03 # description: starts and stops sanlock daemon # ### BEGIN INIT INFO # Provides: wdmd # Required-Start: $time $syslog # Required-Stop: $syslog # Should-Start: # Should-Stop: # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 # Short-Description: starts and stops wdmd daemon # Description: starts and stops wdmd daemon ### END INIT INFO . /etc/rc.d/init.d/functions prog="wdmd" lockfile="/var/run/$prog/$prog.pid" exec="/usr/sbin/$prog" WDMDGROUP="sanlock" WDMDOPTS="-G $WDMDGROUP" [ -f /etc/sysconfig/$prog ] && . /etc/sysconfig/$prog start() { [ -x $exec ] || exit 5 if [ ! -d /var/run/$prog ]; then mkdir -p /var/run/$prog [ -x /sbin/restorecon ] && restorecon /var/run/$prog fi echo -n $"Starting $prog: " daemon $prog $WDMDOPTS retval=$? 
echo [ $retval -eq 0 ] && touch $lockfile return $retval } stop() { echo -n $"Stopping $prog: " killproc -p $lockfile $prog -TERM retval=$? echo [ $retval -eq 0 ] && rm -f $lockfile } restart() { stop start } reload() { restart } rh_status() { status $prog } rh_status_q() { rh_status >/dev/null 2>&1 } case "$1" in start) rh_status_q && exit 0 $1 ;; stop) rh_status_q || exit 0 $1 ;; restart) $1 ;; reload) rh_status_q || exit 7 $1 ;; force-reload) force_reload ;; status) rh_status ;; condrestart|try-restart) rh_status_q || exit 0 restart ;; *) echo $"Usage $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}" exit 2 esac exit $? sanlock-2.2/init.d/wdmd.service0000644000175100017510000000031611751766670015525 0ustar weberweberDescription=Watchdog Multiplexing Daemon After=syslog.target [Service] Type=forking ExecStart=/lib/systemd/systemd-wdmd start ExecStop=/lib/systemd/systemd-wdmd stop [Install] WantedBy=multi-user.target sanlock-2.2/init.d/sanlock.service0000644000175100017510000000034111751766670016222 0ustar weberweberDescription=Shared Storage Lease Manager After=syslog.target wdmd.service [Service] Type=forking ExecStart=/lib/systemd/systemd-sanlock start ExecStop=/lib/systemd/systemd-sanlock stop [Install] WantedBy=multi-user.target sanlock-2.2/init.d/sanlock0000644000175100017510000000325611751766670014573 0ustar weberweber#!/bin/sh # # sanlock - SAN-based lock manager # # chkconfig: 2345 97 03 # description: starts and stops sanlock daemon # ### BEGIN INIT INFO # Provides: sanlock # Required-Start: $time $syslog wdmd # Required-Stop: $syslog # Should-Start: # Should-Stop: # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 # Short-Description: starts and stops sanlock daemon # Description: starts and stops sanlock daemon ### END INIT INFO . /etc/rc.d/init.d/functions prog="sanlock" lockfile="/var/run/$prog/$prog.pid" exec="/usr/sbin/$prog" SANLOCKUSER="sanlock" SANLOCKOPTS="-U $SANLOCKUSER -G $SANLOCKUSER" [ -f /etc/sysconfig/$prog ] && . 
/etc/sysconfig/$prog start() { [ -x $exec ] || exit 5 if [ ! -d /var/run/$prog ]; then install -d -o $SANLOCKUSER -g $SANLOCKUSER -m 775 /var/run/$prog [ -x /sbin/restorecon ] && restorecon /var/run/$prog fi echo -n $"Starting $prog: " daemon --user=$SANLOCKUSER $prog daemon $SANLOCKOPTS retval=$? echo [ $retval -eq 0 ] return $retval } stop() { echo -n $"Stopping $prog: " killproc -p $lockfile $prog -TERM retval=$? echo [ $retval -eq 0 ] } wait_for_stop() { while [ -e $lockfile ]; do sleep .5 done } restart() { stop wait_for_stop start } reload() { restart } rh_status() { status $prog } rh_status_q() { rh_status >/dev/null 2>&1 } case "$1" in start) rh_status_q && exit 0 $1 ;; stop) rh_status_q || exit 0 $1 ;; restart) $1 ;; reload) rh_status_q || exit 7 $1 ;; force-reload) force_reload ;; status) rh_status ;; condrestart|try-restart) rh_status_q || exit 0 restart ;; *) echo $"Usage $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}" exit 2 esac exit $? sanlock-2.2/src/0000755000175100017510000000000011751766670012612 5ustar weberwebersanlock-2.2/src/monotime.c0000644000175100017510000000071511751766670014610 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include "monotime.h" uint64_t monotime(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec; } sanlock-2.2/src/crc32c.c0000644000175100017510000001034411751766670014037 0ustar weberweber/* * Copied from the btrfs-progs source code, which... * Copied from the kernel source code, lib/libcrc32c.c. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. */ #include #include /* * This is the CRC-32C table * Generated with: * width = 32 bits * poly = 0x1EDC6F41 * reflect input bytes = true * reflect output bytes = true */ static const uint32_t crc32c_table[256] = { 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L, 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, 0xB602C312L, 0x44694011L, 
0x5739B3E5L, 0xA55230E6L, 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL, 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, 0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL, 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L }; /* * Steps through buffer one byte at at time, calculates reflected * crc using table. 
*/ uint32_t crc32c(uint32_t crc, uint8_t *data, size_t length); uint32_t crc32c(uint32_t crc, uint8_t *data, size_t length) { while (length--) crc = crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8); return crc; } sanlock-2.2/src/limits.conf0000644000175100017510000000005111751766670014756 0ustar weberwebersanlock - memlock -1 sanlock - rtprio -1 sanlock-2.2/src/paxos_lease.c0000644000175100017510000013305011751766670015263 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "diskio.h" #include "direct.h" #include "log.h" #include "lockspace.h" #include "delta_lease.h" #include "paxos_lease.h" #include "resource.h" uint32_t crc32c(uint32_t crc, uint8_t *data, size_t length); int get_rand(int a, int b); #define DBLOCK_CHECKSUM_LEN 48 /* ends before checksum field */ struct paxos_dblock { uint64_t mbal; uint64_t bal; uint64_t inp; /* host_id */ uint64_t inp2; /* host_id generation */ uint64_t inp3; /* host_id's timestamp */ uint64_t lver; uint32_t checksum; }; static uint32_t roundup_power_of_two(uint32_t val) { val--; val |= val >> 1; val |= val >> 2; val |= val >> 4; val |= val >> 8; val |= val >> 16; val++; return val; } int paxos_lease_request_read(struct task *task, struct token *token, struct request_record *rr) { int rv; /* 1 = request record is second sector */ rv = read_sectors(&token->disks[0], 1, 1, (char *)rr, sizeof(struct request_record), task, "request"); if (rv < 0) return rv; return SANLK_OK; } int paxos_lease_request_write(struct task *task, struct token *token, struct request_record *rr) { int rv; rv = write_sector(&token->disks[0], 1, (char *)rr, 
sizeof(struct request_record), task, "request"); if (rv < 0) return rv; return SANLK_OK; } static int write_dblock(struct task *task, struct sync_disk *disk, uint64_t host_id, struct paxos_dblock *pd) { int rv; /* 1 leader block + 1 request block; host_id N is block offset N-1 */ rv = write_sector(disk, 2 + host_id - 1, (char *)pd, sizeof(struct paxos_dblock), task, "dblock"); return rv; } static int write_leader(struct task *task, struct sync_disk *disk, struct leader_record *lr) { int rv; rv = write_sector(disk, 0, (char *)lr, sizeof(struct leader_record), task, "leader"); return rv; } #if 0 static int read_dblock(struct task *task, struct sync_disk *disk, uint64_t host_id, struct paxos_dblock *pd) { int rv; /* 1 leader block + 1 request block; host_id N is block offset N-1 */ rv = read_sectors(disk, 2 + host_id - 1, 1, (char *)pd, sizeof(struct paxos_dblock), task, "dblock"); return rv; } static int read_dblocks(struct task *task, struct sync_disk *disk, struct paxos_dblock *pds, int pds_count) { char *data; int data_len, rv, i; data_len = pds_count * disk->sector_size; data = malloc(data_len); if (!data) { log_error("read_dblocks malloc %d %s", data_len, disk->path); rv = -ENOMEM; goto out; } /* 2 = 1 leader block + 1 request block */ rv = read_sectors(disk, 2, pds_count, data, data_len, task, "dblocks"); if (rv < 0) goto out_free; /* copy the first N bytes from each sector, where N is size of paxos_dblock */ for (i = 0; i < pds_count; i++) { memcpy(&pds[i], data + (i * disk->sector_size), sizeof(struct paxos_dblock)); } rv = 0; out_free: free(data); out: return rv; } #endif static int read_leader(struct task *task, struct sync_disk *disk, struct leader_record *lr) { int rv; /* 0 = leader record is first sector */ rv = read_sectors(disk, 0, 1, (char *)lr, sizeof(struct leader_record), task, "leader"); return rv; } static uint32_t dblock_checksum(struct paxos_dblock *pd) { return crc32c((uint32_t)~1, (uint8_t *)pd, DBLOCK_CHECKSUM_LEN); } static int 
verify_dblock(struct token *token, struct paxos_dblock *pd) { uint32_t sum; if (!pd->checksum && !pd->mbal && !pd->bal && !pd->inp && !pd->lver) return SANLK_OK; sum = dblock_checksum(pd); if (pd->checksum != sum) { log_errot(token, "verify_dblock wrong checksum %x %x", pd->checksum, sum); return SANLK_DBLOCK_CHECKSUM; } return SANLK_OK; } /* * It's possible that we pick a bk_max from another host which has our own * inp values in it, and we can end up commiting our own inp values, copied * from another host's dblock: * * host2 leader free * host2 phase1 mbal 14002 * host2 writes dblock[1] mbal 14002 * host2 reads no higher mbal * host2 choose own inp 2,1 * host2 phase2 mbal 14002 bal 14002 inp 2,1 * host2 writes dblock[1] bal 14002 inp 2,1 * host1 leader free * host1 phase1 mbal 20001 * host1 writes dblock[0] mbal 20001 * host1 reads no higher mbal * host1 choose dblock[1] bal 14002 inp 2,1 * host1 phase2 mbal 20001 bal 20001 inp 2,1 * host1 writes dblock[0] bal 20001 inp 2,1 * host2 reads dblock[0] mbal 20001 > 14002 * abort2, retry * host2 leader free * host2 phase1 mbal 16002 * host2 writes dblock[1] mbal 16002 * host2 reads dblock[0] mbal 20001 > 16002 * abort1 retry * host2 leader free * host2 phase1 mbal 18002 * host2 writes dblock[1] mbal 18002 * host2 reads dblock[0] mbal 20001 > 18002 * abort1 retry * host2 leader free * host2 phase1 mbal 20002 * host2 writes dblock[1] mbal 20002 * host2 reads no higher mbal * host2 choose dblock[0] bal 20001 inp 2,1 * host1 reads dblock[1] mbal 20002 > 20001 * abort2 retry * host2 phase2 mbal 20002 bal 20002 inp 2,1 * host2 writes dblock[1] bal 20002 inp 2,1 * host2 reads no higher mbal * host2 commit inp 2,1 * host2 success * host1 leader owner 2,1 * host1 fail */ static int run_ballot(struct task *task, struct token *token, int num_hosts, uint64_t next_lver, uint64_t our_mbal, struct paxos_dblock *dblock_out) { struct paxos_dblock dblock; struct paxos_dblock bk_max; struct paxos_dblock *bk; struct sync_disk *disk; char 
*iobuf[SANLK_MAX_DISKS]; char **p_iobuf[SANLK_MAX_DISKS]; int num_disks = token->r.num_disks; int num_writes, num_reads; int sector_size = token->disks[0].sector_size; int sector_count; int iobuf_len; int d, q, rv; int q_max = -1; int error; sector_count = roundup_power_of_two(num_hosts + 2); iobuf_len = sector_count * sector_size; if (!iobuf_len) return -EINVAL; for (d = 0; d < num_disks; d++) { p_iobuf[d] = &iobuf[d]; rv = posix_memalign((void *)p_iobuf[d], getpagesize(), iobuf_len); if (rv) return rv; } /* * phase 1 * * "For each disk d, it tries first to write dblock[p] to disk[d][p] * and then to read disk[d][q] for all other processors q. It aborts * the ballot if, for any d and q, it finds disk[d][q].mbal > * dblock[p].mbal. The phase completes when p has written and read a * majority of the disks, without reading any block whose mbal * component is greater than dblock[p].mbal." */ log_token(token, "ballot %llu phase1 mbal %llu", (unsigned long long)next_lver, (unsigned long long)our_mbal); memset(&dblock, 0, sizeof(struct paxos_dblock)); dblock.mbal = our_mbal; dblock.lver = next_lver; dblock.checksum = dblock_checksum(&dblock); memset(&bk_max, 0, sizeof(struct paxos_dblock)); num_writes = 0; for (d = 0; d < num_disks; d++) { rv = write_dblock(task, &token->disks[d], token->host_id, &dblock); if (rv < 0) continue; num_writes++; } if (!majority_disks(num_disks, num_writes)) { log_errot(token, "ballot %llu dblock write error %d", (unsigned long long)next_lver, rv); error = SANLK_DBLOCK_WRITE; goto out; } num_reads = 0; for (d = 0; d < num_disks; d++) { disk = &token->disks[d]; if (!iobuf[d]) continue; memset(iobuf[d], 0, iobuf_len); rv = read_iobuf(disk->fd, disk->offset, iobuf[d], iobuf_len, task); if (rv == SANLK_AIO_TIMEOUT) iobuf[d] = NULL; if (rv < 0) continue; num_reads++; for (q = 0; q < num_hosts; q++) { bk = (struct paxos_dblock *)(iobuf[d] + ((2 + q)*sector_size)); rv = verify_dblock(token, bk); if (rv < 0) continue; check_mode_block(token, q, (char 
*)bk); if (bk->lver < dblock.lver) continue; if (bk->lver > dblock.lver) { /* I don't think this should happen */ log_errot(token, "ballot %llu larger1 lver[%d] %llu", (unsigned long long)next_lver, q, (unsigned long long)bk->lver); error = SANLK_DBLOCK_LVER; goto out; } /* see "It aborts the ballot" in comment above */ if (bk->mbal > dblock.mbal) { log_errot(token, "ballot %llu abort1 mbal %llu mbal[%d] %llu", (unsigned long long)next_lver, (unsigned long long)our_mbal, q, (unsigned long long)bk->mbal); error = SANLK_DBLOCK_MBAL; goto out; } /* see choosing inp for phase 2 in comment below */ if (!bk->inp) continue; if (!bk->bal) { log_errot(token, "ballot %llu zero bal inp[%d] %llu", (unsigned long long)next_lver, q, (unsigned long long)bk->inp); continue; } if (bk->bal > bk_max.bal) { bk_max = *bk; q_max = q; } } } if (!majority_disks(num_disks, num_reads)) { log_errot(token, "ballot %llu dblock read error %d", (unsigned long long)next_lver, rv); error = SANLK_DBLOCK_READ; goto out; } /* * "When it completes phase 1, p chooses a new value of dblock[p].inp, * sets dblock[p].bal to dblock[p].mbal (its current ballot number), * and begins phase 2." * * "We now describe how processor p chooses the value of dblock[p].inp * that it tries to commit in phase 2. Let blocksSeen be the set * consisting of dblock[p] and all the records disk[d][q] read by p in * phase 1. Let nonInitBlks be the subset of blocksSeen consisting of * those records whose inp field is not NotAnInput. If nonInitBlks is * empty, then p sets dblock[p].inp to its own input value input[p]. * Otherwise, it sets dblock[p].inp to bk.inp for some record bk in * nonInitBlks having the largest value of bk.bal." 
*/ if (bk_max.inp) { /* lver and mbal are already set */ dblock.inp = bk_max.inp; dblock.inp2 = bk_max.inp2; dblock.inp3 = bk_max.inp3; } else { /* lver and mbal are already set */ dblock.inp = token->host_id; dblock.inp2 = token->host_generation; dblock.inp3 = monotime(); } dblock.bal = dblock.mbal; dblock.checksum = dblock_checksum(&dblock); if (bk_max.inp) { /* not a problem, but interesting to see, so use log_error */ log_errot(token, "ballot %llu choose bk_max[%d] lver %llu mbal %llu bal %llu inp %llu %llu %llu", (unsigned long long)next_lver, q_max, (unsigned long long)bk_max.lver, (unsigned long long)bk_max.mbal, (unsigned long long)bk_max.bal, (unsigned long long)bk_max.inp, (unsigned long long)bk_max.inp2, (unsigned long long)bk_max.inp3); } /* * phase 2 * * Same description as phase 1, same sequence of writes/reads. */ log_token(token, "ballot %llu phase2 bal %llu inp %llu %llu %llu q_max %d", (unsigned long long)dblock.lver, (unsigned long long)dblock.bal, (unsigned long long)dblock.inp, (unsigned long long)dblock.inp2, (unsigned long long)dblock.inp3, q_max); num_writes = 0; for (d = 0; d < num_disks; d++) { rv = write_dblock(task, &token->disks[d], token->host_id, &dblock); if (rv < 0) continue; num_writes++; } if (!majority_disks(num_disks, num_writes)) { log_errot(token, "ballot %llu our dblock write2 error %d", (unsigned long long)next_lver, rv); error = SANLK_DBLOCK_WRITE; goto out; } num_reads = 0; for (d = 0; d < num_disks; d++) { disk = &token->disks[d]; if (!iobuf[d]) continue; memset(iobuf[d], 0, iobuf_len); rv = read_iobuf(disk->fd, disk->offset, iobuf[d], iobuf_len, task); if (rv == SANLK_AIO_TIMEOUT) iobuf[d] = NULL; if (rv < 0) continue; num_reads++; for (q = 0; q < num_hosts; q++) { bk = (struct paxos_dblock *)(iobuf[d] + ((2 + q)*sector_size)); rv = verify_dblock(token, bk); if (rv < 0) continue; if (bk->lver < dblock.lver) continue; if (bk->lver > dblock.lver) { /* * This happens when we choose another host's bk, that host * acquires 
the lease itself, releases it, and reacquires it * with a new lver, all before we get here, at which point * we see the larger lver. I believe case this would always * also be caught the the bk->mbal > dblock.mbal condition * below. */ log_errot(token, "ballot %llu larger2 lver[%d] %llu dblock %llu", (unsigned long long)next_lver, q, (unsigned long long)bk->lver, (unsigned long long)dblock.lver); log_errot(token, "ballot %llu larger2 mbal[%d] %llu dblock %llu", (unsigned long long)next_lver, q, (unsigned long long)bk->mbal, (unsigned long long)dblock.mbal); log_errot(token, "ballot %llu larger2 inp[%d] %llu %llu %llu dblock %llu %llu %llu", (unsigned long long)next_lver, q, (unsigned long long)bk->inp, (unsigned long long)bk->inp2, (unsigned long long)bk->inp3, (unsigned long long)dblock.inp, (unsigned long long)dblock.inp2, (unsigned long long)dblock.inp3); error = SANLK_DBLOCK_LVER; goto out; } /* see "It aborts the ballot" in comment above */ if (bk->mbal > dblock.mbal) { log_errot(token, "ballot %llu abort2 mbal %llu mbal[%d] %llu", (unsigned long long)next_lver, (unsigned long long)our_mbal, q, (unsigned long long)bk->mbal); error = SANLK_DBLOCK_MBAL; goto out; } } } if (!majority_disks(num_disks, num_reads)) { log_errot(token, "ballot %llu dblock read2 error %d", (unsigned long long)next_lver, rv); error = SANLK_DBLOCK_READ; goto out; } /* "When it completes phase 2, p has committed dblock[p].inp." */ memcpy(dblock_out, &dblock, sizeof(struct paxos_dblock)); error = SANLK_OK; out: for (d = 0; d < num_disks; d++) { /* don't free iobufs that have timed out */ if (!iobuf[d]) continue; free(iobuf[d]); } return error; } uint32_t leader_checksum(struct leader_record *lr) { return crc32c((uint32_t)~1, (uint8_t *)lr, LEADER_CHECKSUM_LEN); } static void log_leader_error(int result, struct token *token, struct sync_disk *disk, struct leader_record *lr, const char *caller) { log_errot(token, "leader1 %s error %d sn %.48s rn %.48s", caller ? 
caller : "unknown", result, token->r.lockspace_name, token->r.name); log_errot(token, "leader2 path %s offset %llu fd %d", disk->path, (unsigned long long)disk->offset, disk->fd); log_errot(token, "leader3 m %x v %x ss %u nh %llu mh %llu oi %llu og %llu lv %llu", lr->magic, lr->version, lr->sector_size, (unsigned long long)lr->num_hosts, (unsigned long long)lr->max_hosts, (unsigned long long)lr->owner_id, (unsigned long long)lr->owner_generation, (unsigned long long)lr->lver); log_errot(token, "leader4 sn %.48s rn %.48s ts %llu cs %x", lr->space_name, lr->resource_name, (unsigned long long)lr->timestamp, lr->checksum); log_errot(token, "leader5 wi %llu wg %llu wt %llu", (unsigned long long)lr->write_id, (unsigned long long)lr->write_generation, (unsigned long long)lr->write_timestamp); } static int verify_leader(struct token *token, struct sync_disk *disk, struct leader_record *lr, const char *caller) { struct leader_record leader_rr; uint32_t sum; int result, rv; if (lr->magic != PAXOS_DISK_MAGIC) { log_errot(token, "verify_leader wrong magic %x %s", lr->magic, disk->path); result = SANLK_LEADER_MAGIC; goto fail; } if ((lr->version & 0xFFFF0000) != PAXOS_DISK_VERSION_MAJOR) { log_errot(token, "verify_leader wrong version %x %s", lr->version, disk->path); result = SANLK_LEADER_VERSION; goto fail; } if (lr->sector_size != disk->sector_size) { log_errot(token, "verify_leader wrong sector size %d %d %s", lr->sector_size, disk->sector_size, disk->path); result = SANLK_LEADER_SECTORSIZE; goto fail; } if (strncmp(lr->space_name, token->r.lockspace_name, NAME_ID_SIZE)) { log_errot(token, "verify_leader wrong space name %.48s %.48s %s", lr->space_name, token->r.lockspace_name, disk->path); result = SANLK_LEADER_LOCKSPACE; goto fail; } if (strncmp(lr->resource_name, token->r.name, NAME_ID_SIZE)) { log_errot(token, "verify_leader wrong resource name %.48s %.48s %s", lr->resource_name, token->r.name, disk->path); result = SANLK_LEADER_RESOURCE; goto fail; } if (lr->num_hosts 
< token->host_id) { log_errot(token, "verify_leader num_hosts too small %llu %llu %s", (unsigned long long)lr->num_hosts, (unsigned long long)token->host_id, disk->path); result = SANLK_LEADER_NUMHOSTS; goto fail; } sum = leader_checksum(lr); if (lr->checksum != sum) { log_errot(token, "verify_leader wrong checksum %x %x %s", lr->checksum, sum, disk->path); result = SANLK_LEADER_CHECKSUM; goto fail; } return SANLK_OK; fail: log_leader_error(result, token, disk, lr, caller); memset(&leader_rr, 0, sizeof(leader_rr)); rv = read_sectors(disk, 0, 1, (char *)&leader_rr, sizeof(struct leader_record), NULL, "paxos_verify"); log_leader_error(rv, token, disk, &leader_rr, "paxos_verify"); return result; } static int leaders_match(struct leader_record *a, struct leader_record *b) { if (!memcmp(a, b, LEADER_COMPARE_LEN)) return 1; return 0; } static int _leader_read_one(struct task *task, struct token *token, struct leader_record *leader_ret, const char *caller) { struct leader_record leader; int rv; memset(&leader, 0, sizeof(struct leader_record)); rv = read_leader(task, &token->disks[0], &leader); if (rv < 0) return rv; rv = verify_leader(token, &token->disks[0], &leader, caller); /* copy what we read even if verify finds a problem */ memcpy(leader_ret, &leader, sizeof(struct leader_record)); return rv; } /* TODO: completely untested */ static int _leader_read_num(struct task *task, struct token *token, struct leader_record *leader_ret, const char *caller) { struct leader_record leader; struct leader_record *leaders; int *leader_reps; int leaders_len, leader_reps_len; int num_reads; int num_disks = token->r.num_disks; int rv = 0, d, i, found; int error; leaders_len = num_disks * sizeof(struct leader_record); leader_reps_len = num_disks * sizeof(int); leaders = malloc(leaders_len); if (!leaders) return -ENOMEM; leader_reps = malloc(leader_reps_len); if (!leader_reps) { free(leaders); return -ENOMEM; } /* * find a leader block that's consistent on the majority of disks, * so we 
can use as the basis for the new leader */ memset(&leader, 0, sizeof(struct leader_record)); memset(leaders, 0, leaders_len); memset(leader_reps, 0, leader_reps_len); num_reads = 0; for (d = 0; d < num_disks; d++) { rv = read_leader(task, &token->disks[d], &leaders[d]); if (rv < 0) continue; rv = verify_leader(token, &token->disks[d], &leaders[d], caller); if (rv < 0) continue; num_reads++; leader_reps[d] = 1; /* count how many times the same leader block repeats */ for (i = 0; i < d; i++) { if (leaders_match(&leaders[d], &leaders[i])) { leader_reps[i]++; break; } } } if (!majority_disks(num_disks, num_reads)) { log_errot(token, "%s leader read error %d", caller, rv); error = SANLK_LEADER_READ; goto out; } /* check that a majority of disks have the same leader */ found = 0; for (d = 0; d < num_disks; d++) { if (!majority_disks(num_disks, leader_reps[d])) continue; /* leader on d is the same on a majority of disks, leader becomes the prototype for new_leader */ memcpy(&leader, &leaders[d], sizeof(struct leader_record)); found = 1; break; } if (!found) { log_errot(token, "%s leader inconsistent", caller); error = SANLK_LEADER_DIFF; goto out; } error = SANLK_OK; out: memcpy(leader_ret, &leader, sizeof(struct leader_record)); free(leaders); free(leader_reps); return error; } int paxos_lease_leader_read(struct task *task, struct token *token, struct leader_record *leader_ret, const char *caller) { int rv; /* _leader_read_num works fine for the single disk case, but we can cut out a bunch of stuff when we know there's one disk */ if (token->r.num_disks > 1) rv = _leader_read_num(task, token, leader_ret, caller); else rv = _leader_read_one(task, token, leader_ret, caller); if (rv == SANLK_OK) log_token(token, "%s leader %llu owner %llu %llu %llu", caller, (unsigned long long)leader_ret->lver, (unsigned long long)leader_ret->owner_id, (unsigned long long)leader_ret->owner_generation, (unsigned long long)leader_ret->timestamp); return rv; } static int _lease_read_one(struct 
task *task, struct token *token, struct sync_disk *disk, struct leader_record *leader_ret, struct paxos_dblock *our_dblock, uint64_t *max_mbal, int *max_q, const char *caller) { char *iobuf, **p_iobuf; uint32_t host_id = token->host_id; uint32_t sector_size = disk->sector_size; struct paxos_dblock *bk; uint64_t tmp_mbal = 0; int q, tmp_q = -1, rv, iobuf_len; iobuf_len = direct_align(disk); if (iobuf_len < 0) return iobuf_len; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return rv; memset(iobuf, 0, iobuf_len); rv = read_iobuf(disk->fd, disk->offset, iobuf, iobuf_len, task); if (rv < 0) goto out; memcpy(leader_ret, iobuf, sizeof(struct leader_record)); memcpy(our_dblock, iobuf + ((host_id + 1) * sector_size), sizeof(struct paxos_dblock)); rv = verify_leader(token, disk, leader_ret, caller); if (rv < 0) goto out; for (q = 0; q < leader_ret->num_hosts; q++) { bk = (struct paxos_dblock *)(iobuf + ((2 + q) * sector_size)); rv = verify_dblock(token, bk); if (rv < 0) goto out; if (!tmp_mbal || bk->mbal > tmp_mbal) { tmp_mbal = bk->mbal; tmp_q = q; } } *max_mbal = tmp_mbal; *max_q = tmp_q; out: if (rv != SANLK_AIO_TIMEOUT) free(iobuf); return rv; } /* TODO: completely untested */ static int _lease_read_num(struct task *task, struct token *token, struct leader_record *leader_ret, struct paxos_dblock *our_dblock, uint64_t *max_mbal, int *max_q, const char *caller) { struct paxos_dblock dblock_one; struct leader_record leader_one; struct leader_record *leaders; uint64_t tmp_mbal = 0; uint64_t mbal_one; int *leader_reps; int num_disks = token->r.num_disks; int leaders_len, leader_reps_len; int i, d, rv, found, num_reads, q_one, tmp_q = -1; leaders_len = num_disks * sizeof(struct leader_record); leader_reps_len = num_disks * sizeof(int); leaders = malloc(leaders_len); if (!leaders) return -ENOMEM; leader_reps = malloc(leader_reps_len); if (!leader_reps) { free(leaders); return -ENOMEM; } memset(leaders, 0, leaders_len); 
memset(leader_reps, 0, leader_reps_len); num_reads = 0; for (d = 0; d < num_disks; d++) { rv = _lease_read_one(task, token, &token->disks[d], &leader_one, &dblock_one, &mbal_one, &q_one, caller); if (rv < 0) continue; num_reads++; if (!tmp_mbal || mbal_one > tmp_mbal) { tmp_mbal = mbal_one; tmp_q = q_one; memcpy(our_dblock, &dblock_one, sizeof(struct paxos_dblock)); } memcpy(&leaders[d], &leader_one, sizeof(struct leader_record)); leader_reps[d] = 1; /* count how many times the same leader block repeats */ for (i = 0; i < d; i++) { if (leaders_match(&leaders[d], &leaders[i])) { leader_reps[i]++; break; } } } *max_mbal = tmp_mbal; *max_q = tmp_q; if (!num_reads) { log_errot(token, "%s lease_read_num cannot read disks %d", caller, rv); rv = SANLK_DBLOCK_READ; goto out; } found = 0; for (d = 0; d < num_disks; d++) { if (!majority_disks(num_disks, leader_reps[d])) continue; /* leader on d is the same on a majority of disks, leader becomes the prototype for new_leader */ memcpy(leader_ret, &leaders[d], sizeof(struct leader_record)); found = 1; break; } if (!found) { log_errot(token, "%s lease_read_num leader inconsistent", caller); rv = SANLK_LEADER_DIFF; } out: free(leaders); free(leader_reps); return rv; } /* * read all the initial values needed to start disk paxos: * - the leader record * - our own dblock * - the max mbal from all dblocks * * Read the entire lease area in one i/o and copy all those * values from it. 
*/ static int paxos_lease_read(struct task *task, struct token *token, struct leader_record *leader_ret, uint64_t *max_mbal, const char *caller) { struct paxos_dblock our_dblock; int rv, q = -1; if (token->r.num_disks > 1) rv = _lease_read_num(task, token, leader_ret, &our_dblock, max_mbal, &q, caller); else rv = _lease_read_one(task, token, &token->disks[0], leader_ret, &our_dblock, max_mbal, &q, caller); if (rv == SANLK_OK) log_token(token, "%s leader %llu owner %llu %llu %llu max mbal[%d] %llu " "our_dblock %llu %llu %llu %llu %llu %llu", caller, (unsigned long long)leader_ret->lver, (unsigned long long)leader_ret->owner_id, (unsigned long long)leader_ret->owner_generation, (unsigned long long)leader_ret->timestamp, q, (unsigned long long)*max_mbal, (unsigned long long)our_dblock.mbal, (unsigned long long)our_dblock.bal, (unsigned long long)our_dblock.inp, (unsigned long long)our_dblock.inp2, (unsigned long long)our_dblock.inp3, (unsigned long long)our_dblock.lver); return rv; } static int write_new_leader(struct task *task, struct token *token, struct leader_record *nl, const char *caller) { int num_disks = token->r.num_disks; int num_writes = 0; int error = SANLK_OK; int rv = 0, d; for (d = 0; d < num_disks; d++) { rv = write_leader(task, &token->disks[d], nl); if (rv < 0) continue; num_writes++; } if (!majority_disks(num_disks, num_writes)) { log_errot(token, "%s write_new_leader error %d owner %llu %llu %llu", caller, rv, (unsigned long long)nl->owner_id, (unsigned long long)nl->owner_generation, (unsigned long long)nl->timestamp); error = SANLK_LEADER_WRITE; } return error; } /* * If we hang or crash after completing a ballot successfully, but before * commiting the leader_record, then the next host that runs a ballot (with the * same lver since we did not commit the new lver to the leader_record) will * commit the same inp values that we were about to commit. 
If the inp values * they commit indicate we (who crashed or hung) are the new owner, then the * other hosts will begin monitoring the liveness of our host_id. Once enough * time has passed, they assume we're dead, and go on with new versions. The * "enough time" ensures that if we hung before writing the leader, that we * won't wake up and finally write what will then be an old invalid leader. */ /* * i/o required to acquire a free lease * (1 disk in token, 512 byte sectors, default num_hosts of 2000) * * paxos_lease_acquire() * paxos_lease_read() 1 read 1 MB (entire lease area) * run_ballot() * write_dblock() 1 write 512 bytes (1 dblock sector) * read_iobuf() 1 read 1 MB (round up num_hosts + 2 sectors) * write_dblock() 1 write 512 bytes (1 dblock sector) * read_iobuf() 1 read 1 MB (round up num_hosts + 2 sectors) * write_new_leader() 1 write 512 bytes (1 leader sector) * * 6 i/os = 3 1MB reads, 3 512 byte writes */ int paxos_lease_acquire(struct task *task, struct token *token, uint32_t flags, struct leader_record *leader_ret, uint64_t acquire_lver, int new_num_hosts) { struct sync_disk host_id_disk; struct leader_record host_id_leader; struct leader_record cur_leader; struct leader_record tmp_leader; struct leader_record new_leader; struct paxos_dblock dblock; struct host_status hs; uint64_t wait_start, now; uint64_t last_timestamp; uint64_t next_lver; uint64_t max_mbal; uint64_t num_mbal; uint64_t our_mbal; int copy_cur_leader = 0; int disk_open = 0; int error, rv, us; log_token(token, "paxos_acquire begin %x %llu %d", flags, (unsigned long long)acquire_lver, new_num_hosts); restart: error = paxos_lease_read(task, token, &cur_leader, &max_mbal, "paxos_acquire"); if (error < 0) goto out; if (flags & PAXOS_ACQUIRE_FORCE) { copy_cur_leader = 1; goto run; } if (acquire_lver && cur_leader.lver != acquire_lver) { log_errot(token, "paxos_acquire acquire_lver %llu cur_leader %llu", (unsigned long long)acquire_lver, (unsigned long long)cur_leader.lver); error = 
SANLK_ACQUIRE_LVER; goto out; } if (cur_leader.timestamp == LEASE_FREE) { log_token(token, "paxos_acquire leader %llu free", (unsigned long long)cur_leader.lver); copy_cur_leader = 1; goto run; } if (cur_leader.owner_id == token->host_id && cur_leader.owner_generation == token->host_generation) { log_token(token, "paxos_acquire already owner id %llu gen %llu", (unsigned long long)token->host_id, (unsigned long long)token->host_generation); copy_cur_leader = 1; goto run; } /* * Check if current owner is alive based on its host_id renewals. * If the current owner has been dead long enough we can assume that * its watchdog has triggered and we can go for the paxos lease. */ if (!disk_open) { memset(&host_id_disk, 0, sizeof(host_id_disk)); rv = lockspace_disk(cur_leader.space_name, &host_id_disk); if (rv < 0) { log_errot(token, "paxos_acquire no lockspace info %.48s", cur_leader.space_name); error = SANLK_ACQUIRE_LOCKSPACE; goto out; } host_id_disk.fd = -1; rv = open_disks_fd(&host_id_disk, 1); if (rv < 0) { log_errot(token, "paxos_acquire open host_id_disk error %d", rv); error = SANLK_ACQUIRE_IDDISK; goto out; } disk_open = 1; } rv = host_info(cur_leader.space_name, cur_leader.owner_id, &hs); if (!rv && hs.last_check && hs.last_live && hs.owner_id == cur_leader.owner_id && hs.owner_generation == cur_leader.owner_generation) { wait_start = hs.last_live; last_timestamp = hs.timestamp; } else { wait_start = monotime(); last_timestamp = 0; } log_token(token, "paxos_acquire owner %llu %llu %llu " "host_status %llu %llu %llu wait_start %llu", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)hs.owner_id, (unsigned long long)hs.owner_generation, (unsigned long long)hs.timestamp, (unsigned long long)wait_start); while (1) { error = delta_lease_leader_read(task, &host_id_disk, cur_leader.space_name, cur_leader.owner_id, &host_id_leader, "paxos_acquire"); if (error < 0) { 
log_errot(token, "paxos_acquire owner %llu %llu %llu " "delta read %d fd %d path %s off %llu ss %u", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, error, host_id_disk.fd, host_id_disk.path, (unsigned long long)host_id_disk.offset, host_id_disk.sector_size); goto out; } /* a host_id cannot become free in less than host_dead_seconds after the final renewal because a host_id must first be acquired before being freed, and acquiring cannot take less than host_dead_seconds */ if (host_id_leader.timestamp == LEASE_FREE) { log_token(token, "paxos_acquire owner %llu delta free", (unsigned long long)cur_leader.owner_id); goto run; } /* another host has acquired the host_id of the host that owned this paxos lease; acquiring a host_id also cannot be done in less than host_dead_seconds, or the host_id that owns this lease may be alive, but it owned the lease in a previous generation without freeing it, and no longer owns it */ if (host_id_leader.owner_id != cur_leader.owner_id || host_id_leader.owner_generation > cur_leader.owner_generation) { log_token(token, "paxos_acquire owner %llu %llu %llu " "delta %llu %llu %llu mismatch", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)host_id_leader.owner_id, (unsigned long long)host_id_leader.owner_generation, (unsigned long long)host_id_leader.timestamp); goto run; } if (!last_timestamp) { last_timestamp = host_id_leader.timestamp; goto skip_live_check; } /* the owner is renewing its host_id so it's alive */ if (host_id_leader.timestamp != last_timestamp) { if (flags & PAXOS_ACQUIRE_QUIET_FAIL) { log_token(token, "paxos_acquire owner %llu " "delta %llu %llu %llu alive", (unsigned long long)cur_leader.owner_id, (unsigned long long)host_id_leader.owner_id, (unsigned long long)host_id_leader.owner_generation, (unsigned long 
long)host_id_leader.timestamp); } else { log_errot(token, "paxos_acquire owner %llu " "delta %llu %llu %llu alive", (unsigned long long)cur_leader.owner_id, (unsigned long long)host_id_leader.owner_id, (unsigned long long)host_id_leader.owner_generation, (unsigned long long)host_id_leader.timestamp); } memcpy(leader_ret, &cur_leader, sizeof(struct leader_record)); error = SANLK_ACQUIRE_IDLIVE; goto out; } /* if the owner hasn't renewed its host_id lease for host_dead_seconds then its watchdog should have fired by now */ now = monotime(); if (now - wait_start > task->host_dead_seconds) { log_token(token, "paxos_acquire owner %llu %llu %llu " "delta %llu %llu %llu dead %llu-%llu>%d", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)host_id_leader.owner_id, (unsigned long long)host_id_leader.owner_generation, (unsigned long long)host_id_leader.timestamp, (unsigned long long)now, (unsigned long long)wait_start, task->host_dead_seconds); goto run; } skip_live_check: /* TODO: test with sleep(2) here */ sleep(1); if (external_shutdown) { error = -1; goto out; } error = paxos_lease_leader_read(task, token, &tmp_leader, "paxos_acquire"); if (error < 0) goto out; if (memcmp(&cur_leader, &tmp_leader, sizeof(struct leader_record))) { log_token(token, "paxos_acquire restart leader changed"); goto restart; } } run: /* * Use the disk paxos algorithm to attempt to commit a new leader. * * If we complete a ballot successfully, we can commit a leader record * with next_lver. If we find a higher mbal during a ballot, we increase * our own mbal and try the ballot again. * * next_lver is derived from cur_leader with a zero or timed out owner. * We need to monitor the leader record to see if another host commits * a new leader_record with next_lver. * * TODO: may not need to increase mbal if dblock.inp and inp2 match * current host_id and generation? 
*/ /* This next_lver assignment is based on the original cur_leader, not a re-reading of the leader here, i.e. we cannot just re-read the leader here, and make next_lver one more than that. This is because another node may have made us the owner of next_lver as it is now. */ next_lver = cur_leader.lver + 1; if (!max_mbal) { our_mbal = token->host_id; } else { num_mbal = max_mbal - (max_mbal % cur_leader.max_hosts); our_mbal = num_mbal + cur_leader.max_hosts + token->host_id; } retry_ballot: if (copy_cur_leader) { /* reusing the initial read removes an iop in the common case */ copy_cur_leader = 0; memcpy(&tmp_leader, &cur_leader, sizeof(struct leader_record)); } else { error = paxos_lease_leader_read(task, token, &tmp_leader, "paxos_acquire"); if (error < 0) goto out; } if (tmp_leader.lver == next_lver) { /* * another host has commited a leader_record for next_lver, * check which inp (owner_id) they commited (possibly us). */ if (tmp_leader.owner_id == token->host_id && tmp_leader.owner_generation == token->host_generation) { /* not a problem, but interesting to see, so use log_error */ log_errot(token, "paxos_acquire %llu owner is our inp " "%llu %llu %llu commited by %llu", (unsigned long long)next_lver, (unsigned long long)tmp_leader.owner_id, (unsigned long long)tmp_leader.owner_generation, (unsigned long long)tmp_leader.timestamp, (unsigned long long)tmp_leader.write_id); memcpy(leader_ret, &tmp_leader, sizeof(struct leader_record)); error = SANLK_OK; } else { /* not a problem, but interesting to see, so use log_error */ log_errot(token, "paxos_acquire %llu owner is %llu %llu %llu", (unsigned long long)next_lver, (unsigned long long)tmp_leader.owner_id, (unsigned long long)tmp_leader.owner_generation, (unsigned long long)tmp_leader.timestamp); memcpy(leader_ret, &tmp_leader, sizeof(struct leader_record)); error = SANLK_ACQUIRE_OWNED; } goto out; } if (tmp_leader.lver > next_lver) { /* * A case where this was observed: for next_lver 65 we abort1, and delay. 
* While sleeping, the lease v65 (which was acquired during our abort1) is * released and then reacquired as v66. When we goto retry_ballot, our * next_lver is 65, but the current lver on disk is 66, causing us to * we fail in the larger1 check.) */ log_token(token, "paxos_acquire stale next_lver %llu now %llu owner %llu %llu %llu", (unsigned long long)next_lver, (unsigned long long)tmp_leader.lver, (unsigned long long)tmp_leader.owner_id, (unsigned long long)tmp_leader.owner_generation, (unsigned long long)tmp_leader.timestamp); goto restart; } if (memcmp(&cur_leader, &tmp_leader, sizeof(struct leader_record))) { /* I don't think this should ever happen. */ log_errot(token, "paxos_acquire restart leader changed2"); goto restart; } error = run_ballot(task, token, cur_leader.num_hosts, next_lver, our_mbal, &dblock); if (error == SANLK_DBLOCK_MBAL) { us = get_rand(0, 1000000); if (us < 0) us = token->host_id * 100; /* not a problem, but interesting to see, so use log_error */ log_errot(token, "paxos_acquire %llu retry delay %d us", (unsigned long long)next_lver, us); usleep(us); our_mbal += cur_leader.max_hosts; goto retry_ballot; } if (error < 0) { log_errot(token, "paxos_acquire %llu ballot error %d", (unsigned long long)next_lver, error); goto out; } /* ballot success, commit next_lver with dblock values */ memcpy(&new_leader, &cur_leader, sizeof(struct leader_record)); new_leader.lver = dblock.lver; new_leader.owner_id = dblock.inp; new_leader.owner_generation = dblock.inp2; new_leader.timestamp = dblock.inp3; new_leader.write_id = token->host_id; new_leader.write_generation = token->host_generation; new_leader.write_timestamp = monotime(); if (new_num_hosts) new_leader.num_hosts = new_num_hosts; if (new_leader.owner_id == token->host_id) { /* * The LFL_SHORT_HOLD flag is just a "hint" to help * other nodes be more intelligent about retrying * due to transient failures when acquiring shared * leases. 
Only modify SHORT_HOLD if we're commiting * ourself as the new owner. If we're commiting another * host as owner, we don't know if they are acquiring * shared or not. */ if (flags & PAXOS_ACQUIRE_SHARED) new_leader.flags |= LFL_SHORT_HOLD; else new_leader.flags &= ~LFL_SHORT_HOLD; } new_leader.checksum = leader_checksum(&new_leader); error = write_new_leader(task, token, &new_leader, "paxos_acquire"); if (error < 0) goto out; if (new_leader.owner_id != token->host_id) { /* not a problem, but interesting to see, so use log_error */ log_errot(token, "ballot %llu commit other owner %llu %llu %llu", (unsigned long long)new_leader.lver, (unsigned long long)new_leader.owner_id, (unsigned long long)new_leader.owner_generation, (unsigned long long)new_leader.timestamp); memcpy(leader_ret, &new_leader, sizeof(struct leader_record)); error = SANLK_ACQUIRE_OTHER; goto out; } log_token(token, "ballot %llu commit self owner %llu %llu %llu", (unsigned long long)next_lver, (unsigned long long)new_leader.owner_id, (unsigned long long)new_leader.owner_generation, (unsigned long long)new_leader.timestamp); memcpy(leader_ret, &new_leader, sizeof(struct leader_record)); error = SANLK_OK; out: if (disk_open) close_disks(&host_id_disk, 1); return error; } #if 0 int paxos_lease_renew(struct task *task, struct token *token, struct leader_record *leader_last, struct leader_record *leader_ret) { struct leader_record new_leader; int rv, d; int error; for (d = 0; d < token->r.num_disks; d++) { memset(&new_leader, 0, sizeof(struct leader_record)); rv = read_leader(task, &token->disks[d], &new_leader); if (rv < 0) continue; if (memcmp(&new_leader, leader_last, sizeof(struct leader_record))) { log_errot(token, "leader changed between renewals"); return SANLK_BAD_LEADER; } } new_leader.timestamp = monotime(); new_leader.checksum = leader_checksum(&new_leader); error = write_new_leader(task, token, &new_leader); if (error < 0) goto out; memcpy(leader_ret, &new_leader, sizeof(struct 
leader_record)); out: return error; } #endif int paxos_lease_release(struct task *task, struct token *token, struct leader_record *leader_last, struct leader_record *leader_ret) { struct leader_record leader; int error; error = paxos_lease_leader_read(task, token, &leader, "paxos_release"); if (error < 0) { log_errot(token, "paxos_release leader_read error %d", error); goto out; } if (leader.lver != leader_last->lver) { log_errot(token, "paxos_release %llu other lver %llu", (unsigned long long)leader_last->lver, (unsigned long long)leader.lver); return SANLK_RELEASE_LVER; } if (leader.owner_id != token->host_id || leader.owner_generation != token->host_generation) { log_errot(token, "paxos_release %llu other owner %llu %llu %llu", (unsigned long long)leader_last->lver, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp); return SANLK_RELEASE_OWNER; } if (memcmp(&leader, leader_last, sizeof(struct leader_record))) { /* * This will happen when two hosts finish the same ballot * successfully, the second commiting the same inp values * that the first did, as it should. But the second will * write it's own write_id/gen/timestap, which will differ * from what the first host wrote. So when the first host * rereads here in the release, it will find different * write_id/gen/timestamp from what it wrote. This is * perfectly fine (use log_error since it's interesting * to see when this happens.) 
*/ log_errot(token, "paxos_release %llu leader different " "write %llu %llu %llu vs %llu %llu %llu", (unsigned long long)leader_last->lver, (unsigned long long)leader_last->write_id, (unsigned long long)leader_last->write_generation, (unsigned long long)leader_last->write_timestamp, (unsigned long long)leader.write_id, (unsigned long long)leader.write_generation, (unsigned long long)leader.write_timestamp); /* log_leader_error(0, token, &token->disks[0], leader_last, "paxos_release"); log_leader_error(0, token, &token->disks[0], &leader, "paxos_release"); */ } leader.timestamp = LEASE_FREE; leader.write_id = token->host_id; leader.write_generation = token->host_generation; leader.write_timestamp = monotime(); leader.flags &= ~LFL_SHORT_HOLD; leader.checksum = leader_checksum(&leader); error = write_new_leader(task, token, &leader, "paxos_release"); if (error < 0) goto out; memcpy(leader_ret, &leader, sizeof(struct leader_record)); out: return error; } int paxos_lease_init(struct task *task, struct token *token, int num_hosts, int max_hosts) { char *iobuf, **p_iobuf; struct leader_record *leader; struct request_record *rr; int iobuf_len; int sector_size; int align_size; int aio_timeout = 0; int rv, d; if (!num_hosts) num_hosts = DEFAULT_MAX_HOSTS; if (!max_hosts) max_hosts = DEFAULT_MAX_HOSTS; sector_size = token->disks[0].sector_size; align_size = direct_align(&token->disks[0]); if (align_size < 0) return align_size; if (sector_size * (2 + max_hosts) > align_size) return -E2BIG; iobuf_len = align_size; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return rv; memset(iobuf, 0, iobuf_len); leader = (struct leader_record *)iobuf; leader->magic = PAXOS_DISK_MAGIC; leader->version = PAXOS_DISK_VERSION_MAJOR | PAXOS_DISK_VERSION_MINOR; leader->sector_size = sector_size; leader->num_hosts = num_hosts; leader->max_hosts = max_hosts; leader->timestamp = LEASE_FREE; strncpy(leader->space_name, token->r.lockspace_name, NAME_ID_SIZE); 
strncpy(leader->resource_name, token->r.name, NAME_ID_SIZE); leader->checksum = leader_checksum(leader); rr = (struct request_record *)(iobuf + sector_size); rr->magic = REQ_DISK_MAGIC; rr->version = REQ_DISK_VERSION_MAJOR | REQ_DISK_VERSION_MINOR; for (d = 0; d < token->r.num_disks; d++) { rv = write_iobuf(token->disks[d].fd, token->disks[d].offset, iobuf, iobuf_len, task); if (rv == SANLK_AIO_TIMEOUT) aio_timeout = 1; if (rv < 0) return rv; } if (!aio_timeout) free(iobuf); return 0; } sanlock-2.2/src/sysconfig.sanlock0000644000175100017510000000106011751766670016167 0ustar weberweber# SANLOCKUSER -- the daemon should run as this user # # To run as root user instead of sanlock user #SANLOCKUSER="root" # SANLOCKOPTS -- set the command line options for the sanlock daemon # See sanlock man page for full list of command line options. # # Include "-U sanlock -G sanlock" in the option string unless # also changing the SANLOCKUSER above. # # To disable use of watchdog via wdmd #SANLOCKOPTS="-U sanlock -G sanlock -w 0" # # To disable use of watchdog via wdmd and disable high priority features #SANLOCKOPTS="-U sanlock -G sanlock -w 0 -h 0" sanlock-2.2/src/mode_block.h0000644000175100017510000000102211751766670015054 0ustar weberweber/* * Copyright 2012 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifndef __MODE_BLOCK_H__ #define __MODE_BLOCK_H__ #define MBLOCK_OFFSET 128 /* include paxos_dblock plus padding */ #define MBLOCK_SHARED 0x00000001 struct mode_block { uint32_t flags; uint64_t generation; }; #endif sanlock-2.2/src/paxos_lease.h0000644000175100017510000000251511751766670015271 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __PAXOS_LEASE_H__ #define __PAXOS_LEASE_H__ #define PAXOS_ACQUIRE_FORCE 0x00000001 #define PAXOS_ACQUIRE_QUIET_FAIL 0x00000002 #define PAXOS_ACQUIRE_SHARED 0x00000004 uint32_t leader_checksum(struct leader_record *lr); int paxos_lease_leader_read(struct task *task, struct token *token, struct leader_record *leader_ret, const char *caller); int paxos_lease_acquire(struct task *task, struct token *token, uint32_t flags, struct leader_record *leader_ret, uint64_t acquire_lver, int new_num_hosts); int paxos_lease_release(struct task *task, struct token *token, struct leader_record *leader_last, struct leader_record *leader_ret); int paxos_lease_init(struct task *task, struct token *token, int num_hosts, int max_hosts); int paxos_lease_request_read(struct task *task, struct token *token, struct request_record *rr); int paxos_lease_request_write(struct task *task, struct token *token, struct request_record *rr); #endif sanlock-2.2/src/lockspace.c0000644000175100017510000005051311751766670014726 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "sanlock_sock.h" #include "diskio.h" #include "log.h" #include "delta_lease.h" #include "lockspace.h" #include "resource.h" #include "watchdog.h" #include "task.h" #include "direct.h" static uint32_t space_id_counter = 1; static struct space *_search_space(char *name, struct sync_disk *disk, uint64_t host_id, struct list_head *head1, struct list_head *head2, struct list_head *head3) { struct space *sp; if (head1) { list_for_each_entry(sp, head1, list) { if (name && strncmp(sp->space_name, name, NAME_ID_SIZE)) continue; if (disk && strncmp(sp->host_id_disk.path, disk->path, SANLK_PATH_LEN)) continue; if (disk && sp->host_id_disk.offset != disk->offset) continue; if (host_id && sp->host_id != host_id) continue; return sp; } } if (head2) { list_for_each_entry(sp, head2, list) { if (name && strncmp(sp->space_name, name, NAME_ID_SIZE)) continue; if (disk && strncmp(sp->host_id_disk.path, disk->path, SANLK_PATH_LEN)) continue; if (disk && sp->host_id_disk.offset != disk->offset) continue; if (host_id && sp->host_id != host_id) continue; return sp; } } if (head3) { list_for_each_entry(sp, head3, list) { if (name && strncmp(sp->space_name, name, NAME_ID_SIZE)) continue; if (disk && strncmp(sp->host_id_disk.path, disk->path, SANLK_PATH_LEN)) continue; if (disk && sp->host_id_disk.offset != disk->offset) continue; if (host_id && sp->host_id != host_id) continue; return sp; } } return NULL; } struct space *find_lockspace(char *name) { return _search_space(name, NULL, 0, &spaces, &spaces_rem, &spaces_add); } int _lockspace_info(char *space_name, struct space *sp_out) { struct space *sp; list_for_each_entry(sp, &spaces, list) { if (strncmp(sp->space_name, space_name, NAME_ID_SIZE)) continue; memcpy(sp_out, sp, sizeof(struct space)); return 0; } return -1; } int lockspace_info(char *space_name, 
struct space *sp_out) { int rv; pthread_mutex_lock(&spaces_mutex); rv = _lockspace_info(space_name, sp_out); pthread_mutex_unlock(&spaces_mutex); return rv; } int lockspace_disk(char *space_name, struct sync_disk *disk) { struct space space; int rv; pthread_mutex_lock(&spaces_mutex); rv = _lockspace_info(space_name, &space); if (!rv) { memcpy(disk, &space.host_id_disk, sizeof(struct sync_disk)); disk->fd = -1; } pthread_mutex_unlock(&spaces_mutex); return rv; } #if 0 static void clear_bit(int host_id, char *bitmap) { char *byte = bitmap + ((host_id - 1) / 8); unsigned int bit = host_id % 8; *byte &= ~bit; } #endif void set_id_bit(int host_id, char *bitmap, char *c) { char *byte = bitmap + ((host_id - 1) / 8); unsigned int bit = (host_id - 1) % 8; char mask; mask = 1 << bit; *byte |= mask; if (c) *c = *byte; } /* FIXME: another copy in direct_lib.c */ int test_id_bit(int host_id, char *bitmap) { char *byte = bitmap + ((host_id - 1) / 8); unsigned int bit = (host_id - 1) % 8; char mask; mask = 1 << bit; return (*byte & mask); } int host_status_set_bit(char *space_name, uint64_t host_id) { struct space *sp; int found = 0; if (!host_id || host_id > DEFAULT_MAX_HOSTS) return -EINVAL; pthread_mutex_lock(&spaces_mutex); list_for_each_entry(sp, &spaces, list) { if (strncmp(sp->space_name, space_name, NAME_ID_SIZE)) continue; found = 1; break; } pthread_mutex_unlock(&spaces_mutex); if (!found) return -ENOSPC; pthread_mutex_lock(&sp->mutex); sp->host_status[host_id-1].set_bit_time = monotime(); pthread_mutex_unlock(&sp->mutex); return 0; } int host_info(char *space_name, uint64_t host_id, struct host_status *hs_out) { struct space *sp; int found = 0; if (!host_id || host_id > DEFAULT_MAX_HOSTS) return -EINVAL; pthread_mutex_lock(&spaces_mutex); list_for_each_entry(sp, &spaces, list) { if (strncmp(sp->space_name, space_name, NAME_ID_SIZE)) continue; memcpy(hs_out, &sp->host_status[host_id-1], sizeof(struct host_status)); found = 1; break; } 
pthread_mutex_unlock(&spaces_mutex); if (!found) return -ENOSPC; return 0; } static void create_bitmap(struct task *task, struct space *sp, char *bitmap) { uint64_t now; int i; char c; now = monotime(); pthread_mutex_lock(&sp->mutex); for (i = 0; i < DEFAULT_MAX_HOSTS; i++) { if (i+1 == sp->host_id) continue; if (!sp->host_status[i].set_bit_time) continue; if (now - sp->host_status[i].set_bit_time > task->request_finish_seconds) { log_space(sp, "bitmap clear host_id %d", i+1); sp->host_status[i].set_bit_time = 0; } else { set_id_bit(i+1, bitmap, &c); log_space(sp, "bitmap set host_id %d byte %x", i+1, c); } } pthread_mutex_unlock(&sp->mutex); } void check_other_leases(struct task *task, struct space *sp, char *buf) { struct leader_record *leader; struct sync_disk *disk; struct host_status *hs; char *bitmap; uint64_t now; int i, new; disk = &sp->host_id_disk; now = monotime(); new = 0; for (i = 0; i < DEFAULT_MAX_HOSTS; i++) { hs = &sp->host_status[i]; hs->last_check = now; if (!hs->first_check) hs->first_check = now; leader = (struct leader_record *)(buf + (i * disk->sector_size)); if (hs->owner_id == leader->owner_id && hs->owner_generation == leader->owner_generation && hs->timestamp == leader->timestamp) { continue; } hs->owner_id = leader->owner_id; hs->owner_generation = leader->owner_generation; hs->timestamp = leader->timestamp; hs->last_live = now; if (i+1 == sp->host_id) continue; bitmap = (char *)leader + HOSTID_BITMAP_OFFSET; if (!test_id_bit(sp->host_id, bitmap)) continue; /* this host has made a request for us, we won't take a new request from this host for another request_finish_seconds */ if (now - hs->last_req < task->request_finish_seconds) continue; log_space(sp, "request from host_id %d", i+1); hs->last_req = now; new = 1; } if (new) set_resource_examine(sp->space_name, NULL); } /* * check if our_host_id_thread has renewed within timeout */ int check_our_lease(struct task *task, struct space *sp, int *check_all, char *check_buf) { uint64_t 
last_success; int corrupt_result; int gap; pthread_mutex_lock(&sp->mutex); last_success = sp->lease_status.renewal_last_success; corrupt_result = sp->lease_status.corrupt_result; if (sp->lease_status.renewal_read_count > sp->lease_status.renewal_read_check) { /* main loop will pass this buf to check_other_leases next */ sp->lease_status.renewal_read_check = sp->lease_status.renewal_read_count; *check_all = 1; if (check_buf) memcpy(check_buf, sp->lease_status.renewal_read_buf, sp->align_size); } pthread_mutex_unlock(&sp->mutex); if (corrupt_result) { log_erros(sp, "check_our_lease corrupt %d", corrupt_result); return -1; } gap = monotime() - last_success; if (gap >= task->id_renewal_fail_seconds) { log_erros(sp, "check_our_lease failed %d", gap); return -1; } if (gap >= task->id_renewal_warn_seconds) { log_erros(sp, "check_our_lease warning %d last_success %llu", gap, (unsigned long long)last_success); } if (com.debug_renew > 1) { log_space(sp, "check_our_lease good %d %llu", gap, (unsigned long long)last_success); } return 0; } /* If a renewal result is one of the listed errors, it means our delta lease has been corrupted/overwritten/reinitialized out from under us, and we should stop using it immediately. There's no point in retrying the renewal. 
*/ static int corrupt_result(int result) { switch (result) { case SANLK_RENEW_OWNER: case SANLK_RENEW_DIFF: case SANLK_LEADER_MAGIC: case SANLK_LEADER_VERSION: case SANLK_LEADER_SECTORSIZE: case SANLK_LEADER_LOCKSPACE: case SANLK_LEADER_CHECKSUM: return result; default: return 0; } } static void *lockspace_thread(void *arg_in) { char bitmap[HOSTID_BITMAP_SIZE]; struct task task; struct space *sp; struct leader_record leader; uint64_t delta_begin, last_success; int rv, delta_length, renewal_interval; int acquire_result, delta_result, read_result; int opened = 0; int stop = 0; sp = (struct space *)arg_in; memset(&task, 0, sizeof(struct task)); setup_task_timeouts(&task, main_task.io_timeout_seconds); setup_task_aio(&task, main_task.use_aio, HOSTID_AIO_CB_SIZE); memcpy(task.name, sp->space_name, NAME_ID_SIZE); delta_begin = monotime(); rv = open_disk(&sp->host_id_disk); if (rv < 0) { log_erros(sp, "open_disk %s error %d", sp->host_id_disk.path, rv); acquire_result = -ENODEV; delta_result = -1; goto set_status; } opened = 1; sp->align_size = direct_align(&sp->host_id_disk); if (sp->align_size < 0) { log_erros(sp, "direct_align error"); acquire_result = sp->align_size; delta_result = -1; goto set_status; } sp->lease_status.renewal_read_buf = malloc(sp->align_size); if (!sp->lease_status.renewal_read_buf) { acquire_result = -ENOMEM; delta_result = -1; goto set_status; } /* * acquire the delta lease */ delta_begin = monotime(); delta_result = delta_lease_acquire(&task, sp, &sp->host_id_disk, sp->space_name, our_host_name_global, sp->host_id, &leader); delta_length = monotime() - delta_begin; if (delta_result == SANLK_OK) last_success = leader.timestamp; acquire_result = delta_result; /* we need to start the watchdog after we acquire the host_id but before we allow any pid's to begin running */ if (delta_result == SANLK_OK) { rv = create_watchdog_file(sp, last_success); if (rv < 0) { log_erros(sp, "create_watchdog failed %d", rv); acquire_result = SANLK_ERROR; } } 
set_status: pthread_mutex_lock(&sp->mutex); sp->lease_status.acquire_last_result = acquire_result; sp->lease_status.acquire_last_attempt = delta_begin; if (delta_result == SANLK_OK) sp->lease_status.acquire_last_success = last_success; sp->lease_status.renewal_last_result = acquire_result; sp->lease_status.renewal_last_attempt = delta_begin; if (delta_result == SANLK_OK) sp->lease_status.renewal_last_success = last_success; pthread_mutex_unlock(&sp->mutex); if (acquire_result < 0) goto out; sp->host_generation = leader.owner_generation; while (1) { pthread_mutex_lock(&sp->mutex); stop = sp->thread_stop; pthread_mutex_unlock(&sp->mutex); if (stop) break; /* * wait between each renewal */ if (monotime() - last_success < task.id_renewal_seconds) { sleep(1); continue; } else { /* don't spin too quickly if renew is failing immediately and repeatedly */ usleep(500000); } /* * do a renewal, measuring length of time spent in renewal, * and the length of time between successful renewals */ memset(bitmap, 0, sizeof(bitmap)); create_bitmap(&task, sp, bitmap); delta_begin = monotime(); delta_result = delta_lease_renew(&task, sp, &sp->host_id_disk, sp->space_name, bitmap, delta_result, &read_result, &leader, &leader); delta_length = monotime() - delta_begin; if (delta_result == SANLK_OK) { renewal_interval = leader.timestamp - last_success; last_success = leader.timestamp; } /* * publish the results */ pthread_mutex_lock(&sp->mutex); sp->lease_status.renewal_last_result = delta_result; sp->lease_status.renewal_last_attempt = delta_begin; if (delta_result == SANLK_OK) sp->lease_status.renewal_last_success = last_success; if (delta_result != SANLK_OK && !sp->lease_status.corrupt_result) sp->lease_status.corrupt_result = corrupt_result(delta_result); if (read_result == SANLK_OK && task.iobuf) { memcpy(sp->lease_status.renewal_read_buf, task.iobuf, sp->align_size); sp->lease_status.renewal_read_count++; } /* * pet the watchdog * (don't update on thread_stop because it's probably 
unlinked) */ if (delta_result == SANLK_OK && !sp->thread_stop) update_watchdog_file(sp, last_success); pthread_mutex_unlock(&sp->mutex); /* * log the results */ if (delta_result != SANLK_OK) { log_erros(sp, "renewal error %d delta_length %d last_success %llu", delta_result, delta_length, (unsigned long long)last_success); } else if (delta_length > task.id_renewal_seconds) { log_erros(sp, "renewed %llu delta_length %d too long", (unsigned long long)last_success, delta_length); } else if (com.debug_renew) { log_space(sp, "renewed %llu delta_length %d interval %d", (unsigned long long)last_success, delta_length, renewal_interval); } } /* watchdog unlink was done in main_loop when thread_stop was set, to get it done as quickly as possible in case the wd is about to fire. */ close_watchdog_file(sp); out: if (delta_result == SANLK_OK) delta_lease_release(&task, sp, &sp->host_id_disk, sp->space_name, &leader, &leader); if (opened) close(sp->host_id_disk.fd); close_task_aio(&task); return NULL; } static void free_sp(struct space *sp) { if (sp->lease_status.renewal_read_buf) free(sp->lease_status.renewal_read_buf); free(sp); } int add_lockspace_start(struct sanlk_lockspace *ls, struct space **sp_out) { struct space *sp, *sp2; int rv; if (!ls->name[0] || !ls->host_id || !ls->host_id_disk.path[0]) { log_error("add_lockspace bad args id %llu name %zu path %zu", (unsigned long long)ls->host_id, strlen(ls->name), strlen(ls->host_id_disk.path)); return -EINVAL; } sp = malloc(sizeof(struct space)); if (!sp) return -ENOMEM; memset(sp, 0, sizeof(struct space)); memcpy(sp->space_name, ls->name, NAME_ID_SIZE); memcpy(&sp->host_id_disk, &ls->host_id_disk, sizeof(struct sanlk_disk)); sp->host_id_disk.sector_size = 0; sp->host_id_disk.fd = -1; sp->host_id = ls->host_id; pthread_mutex_init(&sp->mutex, NULL); pthread_mutex_lock(&spaces_mutex); /* search all lists for an identical lockspace */ sp2 = _search_space(sp->space_name, &sp->host_id_disk, sp->host_id, &spaces, NULL, NULL); if (sp2) 
{ pthread_mutex_unlock(&spaces_mutex); rv = -EEXIST; goto fail_free; } sp2 = _search_space(sp->space_name, &sp->host_id_disk, sp->host_id, &spaces_add, NULL, NULL); if (sp2) { pthread_mutex_unlock(&spaces_mutex); rv = -EINPROGRESS; goto fail_free; } sp2 = _search_space(sp->space_name, &sp->host_id_disk, sp->host_id, &spaces_rem, NULL, NULL); if (sp2) { pthread_mutex_unlock(&spaces_mutex); rv = -EAGAIN; goto fail_free; } /* search all lists for a lockspace with the same name */ sp2 = _search_space(sp->space_name, NULL, 0, &spaces, &spaces_add, &spaces_rem); if (sp2) { pthread_mutex_unlock(&spaces_mutex); rv = -EINVAL; goto fail_free; } /* search all lists for a lockspace with the same host_id_disk */ sp2 = _search_space(NULL, &sp->host_id_disk, 0, &spaces, &spaces_add, &spaces_rem); if (sp2) { pthread_mutex_unlock(&spaces_mutex); rv = -EINVAL; goto fail_free; } sp->space_id = space_id_counter++; list_add(&sp->list, &spaces_add); pthread_mutex_unlock(&spaces_mutex); /* save a record of what this space_id is for later debugging */ log_level(sp->space_id, 0, NULL, LOG_WARNING, "lockspace %.48s:%llu:%.256s:%llu", sp->space_name, (unsigned long long)sp->host_id, sp->host_id_disk.path, (unsigned long long)sp->host_id_disk.offset); rv = pthread_create(&sp->thread, NULL, lockspace_thread, sp); if (rv < 0) { log_erros(sp, "add_lockspace create thread failed"); goto fail_del; } *sp_out = sp; return 0; fail_del: pthread_mutex_lock(&spaces_mutex); list_del(&sp->list); pthread_mutex_unlock(&spaces_mutex); fail_free: free_sp(sp); return rv; } int add_lockspace_wait(struct space *sp) { int rv, result; while (1) { pthread_mutex_lock(&sp->mutex); result = sp->lease_status.acquire_last_result; pthread_mutex_unlock(&sp->mutex); if (result) break; sleep(1); } if (result != SANLK_OK) { /* the thread exits right away if acquire fails */ pthread_join(sp->thread, NULL); rv = result; goto fail_del; } /* once we move sp to spaces list, tokens can begin using it, and the main loop will begin 
monitoring its renewals */ pthread_mutex_lock(&spaces_mutex); if (sp->external_remove || external_shutdown) { rv = -1; pthread_mutex_unlock(&spaces_mutex); goto fail_del; } list_move(&sp->list, &spaces); pthread_mutex_unlock(&spaces_mutex); return 0; fail_del: pthread_mutex_lock(&spaces_mutex); list_del(&sp->list); pthread_mutex_unlock(&spaces_mutex); free_sp(sp); return rv; } int inq_lockspace(struct sanlk_lockspace *ls) { int rv; struct space *sp; pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces, NULL, NULL); if (sp) { rv = 0; goto out; } else { rv = -ENOENT; } sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces_add, &spaces_rem, NULL); if (sp) rv = -EINPROGRESS; out: pthread_mutex_unlock(&spaces_mutex); return rv; } int rem_lockspace_start(struct sanlk_lockspace *ls, unsigned int *space_id) { struct space *sp; unsigned int id; int rv; pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces_rem, NULL, NULL); if (sp) { pthread_mutex_unlock(&spaces_mutex); rv = -EINPROGRESS; goto out; } sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces_add, NULL, NULL); if (sp) { sp->external_remove = 1; pthread_mutex_unlock(&spaces_mutex); rv = 0; goto out; } sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces, NULL, NULL); if (!sp) { pthread_mutex_unlock(&spaces_mutex); rv = -ENOENT; goto out; } /* * Removal happens in a round about way: * - we set external_remove * - main_loop sees external_remove and sets space_dead, killing_pids * - main_loop sees killing_pids and all pids dead, sets thread_stop, * and moves sp from spaces to spaces_rem * - main_loop calls free_lockspaces(0), which joins any * lockspace_thread that is done, and then frees sp * * Once we release spaces_mutex, the sp could be freed any time, * so we can't touch 
it. Use its space_id to check for completion. */ sp->external_remove = 1; id = sp->space_id; pthread_mutex_unlock(&spaces_mutex); *space_id = id; rv = 0; out: return rv; } /* check for matching space_id in case the lockspace is added again */ int rem_lockspace_wait(struct sanlk_lockspace *ls, unsigned int space_id) { struct space *sp; int done; while (1) { pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces, &spaces_rem, NULL); if (sp && (sp->space_id == space_id)) done = 0; else done = 1; pthread_mutex_unlock(&spaces_mutex); if (done) break; sleep(1); } return 0; } /* * we call stop_host_id() when all pids are gone and we're in a safe state, so * it's safe to unlink the watchdog right away here. We want to sp the unlink * as soon as it's safe, so we can reduce the chance we get killed by the * watchdog (we could actually call this in main_loop just before the break). * Getting this unlink done quickly is more important than doing at the more * "logical" point commented above in host_id_thread. */ static int stop_lockspace_thread(struct space *sp, int wait) { int stop, rv; pthread_mutex_lock(&sp->mutex); stop = sp->thread_stop; sp->thread_stop = 1; pthread_mutex_unlock(&sp->mutex); if (!stop) { /* should never happen */ log_erros(sp, "stop_lockspace_thread zero thread_stop"); return -EINVAL; } if (wait) rv = pthread_join(sp->thread, NULL); else rv = pthread_tryjoin_np(sp->thread, NULL); return rv; } void free_lockspaces(int wait) { struct space *sp, *safe; int rv; pthread_mutex_lock(&spaces_mutex); list_for_each_entry_safe(sp, safe, &spaces_rem, list) { rv = stop_lockspace_thread(sp, wait); if (!rv) { log_space(sp, "free lockspace"); list_del(&sp->list); free_sp(sp); } } pthread_mutex_unlock(&spaces_mutex); } sanlock-2.2/src/sanlock_sock.c0000644000175100017510000000135511751766670015433 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. 
* * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_sock.h" int sanlock_socket_address(struct sockaddr_un *addr) { memset(addr, 0, sizeof(struct sockaddr_un)); addr->sun_family = AF_LOCAL; snprintf(addr->sun_path, sizeof(addr->sun_path) - 1, "%s/%s", SANLK_RUN_DIR, SANLK_SOCKET_NAME); return 0; } sanlock-2.2/src/lockfile.c0000644000175100017510000000371411751766670014553 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "sanlock_sock.h" #include "log.h" #include "lockfile.h" int lockfile(const char *dir, const char *name) { char path[PATH_MAX]; char buf[16]; struct flock lock; mode_t old_umask; int fd, rv; old_umask = umask(0022); rv = mkdir(SANLK_RUN_DIR, 0777); if (rv < 0 && errno != EEXIST) { umask(old_umask); return rv; } umask(old_umask); snprintf(path, PATH_MAX, "%s/%s", dir, name); fd = open(path, O_CREAT|O_WRONLY|O_CLOEXEC, 0666); if (fd < 0) { log_error("lockfile open error %s: %s", path, strerror(errno)); return -1; } lock.l_type = F_WRLCK; lock.l_start = 0; lock.l_whence = SEEK_SET; lock.l_len = 0; rv = fcntl(fd, F_SETLK, &lock); if (rv < 0) { log_error("lockfile setlk error %s: %s", path, strerror(errno)); goto fail; } rv = ftruncate(fd, 0); if (rv < 0) { log_error("lockfile truncate error %s: %s", path, strerror(errno)); goto 
fail; } memset(buf, 0, sizeof(buf)); snprintf(buf, sizeof(buf), "%d\n", getpid()); rv = write(fd, buf, strlen(buf)); if (rv <= 0) { log_error("lockfile write error %s: %s", path, strerror(errno)); goto fail; } return fd; fail: close(fd); return -1; } void unlink_lockfile(int fd, const char *dir, const char *name) { char path[PATH_MAX]; snprintf(path, PATH_MAX, "%s/%s", dir, name); unlink(path); close(fd); } sanlock-2.2/src/watchdog.c0000644000175100017510000000746711751766670014574 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "log.h" #include "watchdog.h" /* * Purpose of watchdog: to forcibly reset the host in the case where a * supervised pid is running but sanlock daemon does not renew its lease * and does not kill the pid (or it kills the pid but the pid does not * exit). So, just before the pid begins running with granted leases, * /dev/watchdog needs to be armed to reboot the host if things go bad right * after the pid goes ahead. 
*/ #include "../wdmd/wdmd.h" static int daemon_wdmd_con; void update_watchdog_file(struct space *sp, uint64_t timestamp) { int rv; if (!com.use_watchdog) return; rv = wdmd_test_live(sp->wd_fd, timestamp, timestamp + main_task.id_renewal_fail_seconds); if (rv < 0) log_erros(sp, "wdmd_test_live failed %d", rv); } int create_watchdog_file(struct space *sp, uint64_t timestamp) { char name[WDMD_NAME_SIZE]; int con, rv; if (!com.use_watchdog) return 0; con = wdmd_connect(); if (con < 0) { log_erros(sp, "wdmd connect failed %d", con); goto fail; } memset(name, 0, sizeof(name)); snprintf(name, WDMD_NAME_SIZE - 1, "sanlock_%s_hostid%llu", sp->space_name, (unsigned long long)sp->host_id); rv = wdmd_register(con, name); if (rv < 0) { log_erros(sp, "wdmd register failed %d", rv); goto fail_close; } rv = wdmd_test_live(con, timestamp, timestamp + main_task.id_renewal_fail_seconds); if (rv < 0) { log_erros(sp, "wdmd_test_live failed %d", rv); goto fail_close; } sp->wd_fd = con; return 0; fail_close: close(con); fail: return -1; } void unlink_watchdog_file(struct space *sp) { int rv; if (!com.use_watchdog) return; log_space(sp, "wdmd_test_live 0 0 to disable"); rv = wdmd_test_live(sp->wd_fd, 0, 0); if (rv < 0) log_erros(sp, "wdmd_test_live failed %d", rv); } void close_watchdog_file(struct space *sp) { if (!com.use_watchdog) return; close(sp->wd_fd); } void close_watchdog(void) { if (!com.use_watchdog) return; wdmd_refcount_clear(daemon_wdmd_con); close(daemon_wdmd_con); } /* TODO: add wdmd connection as client so poll detects if it fails? 
*/ int setup_watchdog(void) { char name[WDMD_NAME_SIZE]; int test_interval, fire_timeout; uint64_t last_keepalive; int con, rv; if (!com.use_watchdog) return 0; memset(name, 0, sizeof(name)); snprintf(name, WDMD_NAME_SIZE - 1, "%s", "sanlock_daemon"); con = wdmd_connect(); if (con < 0) { log_error("wdmd connect failed for watchdog handling"); goto fail; } rv = wdmd_register(con, name); if (rv < 0) { log_error("wdmd register failed"); goto fail_close; } rv = wdmd_refcount_set(con); if (rv < 0) { log_error("wdmd refcount failed"); goto fail_close; } rv = wdmd_status(con, &test_interval, &fire_timeout, &last_keepalive); if (rv < 0) { log_error("wdmd status failed"); goto fail_clear; } log_debug("wdmd test_interval %d fire_timeout %d last_keepalive %llu", test_interval, fire_timeout, (unsigned long long)last_keepalive); if (fire_timeout != WATCHDOG_FIRE_TIMEOUT) { log_error("invalid watchdog fire_timeout %d vs %d", fire_timeout, WATCHDOG_FIRE_TIMEOUT); goto fail_clear; } daemon_wdmd_con = con; return 0; fail_clear: wdmd_refcount_clear(con); fail_close: close(con); fail: return -1; } sanlock-2.2/src/cmd.h0000644000175100017510000000123711751766670013531 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __CMD_H__ #define __CMD_H__ struct cmd_args { struct list_head list; /* thread_pool data */ int ci_in; int ci_target; int cl_fd; int cl_pid; struct sm_header header; }; /* cmds processed by thread pool */ void call_cmd_thread(struct task *task, struct cmd_args *ca); /* cmds processed by main loop */ void call_cmd_daemon(int ci, struct sm_header *h_recv, int client_maxi); #endif sanlock-2.2/src/Makefile0000644000175100017510000000647311751766670014264 0ustar weberweber# Copyright 2010-2011 Red Hat, Inc. 
# # This copyrighted material is made available to anyone wishing to use, # modify, copy, or redistribute it subject to the terms and conditions # of the GNU General Public License v2 or (at your option) any later version. CMD_TARGET = sanlock HEADER_TARGET = sanlock.h sanlock_rv.h sanlock_resource.h sanlock_admin.h sanlock_direct.h MAN_TARGET = sanlock.8 SOMAJOR=1 SOMINOR=0 LIB_ENTIRE_TARGET = libsanlock LIB_CLIENT_TARGET = libsanlock_client LIBSO_ENTIRE_TARGET = $(LIB_ENTIRE_TARGET).so.$(SOMAJOR).$(SOMINOR) LIBSO_CLIENT_TARGET = $(LIB_CLIENT_TARGET).so.$(SOMAJOR).$(SOMINOR) CMD_SOURCE = \ crc32c.c \ delta_lease.c \ direct.c \ diskio.c \ lockspace.c \ lockfile.c \ log.c \ main.c \ paxos_lease.c \ task.c \ resource.c \ watchdog.c \ monotime.c \ cmd.c \ client_cmd.c \ sanlock_sock.c LIB_ENTIRE_SOURCE = \ client.c \ sanlock_sock.c \ crc32c.c \ diskio.c \ delta_lease.c \ paxos_lease.c \ direct.c \ task.c \ direct_lib.c \ monotime.c LIB_CLIENT_SOURCE = \ client.c \ sanlock_sock.c CFLAGS += -D_GNU_SOURCE -g \ -Wall \ -Wformat \ -Wformat-security \ -Wmissing-prototypes \ -Wnested-externs \ -Wpointer-arith \ -Wextra -Wshadow \ -Wcast-align \ -Wwrite-strings \ -Waggregate-return \ -Wstrict-prototypes \ -Winline \ -Wredundant-decls \ -Wno-sign-compare \ -Wp,-D_FORTIFY_SOURCE=2 \ -fexceptions \ -fasynchronous-unwind-tables \ -fdiagnostics-show-option CMD_CFLAGS = $(CFLAGS) -fPIE -DPIE CMD_LDFLAGS += -Wl,-z,now -Wl,-z,relro -pie CMD_LDADD += -lpthread -luuid -lrt -laio -lblkid -lsanlock -L../wdmd -lwdmd LIB_ENTIRE_LDFLAGS += -lpthread -lrt -laio -lblkid -L../wdmd -lwdmd LIB_ENTIRE_LDFLAGS += -Wl,-z,relro -pie LIB_CLIENT_LDFLAGS += -Wl,-z,relro -pie all: $(LIBSO_ENTIRE_TARGET) $(LIBSO_CLIENT_TARGET) $(CMD_TARGET) $(LIBSO_ENTIRE_TARGET): $(LIB_ENTIRE_SOURCE) $(CC) $(CFLAGS) $(LIB_ENTIRE_LDFLAGS) -shared -fPIC -o $@ -Wl,-soname=$(LIB_ENTIRE_TARGET).so.$(SOMAJOR) $^ ln -sf $(LIBSO_ENTIRE_TARGET) $(LIB_ENTIRE_TARGET).so ln -sf $(LIBSO_ENTIRE_TARGET) 
$(LIB_ENTIRE_TARGET).so.$(SOMAJOR) $(LIBSO_CLIENT_TARGET): $(LIB_CLIENT_SOURCE) $(CC) $(CFLAGS) $(LIB_CLIENT_LDFLAGS) -shared -fPIC -o $@ -Wl,-soname=$(LIB_CLIENT_TARGET).so.$(SOMAJOR) $^ ln -sf $(LIBSO_CLIENT_TARGET) $(LIB_CLIENT_TARGET).so ln -sf $(LIBSO_CLIENT_TARGET) $(LIB_CLIENT_TARGET).so.$(SOMAJOR) $(CMD_TARGET): $(LIBSO_ENTIRE_TARGET) $(CMD_SOURCE) $(CC) $(CMD_CFLAGS) $(CMD_LDFLAGS) $(CMD_SOURCE) $(CMD_LDADD) -o $@ -L. clean: rm -f *.o *.so *.so.* $(CMD_TARGET) INSTALL=$(shell which install) DESTDIR= BINDIR=/usr/sbin LIBDIR=/usr/lib64 HEADIR=/usr/include MANDIR=/usr/share/man .PHONY: install install: all $(INSTALL) -d $(DESTDIR)/$(BINDIR) $(INSTALL) -d $(DESTDIR)/$(LIBDIR) $(INSTALL) -d $(DESTDIR)/$(HEADIR) $(INSTALL) -d $(DESTDIR)/$(MANDIR)/man8 $(INSTALL) -c -m 755 $(CMD_TARGET) $(DESTDIR)/$(BINDIR) $(INSTALL) -c -m 755 $(LIBSO_ENTIRE_TARGET) $(DESTDIR)/$(LIBDIR) $(INSTALL) -c -m 755 $(LIBSO_CLIENT_TARGET) $(DESTDIR)/$(LIBDIR) cp -a $(LIB_ENTIRE_TARGET).so $(DESTDIR)/$(LIBDIR) cp -a $(LIB_CLIENT_TARGET).so $(DESTDIR)/$(LIBDIR) cp -a $(LIB_ENTIRE_TARGET).so.$(SOMAJOR) $(DESTDIR)/$(LIBDIR) cp -a $(LIB_CLIENT_TARGET).so.$(SOMAJOR) $(DESTDIR)/$(LIBDIR) $(INSTALL) -c -m 644 $(HEADER_TARGET) $(DESTDIR)/$(HEADIR) $(INSTALL) -m 644 $(MAN_TARGET) $(DESTDIR)/$(MANDIR)/man8/ sanlock-2.2/src/lockspace.h0000644000175100017510000000242511751766670014732 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
#ifndef __HOST_ID_H__
#define __HOST_ID_H__
/*
 * Fix: the guard macro previously defined was __HOST_ID__H__ (extra
 * underscore), which never matched the #ifndef above, so the include
 * guard was ineffective and the header could be multiply included.
 */

struct space *find_lockspace(char *name);
int _lockspace_info(char *space_name, struct space *sp_out);
int lockspace_info(char *space_name, struct space *sp_out);
int lockspace_disk(char *space_name, struct sync_disk *disk);
int host_info(char *space_name, uint64_t host_id, struct host_status *hs_out);
int host_status_set_bit(char *space_name, uint64_t host_id);
int test_id_bit(int host_id, char *bitmap);
void set_id_bit(int host_id, char *bitmap, char *c);
int check_our_lease(struct task *task, struct space *sp, int *check_all, char *check_buf);
void check_other_leases(struct task *task, struct space *sp, char *buf);
int add_lockspace_start(struct sanlk_lockspace *ls, struct space **sp_out);
int add_lockspace_wait(struct space *sp);
int inq_lockspace(struct sanlk_lockspace *ls);
int rem_lockspace_start(struct sanlk_lockspace *ls, unsigned int *space_id);
int rem_lockspace_wait(struct sanlk_lockspace *ls, unsigned int space_id);
void free_lockspaces(int wait);

#endif
/* copied from host_id.c */
/*
 * Test whether host_id's bit is set in a host bitmap.  host_id 1
 * maps to bit 0 of the first byte.  Returns nonzero when the bit is
 * set (not necessarily 1), zero otherwise.
 */
int test_id_bit(int host_id, char *bitmap);

int test_id_bit(int host_id, char *bitmap)
{
	int index = host_id - 1;
	char mask = 1 << (index % 8);

	return bitmap[index / 8] & mask;
}
int sanlock_direct_init(struct sanlk_lockspace *ls, struct sanlk_resource *res, int max_hosts, int num_hosts, int use_aio) { struct task task; int rv; setup_task_lib(&task, use_aio, DEFAULT_IO_TIMEOUT); rv = direct_init(&task, ls, res, max_hosts, num_hosts); close_task_aio(&task); return rv; } int sanlock_direct_align(struct sanlk_disk *disk_in) { struct sync_disk disk; int align_size, rv; memset(&disk, 0, sizeof(disk)); memcpy(disk.path, disk_in->path, SANLK_PATH_LEN); rv = open_disk(&disk); if (rv < 0) return rv; align_size = direct_align(&disk); close(disk.fd); return align_size; } sanlock-2.2/src/client_cmd.c0000644000175100017510000002453011751766670015063 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_sock.h" #include "client_cmd.h" #ifndef GNUC_UNUSED #define GNUC_UNUSED __attribute__((__unused__)) #endif extern int send_command(int cmd, uint32_t data); static void print_debug(char *str, int len) { char *p; int i; p = &str[0]; for (i = 0; i < len-1; i++) { if (str[i] == ' ') { str[i] = '\0'; printf(" %s\n", p); p = &str[i+1]; } } if (p) printf(" %s\n", p); } static void status_daemon(struct sanlk_state *st, char *str, int debug) { printf("daemon %.48s\n", st->name); if (st->str_len && debug) print_debug(str, st->str_len); } static void status_client(struct sanlk_state *st, char *str, int debug) { printf("p %d ", st->data32); printf("%.48s\n", st->name); if (st->str_len && debug) print_debug(str, st->str_len); } static void status_lockspace(struct sanlk_state *st, char *str, char *bin, int debug) { struct 
sanlk_lockspace *ls = (struct sanlk_lockspace *)bin; printf("s %.48s:%llu:%s:%llu\n", ls->name, (unsigned long long)ls->host_id, ls->host_id_disk.path, (unsigned long long)ls->host_id_disk.offset); if (st->str_len && debug) print_debug(str, st->str_len); } static void status_resource(struct sanlk_state *st, char *str, char *bin, int debug) { struct sanlk_resource *res = (struct sanlk_resource *)bin; struct sanlk_disk *disk; int i; printf("r %.48s:%.48s", res->lockspace_name, res->name); for (i = 0; i < res->num_disks; i++) { disk = (struct sanlk_disk *)(bin + sizeof(struct sanlk_resource) + i * sizeof(struct sanlk_disk)); printf(":%s:%llu", disk->path, (unsigned long long)disk->offset); } if (res->flags & SANLK_RES_SHARED) printf(":SH p %u\n", st->data32); else printf(":%llu p %u\n", (unsigned long long)st->data64, st->data32); if (st->str_len && debug) print_debug(str, st->str_len); } static void status_host(struct sanlk_state *st, char *str, int debug) { printf("%u timestamp %llu\n", st->data32, (unsigned long long)st->data64); if (st->str_len && debug) print_debug(str, st->str_len); } static void print_st(struct sanlk_state *st, char *str, char *bin, int debug) { switch (st->type) { case SANLK_STATE_DAEMON: status_daemon(st, str, debug); break; case SANLK_STATE_CLIENT: status_client(st, str, debug); break; case SANLK_STATE_LOCKSPACE: status_lockspace(st, str, bin, debug); break; case SANLK_STATE_RESOURCE: status_resource(st, str, bin, debug); break; } } #define MAX_SORT_ENTRIES 1024 static char *sort_bufs[MAX_SORT_ENTRIES]; static int sort_count; static int sort_done; static void print_type(int type, int debug) { struct sanlk_state *st; char *buf, *str, *bin; int i; for (i = 0; i < sort_count; i++) { buf = sort_bufs[i]; if (!buf) continue; st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; if (!type || st->type == type) { print_st(st, str, bin, debug); free(buf); sort_bufs[i] = 
NULL; sort_done++; } } } static void print_p(int p, int debug) { struct sanlk_state *st; char *buf, *str, *bin; int i; for (i = 0; i < sort_count; i++) { buf = sort_bufs[i]; if (!buf) continue; st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; if (st->type != SANLK_STATE_CLIENT) continue; if (st->data32 == p) { print_st(st, str, bin, debug); free(buf); sort_bufs[i] = NULL; sort_done++; } } } static int find_type(int type, int *sort_index) { struct sanlk_state *st; char *buf; int i; for (i = 0; i < sort_count; i++) { buf = sort_bufs[i]; if (!buf) continue; st = (struct sanlk_state *)buf; if (st->type == type) { *sort_index = i; return 0; } } return -1; } static void print_r(int p, char *s, int debug) { struct sanlk_resource *res; struct sanlk_state *st; char *buf, *str, *bin; int i; for (i = 0; i < sort_count; i++) { buf = sort_bufs[i]; if (!buf) continue; st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; if (st->type != SANLK_STATE_RESOURCE) continue; res = (struct sanlk_resource *)bin; if ((p && st->data32 == p) || (s && !strncmp(s, res->lockspace_name, SANLK_NAME_LEN))) { print_st(st, str, bin, debug); free(buf); sort_bufs[i] = NULL; sort_done++; } } } static void print_r_by_p(int debug) { struct sanlk_state *st; char *buf, *str, *bin; int rv, i; while (1) { rv = find_type(SANLK_STATE_CLIENT, &i); if (rv < 0) return; buf = sort_bufs[i]; st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; print_st(st, str, bin, debug); print_r(st->data32, NULL, debug); free(buf); sort_bufs[i] = NULL; sort_done++; } } static void print_r_by_s(int debug) { struct sanlk_state *st; char *buf, *str, *bin; int rv, i; while (1) { rv = find_type(SANLK_STATE_LOCKSPACE, &i); if (rv < 0) return; buf = sort_bufs[i]; st = (struct sanlk_state *)buf; 
str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; print_st(st, str, bin, debug); print_r(0, st->name, debug); free(buf); sort_bufs[i] = NULL; sort_done++; } } static void recv_bin(int fd, struct sanlk_state *st, char *bin) { struct sanlk_resource *res; if (st->type == SANLK_STATE_LOCKSPACE) { recv(fd, bin, sizeof(struct sanlk_lockspace), MSG_WAITALL); } else if (st->type == SANLK_STATE_RESOURCE) { recv(fd, bin, sizeof(struct sanlk_resource), MSG_WAITALL); res = (struct sanlk_resource *)bin; recv(fd, bin+sizeof(struct sanlk_resource), res->num_disks * sizeof(struct sanlk_disk), MSG_WAITALL); } } int sanlock_status(int debug, char sort_arg) { struct sm_header h; struct sanlk_state state; char maxstr[SANLK_STATE_MAXSTR]; char maxbin[SANLK_STATE_MAXSTR]; struct sanlk_state *st; char *buf, *str, *bin; int fd, rv, len; int sort_p = 0, sort_s = 0; if (sort_arg == 'p') sort_p = 1; else if (sort_arg == 's') sort_s = 1; fd = send_command(SM_CMD_STATUS, 0); if (fd < 0) return fd; rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } st = &state; str = maxstr; bin = maxbin; while (1) { if (sort_p || sort_s) { len = sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR*4; buf = malloc(len); if (!buf) return -ENOMEM; memset(buf, 0, len); st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; } else { memset(&state, 0, sizeof(state)); memset(maxstr, 0, sizeof(maxstr)); memset(maxbin, 0, sizeof(maxbin)); } rv = recv(fd, st, sizeof(struct sanlk_state), MSG_WAITALL); if (!rv) break; if (rv != sizeof(struct sanlk_state)) break; if (st->str_len) { rv = recv(fd, str, st->str_len, MSG_WAITALL); if (rv != st->str_len) break; } recv_bin(fd, st, bin); if (sort_p || sort_s) { if (sort_count == MAX_SORT_ENTRIES) { printf("cannot sort over %d\n", MAX_SORT_ENTRIES); goto out; } sort_bufs[sort_count++] 
= buf; continue; } /* no sorting, print as received */ print_st(st, str, bin, debug); } if (sort_p) { print_type(SANLK_STATE_DAEMON, debug); print_p(-1, debug); print_type(SANLK_STATE_LOCKSPACE, debug); print_r_by_p(debug); if (sort_done < sort_count) { printf("-\n"); print_type(0, debug); } } else if (sort_s) { print_type(SANLK_STATE_DAEMON, debug); print_p(-1, debug); print_type(SANLK_STATE_CLIENT, debug); print_r_by_s(debug); if (sort_done < sort_count) { printf("-\n"); print_type(0, debug); } } rv = 0; out: close(fd); return rv; } int sanlock_host_status(int debug, char *lockspace_name) { struct sm_header h; struct sanlk_state st; struct sanlk_lockspace lockspace; char str[SANLK_STATE_MAXSTR]; int fd, rv; if (!lockspace_name || !lockspace_name[0]) return -1; fd = send_command(SM_CMD_HOST_STATUS, 0); if (fd < 0) return fd; memset(&lockspace, 0, sizeof(lockspace)); snprintf(lockspace.name, SANLK_NAME_LEN, "%s", lockspace_name); rv = send(fd, &lockspace, sizeof(lockspace), 0); if (rv < 0) goto out; rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } while (1) { rv = recv(fd, &st, sizeof(st), MSG_WAITALL); if (!rv) break; if (rv != sizeof(st)) break; if (st.str_len) { rv = recv(fd, str, st.str_len, MSG_WAITALL); if (rv != st.str_len) break; } switch (st.type) { case SANLK_STATE_HOST: status_host(&st, str, debug); break; } } rv = h.data; out: close(fd); return rv; } int sanlock_log_dump(int max_size) { struct sm_header h; char *buf; int fd, rv; buf = malloc(max_size); if (!buf) return -ENOMEM; memset(buf, 0, max_size); fd = send_command(SM_CMD_LOG_DUMP, 0); if (fd < 0) { free(buf); return fd; } memset(&h, 0, sizeof(h)); rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } if (h.data <= 0 || h.data > max_size) goto out; rv = recv(fd, buf, h.data, MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (!rv) { rv = -1; goto 
out; } printf("%s", buf); printf("\n"); if (rv != h.data) printf("partial dump %d of %d\n", rv, h.data); out: close(fd); free(buf); return rv; } int sanlock_shutdown(uint32_t force) { int fd; fd = send_command(SM_CMD_SHUTDOWN, force); if (fd < 0) return fd; close(fd); return 0; } sanlock-2.2/src/resource.h0000644000175100017510000000155511751766670014620 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __RESOURCE_H__ #define __RESOURCE_H__ void send_state_resources(int fd); int lockspace_is_used(struct sanlk_lockspace *ls); void check_mode_block(struct token *token, int q, char *dblock); int acquire_token(struct task *task, struct token *token); int release_token(struct task *task, struct token *token); void release_token_async(struct token *token); int request_token(struct task *task, struct token *token, uint32_t force_mode, uint64_t *owner_id); int set_resource_examine(char *space_name, char *res_name); int setup_token_manager(void); void close_token_manager(void); #endif sanlock-2.2/src/client_cmd.h0000644000175100017510000000077511751766670015075 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __CLIENT_CMD_H__ #define __CLIENT_CMD_H__ int sanlock_status(int debug, char sort_arg); int sanlock_host_status(int debug, char *lockspace_name); int sanlock_log_dump(int max_size); int sanlock_shutdown(uint32_t force); #endif sanlock-2.2/src/task.h0000644000175100017510000000075211751766670013731 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __TASK_H__ #define __TASK_H__ void setup_task_timeouts(struct task *task, int io_timeout_arg); void setup_task_aio(struct task *task, int use_aio, int cb_size); void close_task_aio(struct task *task); #endif sanlock-2.2/src/client.c0000644000175100017510000004135511751766670014244 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_resource.h" #include "sanlock_admin.h" #include "sanlock_sock.h" #ifndef GNUC_UNUSED #define GNUC_UNUSED __attribute__((__unused__)) #endif static int connect_socket(int *sock_fd) { int rv, s; struct sockaddr_un addr; s = socket(AF_LOCAL, SOCK_STREAM, 0); if (s < 0) return -errno; rv = sanlock_socket_address(&addr); if (rv < 0) return rv; rv = connect(s, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); if (rv < 0) { rv = -errno; close(s); return rv; } *sock_fd = s; return 0; } static int send_header(int sock, int cmd, uint32_t cmd_flags, int datalen, uint32_t data, uint32_t data2) { struct sm_header header; int rv; memset(&header, 0, sizeof(struct sm_header)); header.magic = SM_MAGIC; header.cmd = cmd; header.cmd_flags = cmd_flags; header.length = sizeof(header) + datalen; header.data = data; header.data2 = data2; rv = send(sock, (void *) &header, sizeof(struct sm_header), 0); if (rv < 0) return -errno; return 0; } int 
send_command(int cmd, uint32_t data); int send_command(int cmd, uint32_t data) { int rv, sock; rv = connect_socket(&sock); if (rv < 0) return rv; rv = send_header(sock, cmd, 0, 0, data, 0); if (rv < 0) { close(sock); return rv; } return sock; } static int recv_result(int fd) { struct sm_header h; int rv; memset(&h, 0, sizeof(struct sm_header)); rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) return -errno; if (rv != sizeof(h)) return -1; return (int)h.data; } static int cmd_lockspace(int cmd, struct sanlk_lockspace *ls, uint32_t flags) { int rv, fd; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, cmd, flags, sizeof(struct sanlk_lockspace), 0, 0); if (rv < 0) goto out; rv = send(fd, (void *)ls, sizeof(struct sanlk_lockspace), 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(fd); out: close(fd); return rv; } int sanlock_add_lockspace(struct sanlk_lockspace *ls, uint32_t flags) { return cmd_lockspace(SM_CMD_ADD_LOCKSPACE, ls, flags); } int sanlock_inq_lockspace(struct sanlk_lockspace *ls, uint32_t flags) { return cmd_lockspace(SM_CMD_INQ_LOCKSPACE, ls, flags); } int sanlock_rem_lockspace(struct sanlk_lockspace *ls, uint32_t flags) { return cmd_lockspace(SM_CMD_REM_LOCKSPACE, ls, flags); } int sanlock_align(struct sanlk_disk *disk) { int rv, fd; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_ALIGN, 0, sizeof(struct sanlk_disk), 0, 0); if (rv < 0) goto out; rv = send(fd, (void *)disk, sizeof(struct sanlk_disk), 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(fd); out: close(fd); return rv; } int sanlock_init(struct sanlk_lockspace *ls, struct sanlk_resource *res, int max_hosts, int num_hosts) { int rv, fd, cmd, datalen; if (!ls && !res) return -EINVAL; rv = connect_socket(&fd); if (rv < 0) return rv; if (ls && ls->host_id_disk.path[0]) { cmd = SM_CMD_INIT_LOCKSPACE; datalen = sizeof(struct sanlk_lockspace); } else { cmd = SM_CMD_INIT_RESOURCE; datalen = sizeof(struct sanlk_resource) + 
/*
 * Escape colons in a disk path for the wire string format: each ':'
 * in src becomes "\:" in dst.  dst is NUL-terminated and must have
 * room for up to 2*strlen(src)+1 bytes.
 *
 * Fixes: strlen() was re-evaluated in the loop condition on every
 * iteration (O(n^2)), indices were signed ints compared against
 * size_t, and dst was not terminated (relying on callers to
 * pre-zero the buffer).
 */
static void copy_path_out(char *dst, char *src)
{
	size_t i, j = 0;

	for (i = 0; src[i]; i++) {
		if (src[i] == ':')
			dst[j++] = '\\';
		dst[j++] = src[i];
	}
	dst[j] = '\0';
}

/*
 * Reverse of copy_path_out: every backslash escape character in src
 * is dropped, so "\:" becomes ":".  dst is NUL-terminated and needs
 * at most strlen(src)+1 bytes.
 */
static void copy_path_in(char *dst, char *src)
{
	size_t i, j = 0;

	for (i = 0; src[i]; i++) {
		if (src[i] == '\\')
			continue;
		dst[j++] = src[i];
	}
	dst[j] = '\0';
}
sizeof(opt)); } if (sock == -1) { /* connect to daemon and ask it to acquire a lease for another registered pid */ data2 = pid; rv = connect_socket(&fd); if (rv < 0) return rv; } else { /* use our own existing registered connection and ask daemon to acquire a lease for self */ data2 = -1; fd = sock; } rv = send_header(fd, SM_CMD_ACQUIRE, flags, datalen, res_count, data2); if (rv < 0) return rv; for (i = 0; i < res_count; i++) { res = res_args[i]; rv = send(fd, res, sizeof(struct sanlk_resource), 0); if (rv < 0) { rv = -1; goto out; } rv = send(fd, res->disks, sizeof(struct sanlk_disk) * res->num_disks, 0); if (rv < 0) { rv = -1; goto out; } } rv = send(fd, &opt, sizeof(struct sanlk_options), 0); if (rv < 0) { rv = -1; goto out; } if (opt.len) { rv = send(fd, opt_in->str, opt.len, 0); if (rv < 0) { rv = -1; goto out; } } rv = recv_result(fd); out: if (sock == -1) close(fd); return rv; } int sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, char **res_state) { struct sm_header h; char *reply_data = NULL; int rv, fd, data2, len; *res_count = 0; if (res_state) *res_state = NULL; if (sock == -1) { /* connect to daemon and ask it to acquire a lease for another registered pid */ data2 = pid; rv = connect_socket(&fd); if (rv < 0) return rv; } else { /* use our own existing registered connection and ask daemon to acquire a lease for self */ data2 = -1; fd = sock; } rv = send_header(fd, SM_CMD_INQUIRE, flags, 0, 0, data2); if (rv < 0) return rv; /* get result */ memset(&h, 0, sizeof(h)); rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv != sizeof(h)) { rv = -1; goto out; } len = h.length - sizeof(h); if (!len) { rv = (int)h.data; goto out; } reply_data = malloc(len); if (!reply_data) { rv = -ENOMEM; goto out; } rv = recv(fd, reply_data, len, MSG_WAITALL); if (rv != len) { free(reply_data); rv = -1; goto out; } if (res_state) *res_state = reply_data; else free(reply_data); *res_count = (int)h.data2; rv = (int)h.data; out: if (sock == -1) close(fd); return rv; 
} /* tell daemon to release lease(s) for given pid. I don't think the pid itself will usually tell sm to release leases, but it will be requested by a manager overseeing the pid */ int sanlock_release(int sock, int pid, uint32_t flags, int res_count, struct sanlk_resource *res_args[]) { int fd, rv, i, data2, datalen; if (sock == -1) { /* connect to daemon and ask it to acquire a lease for another registered pid */ data2 = pid; rv = connect_socket(&fd); if (rv < 0) return rv; } else { /* use our own existing registered connection and ask daemon to acquire a lease for self */ data2 = -1; fd = sock; } datalen = res_count * sizeof(struct sanlk_resource); rv = send_header(fd, SM_CMD_RELEASE, flags, datalen, res_count, data2); if (rv < 0) goto out; for (i = 0; i < res_count; i++) { rv = send(fd, res_args[i], sizeof(struct sanlk_resource), 0); if (rv < 0) { rv = -1; goto out; } } rv = recv_result(fd); out: if (sock == -1) close(fd); return rv; } int sanlock_request(uint32_t flags, uint32_t force_mode, struct sanlk_resource *res) { int fd, rv, datalen; datalen = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk) * res->num_disks; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_REQUEST, flags, datalen, force_mode, 0); if (rv < 0) goto out; rv = send(fd, res, sizeof(struct sanlk_resource), 0); if (rv < 0) { rv = -errno; goto out; } rv = send(fd, res->disks, sizeof(struct sanlk_disk) * res->num_disks, 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(fd); out: close(fd); return rv; } int sanlock_examine(uint32_t flags, struct sanlk_lockspace *ls, struct sanlk_resource *res) { char *data; int rv, fd, cmd, datalen; if (!ls && !res) return -EINVAL; rv = connect_socket(&fd); if (rv < 0) return rv; if (ls && ls->host_id_disk.path[0]) { cmd = SM_CMD_EXAMINE_LOCKSPACE; datalen = sizeof(struct sanlk_lockspace); data = (char *)ls; } else { cmd = SM_CMD_EXAMINE_RESOURCE; datalen = sizeof(struct sanlk_resource); data = (char *)res; } rv = 
send_header(fd, cmd, flags, datalen, 0, 0); if (rv < 0) goto out; rv = send(fd, data, datalen, 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(fd); out: close(fd); return rv; } /* * convert from struct sanlk_resource to string with format: * :::[::...]: */ int sanlock_res_to_str(struct sanlk_resource *res, char **str_ret) { char path[SANLK_PATH_LEN + 1]; char *str; int ret, len, pos, d; str = malloc(SANLK_MAX_RES_STR + 1); if (!str) return -ENOMEM; memset(str, 0, SANLK_MAX_RES_STR + 1); len = SANLK_MAX_RES_STR; pos = 0; ret = snprintf(str + pos, len - pos, "%s:%s", res->lockspace_name, res->name); if (ret >= len - pos) goto fail; pos += ret; for (d = 0; d < res->num_disks; d++) { memset(path, 0, sizeof(path)); copy_path_out(path, res->disks[d].path); ret = snprintf(str + pos, len - pos, ":%s:%llu", path, (unsigned long long)res->disks[d].offset); if (ret >= len - pos) goto fail; pos += ret; } if (res->flags & SANLK_RES_SHARED) ret = snprintf(str + pos, len - pos, ":SH"); else ret = snprintf(str + pos, len - pos, ":%llu", (unsigned long long)res->lver); if (ret > len - pos) goto fail; pos += ret; if (pos > len) goto fail; *str_ret = str; return 0; fail: free(str); return -EINVAL; } /* * convert to struct sanlk_resource from string with format: * :::[::...][:] */ int sanlock_str_to_res(char *str, struct sanlk_resource **res_ret) { struct sanlk_resource *res; char sub[SANLK_PATH_LEN + 1]; int i, j, d, rv, len, sub_count, colons, num_disks, have_lver; if (strlen(str) < 3) return -ENXIO; colons = 0; for (i = 0; i < strlen(str); i++) { if (str[i] == '\\') { i++; continue; } if (str[i] == ':') colons++; } if (!colons || (colons == 2)) { return -1; } num_disks = (colons - 1) / 2; have_lver = (colons - 1) % 2; if (num_disks > SANLK_MAX_DISKS) return -2; len = sizeof(struct sanlk_resource) + num_disks * sizeof(struct sanlk_disk); res = malloc(len); if (!res) return -ENOMEM; memset(res, 0, len); res->num_disks = num_disks; d = 0; sub_count = 0; j = 0; memset(sub, 
0, sizeof(sub)); len = strlen(str); for (i = 0; i < len + 1; i++) { if (str[i] == '\\') { if (i == (len - 1)) goto fail; i++; sub[j++] = str[i]; continue; } if (i < len && str[i] != ':') { if (j >= SANLK_PATH_LEN) goto fail; sub[j++] = str[i]; continue; } /* do something with sub when we hit ':' or end of str, first and second subs are lockspace and resource names, then even sub is path, odd sub is offset */ if (sub_count < 2 && strlen(sub) > SANLK_NAME_LEN) goto fail; if (sub_count >= 2 && (strlen(sub) > SANLK_PATH_LEN-1 || strlen(sub) < 1)) goto fail; if (sub_count == 0) { strncpy(res->lockspace_name, sub, SANLK_NAME_LEN); } else if (sub_count == 1) { strncpy(res->name, sub, SANLK_NAME_LEN); } else if (!(sub_count % 2)) { if (have_lver && (d == num_disks)) { if (!strncmp(sub, "SH", 2)) { res->flags |= SANLK_RES_SHARED; } else { res->flags |= SANLK_RES_LVER; res->lver = strtoull(sub, NULL, 0); } } else { strncpy(res->disks[d].path, sub, SANLK_PATH_LEN - 1); } } else { rv = sscanf(sub, "%llu", (unsigned long long *)&res->disks[d].offset); if (rv != 1) goto fail; d++; } sub_count++; j = 0; memset(sub, 0, sizeof(sub)); } *res_ret = res; return 0; fail: free(res); return -1; } /* * convert from array of struct sanlk_resource * to state string with format: * "RESOURCE1 RESOURCE2 RESOURCE3 ..." 
* RESOURCE format in sanlock_res_to_str() comment */ int sanlock_args_to_state(int res_count, struct sanlk_resource *res_args[], char **res_state) { char *str, *state; int i, rv; state = malloc(res_count * (SANLK_MAX_RES_STR + 1)); if (!state) return -ENOMEM; memset(state, 0, res_count * (SANLK_MAX_RES_STR + 1)); for (i = 0; i < res_count; i++) { str = NULL; rv = sanlock_res_to_str(res_args[i], &str); if (rv < 0 || !str) { free(state); return rv; } if (strlen(str) > SANLK_MAX_RES_STR - 1) { free(str); free(state); return -EINVAL; } /* space is str separator, so it's invalid within each str */ if (strstr(str, " ")) { free(str); free(state); return -EINVAL; } if (i) strcat(state, " "); strcat(state, str); free(str); } /* caller to free state */ *res_state = state; return 0; } /* * convert to array of struct sanlk_resource * from state string with format: * "RESOURCE1 RESOURCE2 RESOURCE3 ..." * RESOURCE format in sanlock_str_to_res() comment */ int sanlock_state_to_args(char *res_state, int *res_count, struct sanlk_resource ***res_args) { struct sanlk_resource **args; struct sanlk_resource *res; char str[SANLK_MAX_RES_STR + 1]; int count = 1, arg_count = 0; int i, j, len, rv; for (i = 0; i < strlen(res_state); i++) { if (res_state[i] == ' ') count++; } *res_count = count; args = malloc(count * sizeof(*args)); if (!args) return -ENOMEM; memset(args, 0, count * sizeof(*args)); j = 0; memset(str, 0, sizeof(str)); len = strlen(res_state); for (i = 0; i < len + 1; i++) { if (i < len && res_state[i] != ' ') { str[j++] = res_state[i]; continue; } rv = sanlock_str_to_res(str, &res); if (rv < 0 || !res) goto fail_free; if (arg_count == count) goto fail_free; args[arg_count++] = res; j = 0; memset(str, 0, sizeof(str)); } /* caller to free res_count res and args */ *res_count = arg_count; *res_args = args; return 0; fail_free: for (i = 0; i < count; i++) { if (args[i]) free(args[i]); } free(args); return rv; } /* * convert to struct sanlk_lockspace from string with format: * ::: 
*/ int sanlock_str_to_lockspace(char *str, struct sanlk_lockspace *ls) { char *host_id = NULL; char *path = NULL; char *offset = NULL; int i; if (!str) return -EINVAL; for (i = 0; i < strlen(str); i++) { if (str[i] == '\\') { i++; continue; } if (str[i] == ':') { if (!host_id) host_id = &str[i]; else if (!path) path = &str[i]; else if (!offset) offset = &str[i]; } } if (host_id) { *host_id = '\0'; host_id++; } if (path) { *path = '\0'; path++; } if (offset) { *offset= '\0'; offset++; } strncpy(ls->name, str, SANLK_NAME_LEN); if (host_id) ls->host_id = atoll(host_id); if (path) copy_path_in(ls->host_id_disk.path, path); if (offset) ls->host_id_disk.offset = atoll(offset); return 0; } sanlock-2.2/src/log.h0000644000175100017510000000335411751766670013551 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __LOG_H__ #define __LOG_H__ void log_level(uint32_t space_id, uint32_t token_id, char *name_in, int level, const char *fmt, ...) __attribute__((format(printf, 5, 6))); int setup_logging(void); void close_logging(void); void copy_log_dump(char *buf, int *len); #define log_debug(fmt, args...) log_level(0, 0, NULL, LOG_DEBUG, fmt, ##args) #define log_space(space, fmt, args...) log_level(space->space_id, 0, NULL, LOG_DEBUG, fmt, ##args) #define log_token(token, fmt, args...) log_level(0, token->token_id, NULL, LOG_DEBUG, fmt, ##args) #define log_spoke(space, token, fmt, args...) log_level(space->space_id, token->token_id, NULL, LOG_DEBUG, fmt, ##args) #define log_error(fmt, args...) log_level(0, 0, NULL, LOG_ERR, fmt, ##args) #define log_erros(space, fmt, args...) log_level(space->space_id, 0, NULL, LOG_ERR, fmt, ##args) #define log_errot(token, fmt, args...) 
log_level(0, token->token_id, NULL, LOG_ERR, fmt, ##args) #define log_errst(space, token, fmt, args...) log_level(space->space_id, token->token_id, NULL, LOG_ERR, fmt, ##args) #define log_taske(task, fmt, args...) log_level(0, 0, task->name, LOG_ERR, fmt, ##args) #define log_taskd(task, fmt, args...) log_level(0, 0, task->name, LOG_DEBUG, fmt, ##args) /* use log_tool for tool actions (non-daemon), and for daemon until logging is set up */ #define log_tool(fmt, args...) \ do { \ fprintf(stderr, fmt "\n", ##args); \ } while (0) #endif sanlock-2.2/src/leader.h0000644000175100017510000000472511751766670014227 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __LEADER_H__ #define __LEADER_H__ /* does not include terminating null byte */ /* NB NAME_ID_SIZE must match SANLK_NAME_LEN */ /* NB NAME_ID_SIZE is part of ondisk format */ #define NAME_ID_SIZE 48 #define PAXOS_DISK_MAGIC 0x06152010 #define PAXOS_DISK_VERSION_MAJOR 0x00060000 #define PAXOS_DISK_VERSION_MINOR 0x00000001 #define DELTA_DISK_MAGIC 0x12212010 #define DELTA_DISK_VERSION_MAJOR 0x00030000 #define DELTA_DISK_VERSION_MINOR 0x00000002 /* for all disk structures: uint64 aligned on 8 byte boundaries, uint32 aligned on 4 byte boundaries, etc */ /* NB. adjust LEADER_COMPARE_LEN and LEADER_CHECKSUM_LEN when changing this struct. LEADER_CHECKSUM_LEN should end just before the checksum field. LEADER_COMPARE_LEN should end just before timestamp. The checksum field should follow the timestamp field. The leader may be partially through updating the timestamp on multiple leader blocks in a lease, but for the purpose of counting repetitions of a leader block owned by a single host they should be counted together, so COMPARE_LEN should exclude timestamp. 
*/ #define LEADER_COMPARE_LEN 152 #define LEADER_CHECKSUM_LEN 168 #define LEASE_FREE 0 #define LFL_SHORT_HOLD 0x00000001 struct leader_record { uint32_t magic; uint32_t version; uint32_t flags; uint32_t sector_size; uint64_t num_hosts; uint64_t max_hosts; uint64_t owner_id; /* host_id of owner */ uint64_t owner_generation; uint64_t lver; char space_name[NAME_ID_SIZE]; /* lockspace for resource */ char resource_name[NAME_ID_SIZE]; /* resource being locked */ uint64_t timestamp; uint64_t unused1; uint32_t checksum; uint32_t unused2; uint64_t write_id; /* for extra info, debug */ uint64_t write_generation; /* for extra info, debug */ uint64_t write_timestamp; /* for extra info, debug */ }; /* leader_record can use first 256 bytes of a sector, bitmap uses the last 256 bytes */ #define LEADER_RECORD_MAX 256 #define HOSTID_BITMAP_OFFSET 256 #define HOSTID_BITMAP_SIZE 256 #define REQ_DISK_MAGIC 0x08292011 #define REQ_DISK_VERSION_MAJOR 0x00010000 #define REQ_DISK_VERSION_MINOR 0x00000001 struct request_record { uint32_t magic; uint32_t version; uint64_t lver; uint32_t force_mode; }; #endif sanlock-2.2/src/sanlock_admin.h0000644000175100017510000000367111751766670015574 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
*/ #ifndef __SANLOCK_ADMIN_H__ #define __SANLOCK_ADMIN_H__ /* add flags */ #define SANLK_ADD_ASYNC 0x00000001 /* rem flags */ #define SANLK_REM_ASYNC 0x00000001 #define SANLK_REM_UNUSED 0x00000002 /* * add_lockspace returns: * 0: the lockspace has been added successfully * -EEXIST: the lockspace already exists * -EINPROGRESS: the lockspace is already in the process of being added * (the in-progress add may or may not succeed) * -EAGAIN: the lockspace is being removed */ int sanlock_add_lockspace(struct sanlk_lockspace *ls, uint32_t flags); /* * inq_lockspace returns: * 0: the lockspace exists and is currently held * -ENOENT: lockspace not found */ int sanlock_inq_lockspace(struct sanlk_lockspace *ls, uint32_t flags); /* * rem_lockspace returns: * 0: the lockspace has been removed successfully * -EINPROGRESS: the lockspace is already in the process of being removed * -ENOENT: lockspace not found * -EBUSY: UNUSED was set and lockspace is being used * * The sanlock daemon will kill any pids using the lockspace when the * lockspace is removed (unless UNUSED is set). */ int sanlock_rem_lockspace(struct sanlk_lockspace *ls, uint32_t flags); /* * Returns the alignment in bytes required by sanlock_init() * (1MB for disks with 512 sectors, 8MB for disks with 4096 sectors) */ int sanlock_align(struct sanlk_disk *disk); /* * Ask sanlock daemon to initialize disk space. * Use max_hosts = 0 for default value. * Use num_hosts = 0 for default value. * Provide either lockspace or resource, not both */ int sanlock_init(struct sanlk_lockspace *ls, struct sanlk_resource *res, int max_hosts, int num_hosts); #endif sanlock-2.2/src/watchdog.h0000644000175100017510000000112711751766670014564 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #ifndef __WATCHDOG_H__ #define __WATCHDOG_H__ void update_watchdog_file(struct space *sp, uint64_t timestamp); int create_watchdog_file(struct space *sp, uint64_t timestamp); void unlink_watchdog_file(struct space *sp); void close_watchdog_file(struct space *sp); int setup_watchdog(void); void close_watchdog(void); #endif sanlock-2.2/src/resource.c0000644000175100017510000006147211751766670014617 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "diskio.h" #include "log.h" #include "paxos_lease.h" #include "lockspace.h" #include "resource.h" #include "task.h" #include "mode_block.h" /* from cmd.c */ void send_state_resource(int fd, struct resource *r, const char *list_name, int pid, uint32_t token_id); /* from main.c */ int get_rand(int a, int b); static pthread_t resource_pt; static int resource_thread_stop; static int resource_thread_work; static struct list_head resources_held; static struct list_head resources_add; static struct list_head resources_rem; static pthread_mutex_t resource_mutex; static pthread_cond_t resource_cond; void send_state_resources(int fd) { struct resource *r; struct token *token; pthread_mutex_lock(&resource_mutex); list_for_each_entry(r, &resources_held, list) { list_for_each_entry(token, &r->tokens, list) send_state_resource(fd, r, "held", token->pid, token->token_id); } list_for_each_entry(r, &resources_add, list) { list_for_each_entry(token, &r->tokens, list) send_state_resource(fd, r, "add", token->pid, token->token_id); } list_for_each_entry(r, &resources_rem, list) send_state_resource(fd, r, "rem", r->pid, 
r->release_token_id); pthread_mutex_unlock(&resource_mutex); } /* return 1 (is alive) to force a failure if we don't have enough knowledge to know it's really not alive. Later we could have this sit and wait (like paxos_lease_acquire) until we have waited long enough or have enough knowledge to say it's safely dead (unless of course we find it is alive while waiting) */ static int host_live(struct task *task, char *lockspace_name, uint64_t host_id, uint64_t gen) { struct host_status hs; uint64_t now; int rv; rv = host_info(lockspace_name, host_id, &hs); if (rv) { log_debug("host_live %llu %llu yes host_info %d", (unsigned long long)host_id, (unsigned long long)gen, rv); return 1; } if (!hs.last_check) { log_debug("host_live %llu %llu yes unchecked", (unsigned long long)host_id, (unsigned long long)gen); return 1; } /* the host_id lease is free, not being used */ if (!hs.timestamp) { log_debug("host_live %llu %llu no lease free", (unsigned long long)host_id, (unsigned long long)gen); return 0; } if (hs.owner_generation > gen) { log_debug("host_live %llu %llu no old gen %llu", (unsigned long long)host_id, (unsigned long long)gen, (unsigned long long)hs.owner_generation); return 0; } now = monotime(); if (!hs.last_live && (now - hs.first_check > task->host_dead_seconds)) { log_debug("host_live %llu %llu no first_check %llu", (unsigned long long)host_id, (unsigned long long)gen, (unsigned long long)hs.first_check); return 0; } if (hs.last_live && (now - hs.last_live > task->host_dead_seconds)) { log_debug("host_live %llu %llu no last_live %llu", (unsigned long long)host_id, (unsigned long long)gen, (unsigned long long)hs.last_live); return 0; } log_debug("host_live %llu %llu yes recent first_check %llu last_live %llu", (unsigned long long)host_id, (unsigned long long)gen, (unsigned long long)hs.first_check, (unsigned long long)hs.last_live); return 1; } void check_mode_block(struct token *token, int q, char *dblock) { struct mode_block *mb; mb = (struct mode_block 
*)(dblock + MBLOCK_OFFSET); if (mb->flags & MBLOCK_SHARED) { set_id_bit(q + 1, token->shared_bitmap, NULL); token->shared_count++; } } static int set_mode_block(struct task *task, struct token *token, uint64_t host_id, uint64_t gen, uint32_t flags) { struct sync_disk *disk; struct mode_block *mb; char *iobuf, **p_iobuf; uint64_t offset; int num_disks = token->r.num_disks; int iobuf_len, rv, d; disk = &token->disks[0]; iobuf_len = disk->sector_size; if (!iobuf_len) return -EINVAL; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return -ENOMEM; for (d = 0; d < num_disks; d++) { disk = &token->disks[d]; offset = disk->offset + ((2 + host_id - 1) * disk->sector_size); rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task); if (rv < 0) break; mb = (struct mode_block *)(iobuf + MBLOCK_OFFSET); mb->flags = flags; mb->generation = gen; rv = write_iobuf(disk->fd, offset, iobuf, iobuf_len, task); if (rv < 0) break; } if (rv < 0) { log_errot(token, "set_mode_block host_id %llu flags %x gen %llu d %d rv %d", (unsigned long long)host_id, flags, (unsigned long long)gen, d, rv); } else { log_token(token, "set_mode_block host_id %llu flags %x gen %llu", (unsigned long long)host_id, flags, (unsigned long long)gen); } if (rv != SANLK_AIO_TIMEOUT) free(iobuf); return rv; } static int read_mode_block(struct task *task, struct token *token, uint64_t host_id, uint64_t *max_gen) { struct sync_disk *disk; struct mode_block *mb; char *iobuf, **p_iobuf; uint64_t offset; uint64_t max = 0; int num_disks = token->r.num_disks; int iobuf_len, rv, d; disk = &token->disks[0]; iobuf_len = disk->sector_size; if (!iobuf_len) return -EINVAL; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return -ENOMEM; for (d = 0; d < num_disks; d++) { disk = &token->disks[d]; offset = disk->offset + ((2 + host_id - 1) * disk->sector_size); rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task); if (rv < 0) break; mb = (struct 
mode_block *)(iobuf + MBLOCK_OFFSET); if (!(mb->flags & MBLOCK_SHARED)) continue; if (!max || mb->generation > max) max = mb->generation; } if (rv != SANLK_AIO_TIMEOUT) free(iobuf); *max_gen = max; return rv; } static int clear_dead_shared(struct task *task, struct token *token, int num_hosts, int *live_count) { uint64_t host_id, max_gen = 0; int i, rv, live = 0; for (i = 0; i < num_hosts; i++) { host_id = i + 1; if (host_id == token->host_id) continue; if (!test_id_bit(host_id, token->shared_bitmap)) continue; rv = read_mode_block(task, token, host_id, &max_gen); if (rv < 0) { log_errot(token, "clear_dead_shared read_mode_block %llu %d", (unsigned long long)host_id, rv); return rv; } if (host_live(task, token->r.lockspace_name, host_id, max_gen)) { log_token(token, "clear_dead_shared host_id %llu gen %llu alive", (unsigned long long)host_id, (unsigned long long)max_gen); live++; continue; } rv = set_mode_block(task, token, host_id, 0, 0); if (rv < 0) { log_errot(token, "clear_dead_shared host_id %llu set_mode_block %d", (unsigned long long)host_id, rv); return rv; } log_token(token, "clear_dead_shared host_id %llu gen %llu dead and cleared", (unsigned long long)host_id, (unsigned long long)max_gen); } *live_count = live; return rv; } /* return < 0 on error, 1 on success */ static int acquire_disk(struct task *task, struct token *token, uint64_t acquire_lver, int new_num_hosts, struct leader_record *leader) { struct leader_record leader_tmp; int rv; uint32_t flags = 0; if (com.quiet_fail) flags |= PAXOS_ACQUIRE_QUIET_FAIL; if (token->acquire_flags & SANLK_RES_SHARED) flags |= PAXOS_ACQUIRE_SHARED; memset(&leader_tmp, 0, sizeof(leader_tmp)); rv = paxos_lease_acquire(task, token, flags, &leader_tmp, acquire_lver, new_num_hosts); log_token(token, "acquire_disk rv %d lver %llu at %llu", rv, (unsigned long long)leader_tmp.lver, (unsigned long long)leader_tmp.timestamp); memcpy(leader, &leader_tmp, sizeof(struct leader_record)); return rv; /* SANLK_RV */ } /* return < 0 
on error, 1 on success */ static int release_disk(struct task *task, struct token *token, struct leader_record *leader) { struct leader_record leader_tmp; int rv; rv = paxos_lease_release(task, token, leader, &leader_tmp); log_token(token, "release_disk rv %d", rv); if (rv < 0) return rv; memcpy(leader, &leader_tmp, sizeof(struct leader_record)); return rv; /* SANLK_OK */ } static int _release_token(struct task *task, struct token *token, int opened, int nodisk) { struct resource *r = token->resource; uint64_t lver; int last_token = 0; int rv; /* We keep r on the resources_rem list while doing the actual release on disk so another acquire for the same resource will see it on the list and fail. we can't have one thread releasing and another acquiring the same resource. While on the rem list, the resource can't be used by anyone. */ pthread_mutex_lock(&resource_mutex); list_del(&token->list); if (list_empty(&r->tokens)) { list_move(&r->list, &resources_rem); last_token = 1; } lver = r->leader.lver; pthread_mutex_unlock(&resource_mutex); if ((r->flags & R_SHARED) && !last_token) { /* will release when final sh token is released */ log_token(token, "release_token more shared"); close_disks(token->disks, token->r.num_disks); return SANLK_OK; } if (!last_token) { /* should never happen */ log_errot(token, "release_token exclusive not last"); close_disks(token->disks, token->r.num_disks); return SANLK_ERROR; } if (!lver) { /* never acquired on disk so no need to release on disk */ close_disks(token->disks, token->r.num_disks); rv = SANLK_OK; goto out; } if (nodisk) { rv = SANLK_OK; goto out; } if (!opened) { rv = open_disks_fd(token->disks, token->r.num_disks); if (rv < 0) { /* it's not terrible if we can't do the disk release */ rv = SANLK_OK; goto out; } } if (r->flags & R_SHARED) { rv = set_mode_block(task, token, token->host_id, 0, 0); } else { rv = release_disk(task, token, &r->leader); } close_disks(token->disks, token->r.num_disks); out: if (rv < 0) 
log_errot(token, "release_token rv %d flags %x lver %llu o %d n %d", rv, r->flags, (unsigned long long)lver, opened, nodisk); else log_token(token, "release_token flags %x", r->flags); pthread_mutex_lock(&resource_mutex); list_del(&r->list); pthread_mutex_unlock(&resource_mutex); free(r); return rv; } static int release_token_nodisk(struct task *task, struct token *token) { return _release_token(task, token, 0, 1); } static int release_token_opened(struct task *task, struct token *token) { return _release_token(task, token, 1, 0); } int release_token(struct task *task, struct token *token) { return _release_token(task, token, 0, 0); } /* We're releasing a token from the main thread, in which we don't want to block, so we can't do a real release involving disk io. So, pass the release off to the resource_thread. */ void release_token_async(struct token *token) { struct resource *r = token->resource; pthread_mutex_lock(&resource_mutex); list_del(&token->list); if (list_empty(&r->tokens)) { if ((token->flags & T_LS_DEAD) || !r->leader.lver) { /* don't bother trying to release if the lockspace is dead (release will probably fail), or the lease wasn't never acquired */ list_del(&r->list); free(r); } else { r->flags |= R_THREAD_RELEASE; r->release_token_id = token->token_id; resource_thread_work = 1; list_move(&r->list, &resources_rem); pthread_cond_signal(&resource_cond); } } pthread_mutex_unlock(&resource_mutex); } static struct resource *find_resource(struct token *token, struct list_head *head) { struct resource *r; list_for_each_entry(r, head, list) { if (strncmp(r->r.lockspace_name, token->r.lockspace_name, NAME_ID_SIZE)) continue; if (strncmp(r->r.name, token->r.name, NAME_ID_SIZE)) continue; return r; } return NULL; } int lockspace_is_used(struct sanlk_lockspace *ls) { struct resource *r; pthread_mutex_lock(&resource_mutex); list_for_each_entry(r, &resources_held, list) { if (!strncmp(r->r.lockspace_name, ls->name, NAME_ID_SIZE)) goto yes; } 
list_for_each_entry(r, &resources_add, list) { if (!strncmp(r->r.lockspace_name, ls->name, NAME_ID_SIZE)) goto yes; } list_for_each_entry(r, &resources_rem, list) { if (!strncmp(r->r.lockspace_name, ls->name, NAME_ID_SIZE)) goto yes; } pthread_mutex_unlock(&resource_mutex); return 0; yes: pthread_mutex_unlock(&resource_mutex); return 1; } static void copy_disks(void *dst, void *src, int num_disks) { struct sync_disk *d, *s; int i; d = (struct sync_disk *)dst; s = (struct sync_disk *)src; for (i = 0; i < num_disks; i++) { memcpy(d->path, s->path, SANLK_PATH_LEN); d->offset = s->offset; d->sector_size = s->sector_size; /* fd's are private */ d->fd = -1; d++; s++; } } static struct resource *new_resource(struct token *token) { struct resource *r; int disks_len, r_len; disks_len = token->r.num_disks * sizeof(struct sync_disk); r_len = sizeof(struct resource) + disks_len; r = malloc(r_len); if (!r) return NULL; memset(r, 0, r_len); memcpy(&r->r, &token->r, sizeof(struct sanlk_resource)); /* disks copied after open_disks because open_disks sets sector_size which we want copied */ INIT_LIST_HEAD(&r->tokens); r->host_id = token->host_id; r->host_generation = token->host_generation; if (token->acquire_flags & SANLK_RES_SHARED) { r->flags |= R_SHARED; } else { r->pid = token->pid; if (token->flags & T_RESTRICT_SIGKILL) r->flags |= R_RESTRICT_SIGKILL; } return r; } int acquire_token(struct task *task, struct token *token) { struct leader_record leader; struct resource *r; uint64_t acquire_lver = 0; uint32_t new_num_hosts = 0; int sh_retries = 0; int live_count = 0; int rv; if (token->acquire_flags & SANLK_RES_LVER) acquire_lver = token->acquire_lver; if (token->acquire_flags & SANLK_RES_NUM_HOSTS) new_num_hosts = token->acquire_data32; pthread_mutex_lock(&resource_mutex); r = find_resource(token, &resources_rem); if (r) { if (!com.quiet_fail) log_errot(token, "acquire_token resource being removed"); pthread_mutex_unlock(&resource_mutex); return -EAGAIN; } r = 
	/*
	 * NOTE(review): this chunk has been re-wrapped and commented only;
	 * no code tokens were changed.  The opening of acquire_token()
	 * (including the statement that assigns 'r' from the call below)
	 * lies above this chunk; resource_mutex is already held here.
	 */
	find_resource(token, &resources_add);
	if (r) {
		/* someone else is part way through acquiring this resource */
		if (!com.quiet_fail)
			log_errot(token, "acquire_token resource being added");
		pthread_mutex_unlock(&resource_mutex);
		return -EBUSY;
	}

	r = find_resource(token, &resources_held);
	if (r && (token->acquire_flags & SANLK_RES_SHARED) &&
	    (r->flags & R_SHARED)) {
		/* multiple shared holders allowed */
		log_token(token, "acquire_token add shared");
		copy_disks(&token->r.disks, &r->r.disks, token->r.num_disks);
		token->resource = r;
		list_add(&token->list, &r->tokens);
		pthread_mutex_unlock(&resource_mutex);
		return SANLK_OK;
	}

	if (r) {
		/* already held in a mode that excludes another holder */
		if (!com.quiet_fail)
			log_errot(token, "acquire_token resource exists");
		pthread_mutex_unlock(&resource_mutex);
		return -EEXIST;
	}

	r = new_resource(token);
	if (!r) {
		pthread_mutex_unlock(&resource_mutex);
		return -ENOMEM;
	}

	/* park the new resource on resources_add while the disk acquire is
	   in progress; it is moved to resources_held on success at the end
	   of this function */
	list_add(&token->list, &r->tokens);
	list_add(&r->list, &resources_add);
	token->resource = r;
	pthread_mutex_unlock(&resource_mutex);

	rv = open_disks(token->disks, token->r.num_disks);
	if (rv < 0) {
		log_errot(token, "acquire_token open error %d", rv);
		release_token_nodisk(task, token);
		return rv;
	}

	copy_disks(&r->r.disks, &token->r.disks, token->r.num_disks);

 retry:
	memset(&leader, 0, sizeof(struct leader_record));

	rv = acquire_disk(task, token, acquire_lver, new_num_hosts, &leader);

	if (rv < 0) {
		if ((token->acquire_flags & SANLK_RES_SHARED) &&
		    (leader.flags & LFL_SHORT_HOLD)) {
			/*
			 * Multiple parallel sh requests can fail because
			 * the lease is briefly held in ex mode.  The ex
			 * holder sets SHORT_HOLD in the leader record to
			 * indicate that it's only held for a short time
			 * while acquiring a shared lease.  A retry will
			 * probably succeed.
			 */
			if (sh_retries++ < com.sh_retries) {
				/* random 0-1s backoff spreads out the
				   competing sh acquires */
				int us = get_rand(0, 1000000);
				log_token(token, "acquire_token sh_retry %d %d",
					  rv, us);
				usleep(us);
				goto retry;
			}
			rv = SANLK_ACQUIRE_SHRETRY;
		}
		release_token_opened(task, token);
		return rv;
	}

	memcpy(&r->leader, &leader, sizeof(struct leader_record));

	if (token->acquire_flags & SANLK_RES_SHARED) {
		rv = set_mode_block(task, token, token->host_id,
				    token->host_generation, MBLOCK_SHARED);
		if (rv < 0) {
			release_token_opened(task, token);
			return rv;
		} else {
			release_disk(task, token, &leader);
			/* the token is kept, the paxos lease is released
			   but with shared set */
			goto out;
		}
	}

	if (!token->shared_count)
		goto out;

	/* ex acquire with shared holders recorded: they must all be dead
	   before we can proceed (see "a live host" comment below) */
	rv = clear_dead_shared(task, token, leader.num_hosts, &live_count);
	if (rv < 0) {
		release_token_opened(task, token);
		return rv;
	}

	if (live_count) {
		/* a live host with a sh lock exists */
		release_token_opened(task, token);
		return -EAGAIN;
	}
 out:
	close_disks(token->disks, token->r.num_disks);

	pthread_mutex_lock(&resource_mutex);
	list_move(&r->list, &resources_held);
	pthread_mutex_unlock(&resource_mutex);
	return SANLK_OK;
}

/*
 * request_token - write a request record for a resource lease.
 *
 * Writes req (carrying force_mode, e.g. SANLK_REQ_KILL_PID, and the
 * requested lver) into the lease's request area so the current owner
 * can find it and act on it (see examine_token/do_req_kill_pid below).
 *
 * When acquire_lver/force_mode are set, the leader record is read first
 * and *owner_id is set to the current owner on the contention paths.
 * NOTE(review): on the !acquire_lver && !force_mode path *owner_id is
 * never written here, so the value printed by the log_debug at out: is
 * whatever the caller passed in -- callers should initialize it.
 */
int request_token(struct task *task, struct token *token, uint32_t force_mode,
		  uint64_t *owner_id)
{
	struct leader_record leader;
	struct request_record req;
	int rv;

	memset(&req, 0, sizeof(req));

	rv = open_disks(token->disks, token->r.num_disks);
	if (rv < 0) {
		log_errot(token, "request_token open error %d", rv);
		return rv;
	}

	if (!token->acquire_lver && !force_mode)
		goto req_read;

	rv = paxos_lease_leader_read(task, token, &leader, "request");
	if (rv < 0)
		goto out;

	if (leader.timestamp == LEASE_FREE) {
		/* no current owner to request anything of */
		*owner_id = 0;
		rv = SANLK_OK;
		goto out;
	}

	*owner_id = leader.owner_id;

	if (leader.lver >= token->acquire_lver) {
		rv = SANLK_REQUEST_OLD;
		goto out;
	}

 req_read:
	rv = paxos_lease_request_read(task, token, &req);
	if (rv < 0)
		goto out;

	if (req.magic != REQ_DISK_MAGIC) {
		rv = SANLK_REQUEST_MAGIC;
		goto out;
	}

	/* only the major version must match; minor differences tolerated */
	if ((req.version & 0xFFFF0000) != REQ_DISK_VERSION_MAJOR) {
		rv = SANLK_REQUEST_VERSION;
		goto out;
	}

	if (!token->acquire_lver && !force_mode)
		goto req_write;

	/* > instead of >= so multiple hosts can request the same
	   version at once and all succeed */
	if (req.lver > token->acquire_lver) {
		rv = SANLK_REQUEST_LVER;
		goto out;
	}

 req_write:
	req.version = REQ_DISK_VERSION_MAJOR | REQ_DISK_VERSION_MINOR;
	req.lver = token->acquire_lver;
	req.force_mode = force_mode;

	rv = paxos_lease_request_write(task, token, &req);
 out:
	close_disks(token->disks, token->r.num_disks);

	log_debug("request_token rv %d owner %llu lver %llu mode %u",
		  rv, (unsigned long long)*owner_id,
		  (unsigned long long)req.lver, req.force_mode);

	return rv;
}

/*
 * examine_token - read and validate the on-disk request record for a
 * resource lease; on success a copy is returned in req_out.
 */
static int examine_token(struct task *task, struct token *token,
			 struct request_record *req_out)
{
	struct request_record req;
	int rv;

	memset(&req, 0, sizeof(req));

	rv = paxos_lease_request_read(task, token, &req);
	if (rv < 0)
		goto out;

	if (req.magic != REQ_DISK_MAGIC) {
		rv = SANLK_REQUEST_MAGIC;
		goto out;
	}

	if ((req.version & 0xFFFF0000) != REQ_DISK_VERSION_MAJOR) {
		rv = SANLK_REQUEST_VERSION;
		goto out;
	}

	memcpy(req_out, &req, sizeof(struct request_record));
 out:
	log_debug("examine_token rv %d lver %llu mode %u",
		  rv, (unsigned long long)req.lver, req.force_mode);

	return rv;
}

/*
 * do_req_kill_pid - act on a SANLK_REQ_KILL_PID request aimed at this
 * host: SIGTERM the local pid holding the resource, then SIGKILL it a
 * second later unless R_RESTRICT_SIGKILL is set on the resource.
 *
 * NOTE(review): the sleep(1) blocks the calling thread (this runs on
 * resource_thread), delaying other queued release/examine work.
 */
static void do_req_kill_pid(struct token *tt, int pid)
{
	struct resource *r;
	uint32_t flags;
	int found = 0;

	pthread_mutex_lock(&resource_mutex);
	r = find_resource(tt, &resources_held);
	if (r && r->pid == pid) {
		/* flags is only read below when found is set, so it is
		   never used uninitialized */
		found = 1;
		flags = r->flags;
	}
	pthread_mutex_unlock(&resource_mutex);

	if (!found) {
		log_error("req pid %d %.48s:%.48s not found",
			  pid, tt->r.lockspace_name, tt->r.name);
		return;
	}

	log_debug("do_req_kill_pid %d flags %x %.48s:%.48s",
		  pid, flags, tt->r.lockspace_name, tt->r.name);

	/* TODO: share code with kill_pids() to gradually
	 * escalate from killscript, SIGTERM, SIGKILL */

	kill(pid, SIGTERM);

	if (flags & R_RESTRICT_SIGKILL)
		return;

	sleep(1);
	kill(pid, SIGKILL);
}

/*
 * set_resource_examine - flag held resources in the given lockspace
 * (optionally restricted to res_name; NULL means all) for examination
 * by resource_thread, and wake the thread.  Returns the number of
 * resources flagged.
 */
int set_resource_examine(char *space_name, char *res_name)
{
	struct resource *r;
	int count = 0;
	pthread_mutex_lock(&resource_mutex);
	list_for_each_entry(r, &resources_held, list) {
		if (strncmp(r->r.lockspace_name, space_name, NAME_ID_SIZE))
			continue;
		/* a NULL res_name matches every resource in the lockspace */
		if (res_name && strncmp(r->r.name, res_name, NAME_ID_SIZE))
			continue;
		r->flags |= R_THREAD_EXAMINE;
		resource_thread_work = 1;
		count++;
	}
	if (count)
		pthread_cond_signal(&resource_cond);
	pthread_mutex_unlock(&resource_mutex);
	return count;
}

/*
 * resource_thread
 * - releases tokens of pid's that die
 * - examines request blocks of resources
 */

/* return the first resource on head with the given flag bit set;
   caller holds resource_mutex */
static struct resource *find_resource_flag(struct list_head *head,
					   uint32_t flag)
{
	struct resource *r;

	list_for_each_entry(r, head, list) {
		if (r->flags & flag)
			return r;
	}
	return NULL;
}

/*
 * Perform the on-disk release for a resource queued on resources_rem,
 * then unlink and free it.  tt is the temporary token holding copies of
 * the resource info (see resource_thread below).
 *
 * NOTE(review): if open_disks_fd fails, the resource is still unlinked
 * and freed without any on-disk release having been done -- presumably
 * intentional best-effort cleanup; confirm against callers.
 */
static void resource_thread_release(struct task *task, struct resource *r,
				    struct token *tt)
{
	int rv;

	rv = open_disks_fd(tt->disks, tt->r.num_disks);
	if (rv < 0) {
		log_errot(tt, "resource_thread_release open error %d", rv);
		goto out;
	}

	if (r->flags & R_SHARED) {
		/* shared holder: clear our mode block rather than
		   releasing the paxos lease itself */
		set_mode_block(task, tt, tt->host_id, 0, 0);
	} else {
		release_disk(task, tt, &r->leader);
	}

	close_disks(tt->disks, tt->r.num_disks);
 out:
	pthread_mutex_lock(&resource_mutex);
	list_del(&r->list);
	pthread_mutex_unlock(&resource_mutex);
	free(r);
}

/*
 * Read the on-disk request record for a held resource.  If it carries a
 * newer lver than ours and force_mode SANLK_REQ_KILL_PID, kill the
 * local holder pid (see do_req_kill_pid).
 */
static void resource_thread_examine(struct task *task, struct token *tt,
				    int pid, uint64_t lver)
{
	struct request_record req;
	int rv;

	rv = open_disks_fd(tt->disks, tt->r.num_disks);
	if (rv < 0) {
		log_errot(tt, "resource_thread_examine open error %d", rv);
		return;
	}

	rv = examine_token(task, tt, &req);

	close_disks(tt->disks, tt->r.num_disks);

	if (rv != SANLK_OK)
		return;

	if (!req.force_mode || !req.lver)
		return;

	if (req.lver <= lver) {
		/* stale request aimed at a previous lease version */
		log_debug("examine req lver %llu our lver %llu",
			  (unsigned long long)req.lver,
			  (unsigned long long)lver);
		return;
	}

	if (req.force_mode == SANLK_REQ_KILL_PID) {
		do_req_kill_pid(tt, pid);
	} else {
		log_error("req force_mode %u unknown", req.force_mode);
	}
}

/*
 * Worker thread: sleeps on resource_cond until resource_thread_work is
 * set, then drains resources flagged R_THREAD_RELEASE (on
 * resources_rem) or R_THREAD_EXAMINE (on resources_held), one at a
 * time, dropping resource_mutex while doing the disk i/o.
 */
static void *resource_thread(void *arg GNUC_UNUSED)
{
	struct task task;
	struct resource *r;
	struct token *tt = NULL;
	uint64_t lver;
	int pid, tt_len;

	memset(&task, 0, sizeof(struct task));
	setup_task_timeouts(&task, main_task.io_timeout_seconds);
	setup_task_aio(&task, main_task.use_aio, RESOURCE_AIO_CB_SIZE);
	sprintf(task.name, "%s", "resource");

	/* a fake/tmp token struct we copy necessary res info into,
	   because other functions take a token struct arg */

	tt_len = sizeof(struct token) +
		 (SANLK_MAX_DISKS * sizeof(struct sync_disk));
	tt = malloc(tt_len);
	if (!tt) {
		log_error("resource_thread tt malloc error");
		goto out;
	}

	while (1) {
		pthread_mutex_lock(&resource_mutex);
		while (!resource_thread_work) {
			if (resource_thread_stop) {
				pthread_mutex_unlock(&resource_mutex);
				goto out;
			}
			pthread_cond_wait(&resource_cond, &resource_mutex);
		}

		/* FIXME: it's not nice how we copy a bunch of stuff
		 * from token to r so that we can later copy it back from
		 * r into a temp token.  The whole duplication of stuff
		 * between token and r would be nice to clean up. */

		memset(tt, 0, tt_len);
		tt->disks = (struct sync_disk *)&tt->r.disks[0];

		r = find_resource_flag(&resources_rem, R_THREAD_RELEASE);
		if (r) {
			memcpy(&tt->r, &r->r, sizeof(struct sanlk_resource));
			copy_disks(&tt->r.disks, &r->r.disks,
				   r->r.num_disks);
			tt->host_id = r->host_id;
			tt->host_generation = r->host_generation;
			tt->token_id = r->release_token_id;

			r->flags &= ~R_THREAD_RELEASE;
			pthread_mutex_unlock(&resource_mutex);

			resource_thread_release(&task, r, tt);
			continue;
		}

		r = find_resource_flag(&resources_held, R_THREAD_EXAMINE);
		if (r) {
			/* make copies of things we need because we can't
			   use r once we unlock the mutex since it could
			   be released */

			memcpy(&tt->r, &r->r, sizeof(struct sanlk_resource));
			copy_disks(&tt->r.disks, &r->r.disks,
				   r->r.num_disks);
			tt->host_id = r->host_id;
			tt->host_generation = r->host_generation;
			pid = r->pid;
			lver = r->leader.lver;

			r->flags &= ~R_THREAD_EXAMINE;
			pthread_mutex_unlock(&resource_mutex);

			resource_thread_examine(&task, tt, pid, lver);
			continue;
		}

		resource_thread_work = 0;
		pthread_mutex_unlock(&resource_mutex);
	}
 out:
	if (tt)
		free(tt);
	close_task_aio(&task);
	return NULL;
}

/* initialize the resource lists, mutex/cond and start resource_thread;
   returns 0 on success, -1 if the thread cannot be created */
int setup_token_manager(void)
{
	int rv;

	pthread_mutex_init(&resource_mutex, NULL);
	pthread_cond_init(&resource_cond, NULL);
	INIT_LIST_HEAD(&resources_add);
	INIT_LIST_HEAD(&resources_rem);
	INIT_LIST_HEAD(&resources_held);

	rv = pthread_create(&resource_pt, NULL, resource_thread, NULL);
	if (rv)
		return -1;
	return 0;
}

/* signal resource_thread to stop and wait for it to exit */
void close_token_manager(void)
{
	pthread_mutex_lock(&resource_mutex);
	resource_thread_stop = 1;
	pthread_cond_signal(&resource_cond);
	pthread_mutex_unlock(&resource_mutex);
	pthread_join(resource_pt, NULL);
}
sanlock-2.2/src/list.h0000644000175100017510000004134411751766670013744 0ustar weberweber/* Copied from linux kernel */

#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

/*
 * Simple doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

/**
 * container_of - cast a member of a structure out to the containing structure
 *
 * @ptr: the pointer to the member.
 * @type: the type of the container struct this is embedded in.
 * @member: the name of the member within the struct.
 *
 */
#define container_of(ptr, type, member) ({ \
	const typeof( ((type *)0)->member ) *__mptr = (ptr); \
	(type *)( (char *)__mptr - offsetof(type,member) );})

#define LIST_POISON1 ((void *) 0x00100100)
#define LIST_POISON2 ((void *) 0x00200200)

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
	struct list_head name = LIST_HEAD_INIT(name)

static inline void INIT_LIST_HEAD(struct list_head *list)
{
	list->next = list;
	list->prev = list;
}

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_add(struct list_head *new,
			      struct list_head *prev,
			      struct list_head *next)
{
	next->prev = new;
	new->next = next;
	new->prev = prev;
	prev->next = new;
}

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
	__list_add(new, head, head->next);
}

/**
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
	__list_add(new, head->prev, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
	next->prev = prev;
	prev->next = next;
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty() on entry does not return true after this, the entry is
 * in an undefined state.
 */
static inline void list_del(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	/* poison pointers so use-after-delete crashes loudly */
	entry->next = LIST_POISON1;
	entry->prev = LIST_POISON2;
}

/**
 * list_replace - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 */
static inline void list_replace(struct list_head *old,
				struct list_head *new)
{
	new->next = old->next;
	new->next->prev = new;
	new->prev = old->prev;
	new->prev->next = new;
}

/* like list_replace() but also reinitializes @old as an empty list */
static inline void list_replace_init(struct list_head *old,
				     struct list_head *new)
{
	list_replace(old, new);
	INIT_LIST_HEAD(old);
}

/**
 * list_del_init - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 */
static inline void list_del_init(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	INIT_LIST_HEAD(entry);
}

/**
 * list_move - delete from one list and add as another's head
 * @list: the entry to move
 * @head: the head that will precede our entry
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add(list, head);
}

/**
 * list_move_tail - delete from one list and add as another's tail
 * @list: the entry to move
 * @head: the head that will follow our entry
 */
static inline void list_move_tail(struct list_head *list,
				  struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add_tail(list, head);
}

/**
 * list_is_last - tests whether @list is the last entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 */
static inline int list_is_last(const struct list_head *list,
			       const struct list_head *head)
{
	return list->next == head;
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 */
static inline int list_empty(const struct list_head *head)
{
	return head->next == head;
}

/**
 * list_empty_careful - tests whether a list is empty and not being modified
 * @head: the list to test
 *
 * Description:
 * tests whether a list is empty _and_ checks that no other CPU might be
 * in the process of modifying either member (next or prev)
 *
 * NOTE: using list_empty_careful() without synchronization
 * can only be safe if the only activity that can happen
 * to the list entry is list_del_init(). Eg. it cannot be used
 * if another CPU could re-list_add() it.
 */
static inline int list_empty_careful(const struct list_head *head)
{
	struct list_head *next = head->next;
	return (next == head) && (next == head->prev);
}

/**
 * list_rotate_left - rotate the list to the left
 * @head: the head of the list
 */
static inline void list_rotate_left(struct list_head *head)
{
	struct list_head *first;

	if (!list_empty(head)) {
		first = head->next;
		list_move_tail(first, head);
	}
}

/**
 * list_is_singular - tests whether a list has just one entry.
 * @head: the list to test.
 */
static inline int list_is_singular(const struct list_head *head)
{
	return !list_empty(head) && (head->next == head->prev);
}

/* internal helper for list_cut_position(): move head..entry onto list */
static inline void __list_cut_position(struct list_head *list,
				       struct list_head *head,
				       struct list_head *entry)
{
	struct list_head *new_first = entry->next;
	list->next = head->next;
	list->next->prev = list;
	list->prev = entry;
	entry->next = list;
	head->next = new_first;
	new_first->prev = head;
}

/**
 * list_cut_position - cut a list into two
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *	and if so we won't cut the list
 *
 * This helper moves the initial part of @head, up to and
 * including @entry, from @head to @list. You should
 * pass on @entry an element you know is on @head. @list
 * should be an empty list or a list you do not care about
 * losing its data.
 *
 */
static inline void list_cut_position(struct list_head *list,
				     struct list_head *head,
				     struct list_head *entry)
{
	if (list_empty(head))
		return;
	if (list_is_singular(head) &&
	    (head->next != entry && head != entry))
		return;
	if (entry == head)
		INIT_LIST_HEAD(list);
	else
		__list_cut_position(list, head, entry);
}

/* internal helper for the list_splice*() variants below */
static inline void __list_splice(const struct list_head *list,
				 struct list_head *prev,
				 struct list_head *next)
{
	struct list_head *first = list->next;
	struct list_head *last = list->prev;

	first->prev = prev;
	prev->next = first;

	last->next = next;
	next->prev = last;
}

/**
 * list_splice - join two lists, this is designed for stacks
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 */
static inline void list_splice(const struct list_head *list,
			       struct list_head *head)
{
	if (!list_empty(list))
		__list_splice(list, head, head->next);
}

/**
 * list_splice_tail - join two lists, each list being a queue
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 */
static inline void list_splice_tail(struct list_head *list,
				    struct list_head *head)
{
	if (!list_empty(list))
		__list_splice(list, head->prev, head);
}

/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The list at @list is reinitialised
 */
static inline void list_splice_init(struct list_head *list,
				    struct list_head *head)
{
	if (!list_empty(list)) {
		__list_splice(list, head, head->next);
		INIT_LIST_HEAD(list);
	}
}

/**
 * list_splice_tail_init - join two lists and reinitialise the emptied list
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Each of the lists is a queue.
 * The list at @list is reinitialised
 */
static inline void list_splice_tail_init(struct list_head *list,
					 struct list_head *head)
{
	if (!list_empty(list)) {
		__list_splice(list, head->prev, head);
		INIT_LIST_HEAD(list);
	}
}

/**
 * list_entry - get the struct for this entry
 * @ptr:	the &struct list_head pointer.
 * @type:	the type of the struct this is embedded in.
 * @member:	the name of the list_struct within the struct.
 */
#define list_entry(ptr, type, member) \
	container_of(ptr, type, member)

/**
 * list_first_entry - get the first element from a list
 * @ptr:	the list head to take the element from.
 * @type:	the type of the struct this is embedded in.
 * @member:	the name of the list_struct within the struct.
 *
 * Note, that list is expected to be not empty.
 */
#define list_first_entry(ptr, type, member) \
	list_entry((ptr)->next, type, member)

/**
 * list_for_each - iterate over a list
 * @pos:	the &struct list_head to use as a loop cursor.
 * @head:	the head for your list.
 */
#define list_for_each(pos, head) \
	for (pos = (head)->next; pos != (head); pos = pos->next)

/**
 * __list_for_each - iterate over a list
 * @pos:	the &struct list_head to use as a loop cursor.
 * @head:	the head for your list.
 *
 * This variant differs from list_for_each() in that it's the
 * simplest possible list iteration code, no prefetching is done.
 * Use this for code that knows the list to be very short (empty
 * or 1 entry) most of the time.
 */
#define __list_for_each(pos, head) \
	for (pos = (head)->next; pos != (head); pos = pos->next)

/**
 * list_for_each_prev - iterate over a list backwards
 * @pos:	the &struct list_head to use as a loop cursor.
 * @head:	the head for your list.
 */
#define list_for_each_prev(pos, head) \
	for (pos = (head)->prev; pos != (head); pos = pos->prev)

/**
 * list_for_each_safe - iterate over a list safe against removal of list entry
 * @pos:	the &struct list_head to use as a loop cursor.
 * @n:		another &struct list_head to use as temporary storage
 * @head:	the head for your list.
 */
#define list_for_each_safe(pos, n, head) \
	for (pos = (head)->next, n = pos->next; pos != (head); \
		pos = n, n = pos->next)

/**
 * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
 * @pos:	the &struct list_head to use as a loop cursor.
 * @n:		another &struct list_head to use as temporary storage
 * @head:	the head for your list.
 */
#define list_for_each_prev_safe(pos, n, head) \
	for (pos = (head)->prev, n = pos->prev; \
	     pos != (head); \
	     pos = n, n = pos->prev)

/**
 * list_for_each_entry - iterate over list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 */
#define list_for_each_entry(pos, head, member)				\
	for (pos = list_entry((head)->next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_reverse - iterate backwards over list of given type.
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 */
#define list_for_each_entry_reverse(pos, head, member)			\
	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.prev, typeof(*pos), member))

/**
 * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
 * @pos:	the type * to use as a start point
 * @head:	the head of the list
 * @member:	the name of the list_struct within the struct.
 *
 * Prepares a pos entry for use as a start point in list_for_each_entry_continue().
 */
#define list_prepare_entry(pos, head, member) \
	((pos) ? : list_entry(head, typeof(*pos), member))

/**
 * list_for_each_entry_continue - continue iteration over list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position.
 */
#define list_for_each_entry_continue(pos, head, member)			\
	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_continue_reverse - iterate backwards from the given point
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Start to iterate over list of given type backwards, continuing after
 * the current position.
 */
#define list_for_each_entry_continue_reverse(pos, head, member)		\
	for (pos = list_entry(pos->member.prev, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.prev, typeof(*pos), member))

/**
 * list_for_each_entry_from - iterate over list of given type from the current point
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Iterate over list of given type, continuing from current position.
 */
#define list_for_each_entry_from(pos, head, member)			\
	for (; &pos->member != (head);					\
	     pos = list_entry(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:	the type * to use as a loop cursor.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 */
#define list_for_each_entry_safe(pos, n, head, member)			\
	for (pos = list_entry((head)->next, typeof(*pos), member),	\
		n = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))

/**
 * list_for_each_entry_safe_continue - continue list iteration safe against removal
 * @pos:	the type * to use as a loop cursor.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Iterate over list of given type, continuing after current point,
 * safe against removal of list entry.
 */
#define list_for_each_entry_safe_continue(pos, n, head, member)		\
	for (pos = list_entry(pos->member.next, typeof(*pos), member),	\
		n = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))

/**
 * list_for_each_entry_safe_from - iterate over list from current point safe against removal
 * @pos:	the type * to use as a loop cursor.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Iterate over list of given type from current point, safe against
 * removal of list entry.
 */
#define list_for_each_entry_safe_from(pos, n, head, member)		\
	for (n = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))

/**
 * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
 * @pos:	the type * to use as a loop cursor.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 *
 * Iterate backwards over list of given type, safe against removal
 * of list entry.
 */
#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
		n = list_entry(pos->member.prev, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))

/**
 * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
 * @pos:	the loop cursor used in the list_for_each_entry_safe loop
 * @n:		temporary storage used in list_for_each_entry_safe
 * @member:	the name of the list_struct within the struct.
 *
 * list_safe_reset_next is not safe to use in general if the list may be
 * modified concurrently (eg. the lock is dropped in the loop body). An
 * exception to this is if the cursor element (pos) is pinned in the list,
 * and list_safe_reset_next is called after re-taking the lock and before
 * completing the current iteration of the loop body.
 */
#define list_safe_reset_next(pos, n, member)				\
	n = list_entry(pos->member.next, typeof(*pos), member)

#endif
sanlock-2.2/src/monotime.h0000644000175100017510000000053311751766670014613 0ustar weberweber/*
 * Copyright 2011 Red Hat, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v2 or (at your option) any later version.
 */

#ifndef __MONOTIME_H__
#define __MONOTIME_H__

/* current time in seconds -- presumably from a monotonic clock
   unaffected by wall-clock changes; NOTE(review): the clock source is
   in monotime.c, not visible in this chunk -- confirm there */
uint64_t monotime(void);

#endif
sanlock-2.2/src/sanlock_resource.h0000644000175100017510000000545411751766670016324 0ustar weberweber/*
 * Copyright 2010-2011 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 */

#ifndef __SANLOCK_RESOURCE_H__
#define __SANLOCK_RESOURCE_H__

/*
 * sock > -1, pid is ignored:
 * process creates registered connection and acquires/releases leases on
 * that connection for itself
 *
 * sock == -1, pid is used:
 * process asks daemon to acquire/release leases for another separately
 * registered pid
 */

/* restrict flags */
#define SANLK_RESTRICT_ALL	0x00000001
#define SANLK_RESTRICT_SIGKILL	0x00000002
#define SANLK_RESTRICT_SIGTERM	0x00000004

/* release flags */
#define SANLK_REL_ALL		0x00000001

/* request flags */
#define SANLK_REQ_KILL_PID	0x00000001

int sanlock_register(void);

int sanlock_restrict(int sock, uint32_t flags);

int sanlock_acquire(int sock, int pid, uint32_t flags, int res_count,
		    struct sanlk_resource *res_args[],
		    struct sanlk_options *opt_in);

int sanlock_release(int sock, int pid, uint32_t flags, int res_count,
		    struct sanlk_resource *res_args[]);

int sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count,
		    char **res_state);

int sanlock_request(uint32_t flags, uint32_t force_mode,
		    struct sanlk_resource *res);

int sanlock_examine(uint32_t flags, struct sanlk_lockspace *ls,
		    struct sanlk_resource *res);

/*
 * Functions to convert between string and struct resource formats.
 * All allocate space for returned data that the caller must free.
 */

/*
 * convert from struct sanlk_resource to string with format:
 * :::[::...]:
 *
 * NOTE(review): the angle-bracket field placeholders in these format
 * comments appear to have been lost in extraction; upstream documents
 * this format as
 * <lockspace_name>:<resource_name>:<path>:<offset>[:<path>:<offset>...]:<lver>
 * -- confirm against the upstream header.
 */
int sanlock_res_to_str(struct sanlk_resource *res, char **str_ret);

/*
 * convert to struct sanlk_resource from string with format:
 * :::[::...][:]
 */
int sanlock_str_to_res(char *str, struct sanlk_resource **res_ret);

/*
 * convert from array of struct sanlk_resource * to state string with format:
 * "RESOURCE1 RESOURCE2 RESOURCE3 ..."
* RESOURCE format in sanlock_str_to_res() comment */ int sanlock_state_to_args(char *res_state, int *res_count, struct sanlk_resource ***res_args); /* * convert to struct sanlk_lockspace from string with format: * ::: */ int sanlock_str_to_lockspace(char *str, struct sanlk_lockspace *ls); #endif sanlock-2.2/src/sanlock.80000644000175100017510000004100011751766670014330 0ustar weberweber.TH SANLOCK 8 2011-08-05 .SH NAME sanlock \- shared storage lock manager .SH SYNOPSIS .B sanlock [COMMAND] [ACTION] ... .SH DESCRIPTION The sanlock daemon manages leases for applications running on a cluster of hosts with shared storage. All lease management and coordination is done through reading and writing blocks on the shared storage. Two types of leases are used, each based on a different algorithm: "delta leases" are slow to acquire and require regular i/o to shared storage. A delta lease exists in a single sector of storage. Acquiring a delta lease involves reads and writes to that sector separated by specific delays. Once acquired, a lease must be renewed by updating a timestamp in the sector regularly. sanlock uses a delta lease internally to hold a lease on a host_id. host_id leases prevent two hosts from using the same host_id and provide basic host liveness information based on the renewals. "paxos leases" are generally fast to acquire and sanlock makes them available to applications as general purpose resource leases. A paxos lease exists in 1MB of shared storage (8MB for 4k sectors). Acquiring a paxos lease involves reads and writes to max_hosts (2000) sectors in a specific sequence specified by the Disk Paxos algorithm. paxos leases use host_id's internally to indicate the owner of the lease, and the algorithm fails if different hosts use the same host_id. So, delta leases provide the unique host_id's used in paxos leases. paxos leases also refer to delta leases to check if a host_id is alive. 
Before sanlock can be used, the user must assign each host a host_id, which is a number between 1 and 2000. Two hosts should not be given the same host_id (even though delta leases attempt to detect this mistake.) sanlock views a pool of storage as a "lockspace". Each distinct pool of storage, e.g. from different sources, would typically be defined as a separate lockspace, with a unique lockspace name. Part of this storage space must be reserved and initialized for sanlock to store delta leases. Each host that wants to use the lockspace must first acquire a delta lease on its host_id number within the lockspace. (See the add_lockspace action/api.) The space required for 2000 delta leases in the lockspace (for 2000 possible host_id's) is 1MB (8MB for 4k sectors). (This is the same size required for a single paxos lease.) More storage space must be reserved and initialized for paxos leases, according to the needs of the applications using sanlock. The following steps illustrate these concepts using the command line. Applications may choose to do these same steps through libsanlock. 1. Create storage pools and reserve and initialize host_id leases .br two different LUNs on a SAN: /dev/sdb, /dev/sdc .br # vgcreate pool1 /dev/sdb .br # vgcreate pool2 /dev/sdc .br # lvcreate -n hostid_leases -L 1MB pool1 .br # lvcreate -n hostid_leases -L 1MB pool2 .br # sanlock direct init -s LS1:0:/dev/pool1/hostid_leases:0 .br # sanlock direct init -s LS2:0:/dev/pool2/hostid_leases:0 .br 2. Start the sanlock daemon on each host .br # sanlock daemon .br 3. Add each lockspace to be used .br host1: .br # sanlock client add_lockspace -s LS1:1:/dev/pool1/hostid_leases:0 .br # sanlock client add_lockspace -s LS2:1:/dev/pool2/hostid_leases:0 .br host2: .br # sanlock client add_lockspace -s LS1:2:/dev/pool1/hostid_leases:0 .br # sanlock client add_lockspace -s LS2:2:/dev/pool2/hostid_leases:0 .br 4. 
Applications can now reserve/initialize space for resource leases, and then acquire the leases as they need to access the resources. The resource leases that are created and how they are used depends on the application. For example, say application A, running on host1 and host2, needs to synchronize access to data it stores on /dev/pool1/Adata. A could use a resource lease as follows: 5. Reserve and initialize a single resource lease for Adata .br # lvcreate -n Adata_lease -L 1MB pool1 .br # sanlock direct init -r LS1:Adata:/dev/pool1/Adata_lease:0 .br 6. Acquire the lease from the app using libsanlock (see sanlock_register, sanlock_acquire). If the app is already running as pid 123, and has registered with the sanlock daemon, the lease can be added for it manually. .br # sanlock client acquire -r LS1:Adata:/dev/pool1/Adata_lease:0 -p 123 .br .B offsets offsets must be 1MB aligned for disks with 512 byte sectors, and 8MB aligned for disks with 4096 byte sectors. offsets may be used to place leases on the same device rather than using separate devices and offset 0 as shown in examples above, e.g. these commands above: .br # sanlock direct init -s LS1:0:/dev/pool1/hostid_leases:0 .br # sanlock direct init -r LS1:Adata:/dev/pool1/Adata_lease:0 .br could be replaced by: .br .br # sanlock direct init -s LS1:0:/dev/pool1/leases:0 .br # sanlock direct init -r LS1:Adata:/dev/pool1/leases:1048576 .B failures If a process holding resource leases fails or exits without releasing its leases, sanlock will release the leases for it automatically. If the sanlock daemon cannot renew a lockspace host_id for a specific period of time (usually because storage access is lost), sanlock will kill any process holding a resource lease within the lockspace. If the sanlock daemon crashes or gets stuck, it will no longer renew the expiry time of its per-host_id connections to the wdmd daemon, and the watchdog device will reset the host. 
.B watchdog sanlock uses the .BR wdmd (8) daemon to access /dev/watchdog. A separate wdmd connection is maintained with wdmd for each host_id being renewed. Each host_id connection has an expiry time for some seconds in the future. After each successful host_id renewal, sanlock updates the associated expiry time in wdmd. If wdmd finds any connection expired, it will not pet /dev/watchdog. After enough successive expired/failed checks, the watchdog device will fire and reset the host. After a number of failed attempts to renew a host_id, sanlock kills any process using that lockspace. Once all those processes have exited, sanlock will unregister the associated wdmd connection. wdmd will no longer find the expired connection, and will resume petting /dev/watchdog (assuming it finds no other failed/expired tests.) If the killed processes did not exit quickly enough, the expired wdmd connection will not be unregistered, and /dev/watchdog will reset the host. Based on these known timeout values, sanlock on another host can calculate, based on the last host_id renewal, when the failed host will have been reset by its watchdog (or killed all the necessary processes). If the sanlock daemon itself fails, crashes, or gets stuck, it will no longer update the expiry time for its host_id connections to wdmd, which will also lead to the watchdog resetting the host. .B safety sanlock leases are meant to guarantee that two processes on two hosts are never allowed to hold the same resource lease at once. If they were, the resource being protected may be corrupted. There are three levels of protection built into sanlock itself: 1. The paxos leases and delta leases themselves. 2. If the leases cannot function because storage access is lost (host_id's cannot be renewed), the sanlock daemon kills any pids using resource leases in the lockspace. 3. If the pids do not exit after being killed, or if the sanlock daemon fails, the watchdog device resets the host.
.SH OPTIONS .P COMMAND can be one of three primary top level choices .P .BR "sanlock daemon" " start daemon" .br .BR "sanlock client" " send request to daemon (default command if none given)" .br .BR "sanlock direct" " access storage directly (no coordination with daemon)" .BR "sanlock daemon" " [options]" .BR -D " " no fork and print all logging to stderr .BR -Q " 0|1" quiet error messages for common lock contention .BR -R " 0|1" renewal debugging, log debug info for each renewal .BI -L " pri" write logging at priority level and up to logfile (-1 none) .BI -S " pri" write logging at priority level and up to syslog (-1 none) .BI -U " uid" user id .BI -G " gid" group id .BI -t " num" max worker threads .BR -w " 0|1" use watchdog through wdmd .BR -h " 0|1" use high priority features (realtime scheduling, mlockall) .BR \-a " 0|1" use async i/o .BI -o " sec" io timeout in seconds .B "sanlock client" .I action [options] .B sanlock client status Print processes, lockspaces, and resources being manged by the sanlock daemon. Add -D to show extra internal daemon status for debugging. Add -o p to show resources by pid, or -o s to show resources by lockspace. .BR "sanlock client host_status -s" " LOCKSPACE" Print state of host_id delta leases read during the last renewal. Only lockspace_name is used from the LOCKSPACE argument. Add -D to show extra internal daemon status for debugging. .B sanlock client log_dump Print the sanlock daemon internal debug log. .B sanlock client shutdown Ask the sanlock daemon to exit. Without the force option (-f 0), the command will be ignored if any lockspaces exist. With the force option (-f 1), any registered processes will be killed, their resource leases released, and lockspaces removed. .BR "sanlock client init -s" " LOCKSPACE" .br .BR "sanlock client init -r" " RESOURCE" Tell the sanlock daemon to initialize storage for lease areas. (See sanlock direct init.) 
.BR "sanlock client align -s" " LOCKSPACE" Tell the sanlock daemon to report the required lease alignment for a storage path. Only path is used from the LOCKSPACE argument. .BR "sanlock client add_lockspace -s" " LOCKSPACE" Tell the sanlock daemon to acquire the specified host_id in the lockspace. This will allow resources to be acquired in the lockspace. .BR "sanlock client inq_lockspace -s" " LOCKSPACE" Ask to the sanlock daemon weather the lockspace is acquired or not. .BR "sanlock client rem_lockspace -s" " LOCKSPACE" Tell the sanlock daemon to release the specified host_id in the lockspace. Any processes holding resource leases in this lockspace will be killed, and the resource leases not released. .BR "sanlock client command -r" " RESOURCE " \ \fB-c\fP " " \fIpath\fP " " \fIargs\fP Register with the sanlock daemon, acquire the specified resource lease, and exec the command at path with args. When the command exits, the sanlock daemon will release the lease. -c must be the final option. .BR "sanlock client acquire -r" " RESOURCE " \ \fB-p\fP " " \fIpid\fP .br .BR "sanlock client release -r" " RESOURCE " \ \fB-p\fP " " \fIpid\fP Tell the sanlock daemon to acquire or release the specified resource lease for the given pid. The pid must be registered with the sanlock daemon. acquire can optionally take a versioned RESOURCE string RESOURCE:lver, where lver is the version of the lease that must be acquired, or fail. .BI "sanlock client inquire -p" " pid" Print the resource leases held the given pid. The format is a versioned RESOURCE string "RESOURCE:lver" where lver is the version of the lease held. .BR "sanlock client request -r" " RESOURCE " \ \fB-f\fP " " \fIforce_mode\fP Request the owner of a resource do something specified by force_mode. A versioned RESOURCE:lver string must be used with a greater version than is presently held. Zero lver and force_mode clears the request. 
.BR "sanlock client examine -r" " RESOURCE" Examine the request record for the currently held resource lease and carry out the action specified by the requested force_mode. .BR "sanlock client examine -s" " LOCKSPACE" Examine requests for all resource leases currently held in the named lockspace. Only lockspace_name is used from the LOCKSPACE argument. .B "sanlock direct" .I action [options] .BR \-a " 0|1" use async i/o .BI -o " sec" io timeout in seconds .BR "sanlock direct init -s" " LOCKSPACE" .br .BR "sanlock direct init -r" " RESOURCE" Initialize storage for 2000 host_id (delta) leases for the given lockspace, or initialize storage for one resource (paxos) lease. Both options require 1MB of space. The host_id in the LOCKSPACE string is not relevant to initialization, so the value is ignored. (The default of 2000 host_ids can be changed for special cases using the -n num_hosts and -m max_hosts options.) .BR "sanlock direct read_leader -s" " LOCKSPACE" .br .BR "sanlock direct read_leader -r" " RESOURCE" Read a leader record from disk and print the fields. The leader record is the single sector of a delta lease, or the first sector of a paxos lease. .BR "sanlock direct read_id -s" " LOCKSPACE" .br .BR "sanlock direct live_id -s" " LOCKSPACE" read_id reads a host_id and prints the owner. live_id reads a host_id once a second until it the timestamp or owner change (prints live 1), or until host_dead_seconds (prints live 0). (host_dead_seconds is derived from the io_timeout option. The live 0|1 conclusion will not match the sanlock daemon's conclusion unless the configured timeouts match.) ./" .P ./" .BR "sanlock direct acquire_id -s" " LOCKSPACE" ./" .br ./" .BR "sanlock direct renew_id -s" " LOCKSPACE" ./" .br ./" .BR "sanlock direct release_id -s" " LOCKSPACE" ./" ./" Acquire, renew, or release a host_id directly to disk, independent from ./" the sanlock daemon. Not for general use. This should only be used for ./" testing or for manual recovery in an emergency. 
./" ./" .P ./" .BR "sanlock direct acquire -r" " RESOURCE " \ ./" \fB-i\fP " " \fInum\fP " " \fB-g\fP " " \fInum\fP ./" .br ./" .BR "sanlock direct release -r" " RESOURCE " \ ./" \fB-i\fP " " \fInum\fP " " \fB-g\fP " " \fInum\fP ./" ./" Not supported. Not for general use. ./" .BI "sanlock direct dump" " path" \ \fR[\fP\fB:\fP\fIoffset\fP\fR]\fP Read disk sectors and print leader records for delta or paxos leases. Add -f 1 to print the request record values for paxos leases, and host_ids set in delta lease bitmaps. .SS LOCKSPACE option string .BR \-s " " \fIlockspace_name\fP:\fIhost_id\fP:\fIpath\fP:\fIoffset\fP .P .IR lockspace_name " name of lockspace" .br .IR host_id " local host identifier in lockspace" .br .IR path " path to storage reserved for leases" .br .IR offset " offset on path (bytes)" .br .SS RESOURCE option string .BR \-r " " \fIlockspace_name\fP:\fIresource_name\fP:\fIpath\fP:\fIoffset\fP .P .IR lockspace_name " name of lockspace" .br .IR resource_name " name of resource" .br .IR path " path to storage reserved for leases" .br .IR offset " offset on path (bytes)" .SS RESOURCE option string with version .BR \-r " " \fIlockspace_name\fP:\fIresource_name\fP:\fIpath\fP:\fIoffset\fP:\fIlver\fP .P .IR lver " leader version or SH for shared lease" .SS Defaults .B sanlock help shows the default values for the options above. .B sanlock version shows the build version. .SH USAGE .SS Request/Examine The first part of making a request for a resource is writing the request record of the resource (the sector following the leader record). To make a successful request: .IP \(bu 3 RESOURCE:lver must be greater than the lver presently held by the other host. This implies the leader record must be read to discover the lver, prior to making a request. .IP \(bu 3 RESOURCE:lver must be greater than or equal to the lver presently written to the request record. 
Two hosts may write a new request at the same time for the same lver, in which case both would succeed, but the force_mode from the last would win. .IP \(bu 3 The force_mode must be greater than zero. .IP \(bu 3 To unconditionally clear the request record (set both lver and force_mode to 0), make request with RESOURCE:0 and force_mode 0. .PP The owner of the requested resource will not know of the request unless it is explicitly told to examine its resources via the "examine" api/command, or otherwise notfied. The second part of making a request is notifying the resource lease owner that it should examine the request records of its resource leases. The notification will cause the lease owner to automatically run the equivalent of "sanlock client examine -s LOCKSPACE" for the lockspace of the requested resource. The notification is made using a bitmap in each host_id delta lease. Each bit represents each of the possible host_ids (1-2000). If host A wants to notify host B to examine its resources, A sets the bit in its own bitmap that corresponds to the host_id of B. When B next renews its delta lease, it reads the delta leases for all hosts and checks each bitmap to see if its own host_id has been set. It finds the bit for its own host_id set in A's bitmap, and examines its resource request records. (The bit remains set in A's bitmap for request_finish_seconds.) \fIforce_mode\fP determines the action the resource lease owner should take: \fB1\fP (KILL_PID): kill the process holding the resource lease. When the process has exited, the resource lease will be released, and can then be acquired by anyone. .SH SEE ALSO .BR wdmd (8) sanlock-2.2/src/direct.c0000644000175100017510000003177411751766670014244 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "diskio.h" #include "log.h" #include "resource.h" #include "direct.h" #include "paxos_lease.h" #include "delta_lease.h" #include "mode_block.h" /* * cli: sanlock direct init * cli: sanlock direct read_leader * cli: sanlock direct acquire * cli: sanlock direct release * lib: sanlock_direct_init() * * direct.c: * direct_init() * direct_read_leader() * direct_acquire() * direct_release() * do_paxos_action() * paxos_lease.c: * paxos_lease_init() * paxos_lease_leader_read() * paxos_lease_acquire() * paxos_lease_release() * * cli: sanlock direct init * cli: sanlock direct read_leader * cli: sanlock direct acquire_id * cli: sanlock direct release_id * cli: sanlock direct renew_id * cli: sanlock direct read_id * cli: sanlock direct live_id * lib: sanlock_direct_read_id() * lib: sanlock_direct_live_id() * lib: sanlock_direct_init() * * direct.c: * direct_init() * direct_read_leader() * direct_acquire_id() * direct_release_id() * direct_renew_id() * direct_read_id() * direct_live_id() * do_delta_action() * delta_lease.c: * delta_lease_init() * delta_lease_leader_read() * delta_lease_acquire() * delta_lease_release() * delta_lease_renew() */ static int do_paxos_action(int action, struct task *task, struct sanlk_resource *res, int max_hosts, int num_hosts, uint64_t local_host_id, uint64_t local_host_generation, struct leader_record *leader_ret) { struct token *token; struct leader_record leader; int disks_len, token_len; int j, rv = 0; disks_len = res->num_disks * sizeof(struct sync_disk); token_len = sizeof(struct token) + disks_len; token = malloc(token_len); if (!token) return 
-ENOMEM; memset(token, 0, token_len); token->disks = (struct sync_disk *)&token->r.disks[0]; token->r.num_disks = res->num_disks; memcpy(token->r.lockspace_name, res->lockspace_name, SANLK_NAME_LEN); memcpy(token->r.name, res->name, SANLK_NAME_LEN); /* WARNING sync_disk == sanlk_disk */ memcpy(token->disks, &res->disks, disks_len); for (j = 0; j < token->r.num_disks; j++) { token->disks[j].sector_size = 0; token->disks[j].fd = -1; } rv = open_disks(token->disks, token->r.num_disks); if (rv < 0) { free(token); return rv; } switch (action) { case ACT_DIRECT_INIT: rv = paxos_lease_init(task, token, num_hosts, max_hosts); break; case ACT_ACQUIRE: token->host_id = local_host_id; token->host_generation = local_host_generation; rv = paxos_lease_acquire(task, token, 0, leader_ret, 0, num_hosts); break; case ACT_RELEASE: rv = paxos_lease_leader_read(task, token, &leader, "direct_release"); if (rv < 0) break; rv = paxos_lease_release(task, token, &leader, leader_ret); break; case ACT_READ_LEADER: rv = paxos_lease_leader_read(task, token, &leader, "direct_read_leader"); break; } close_disks(token->disks, token->r.num_disks); free(token); if (rv == SANLK_OK) rv = 0; if (leader_ret) memcpy(leader_ret, &leader, sizeof(struct leader_record)); return rv; } /* * sanlock direct acquire -i -g -r RESOURCE * sanlock direct release -r RESOURCE */ int direct_acquire(struct task *task, struct sanlk_resource *res, int num_hosts, uint64_t local_host_id, uint64_t local_host_generation, struct leader_record *leader_ret) { return do_paxos_action(ACT_ACQUIRE, task, res, -1, num_hosts, local_host_id, local_host_generation, leader_ret); } int direct_release(struct task *task, struct sanlk_resource *res, struct leader_record *leader_ret) { return do_paxos_action(ACT_RELEASE, task, res, -1, -1, 0, 0, leader_ret); } static int do_delta_action(int action, struct task *task, struct sanlk_lockspace *ls, int max_hosts, char *our_host_name, struct leader_record *leader_ret) { struct leader_record leader; 
struct sync_disk sd; struct space space; char bitmap[HOSTID_BITMAP_SIZE]; int read_result, rv; memset(bitmap, 0, sizeof(bitmap)); /* for log_space in delta functions */ memset(&space, 0, sizeof(space)); if (!ls->host_id_disk.path[0]) return -ENODEV; memset(&sd, 0, sizeof(struct sync_disk)); memcpy(&sd, &ls->host_id_disk, sizeof(struct sanlk_disk)); sd.fd = -1; rv = open_disk(&sd); if (rv < 0) return -ENODEV; switch (action) { case ACT_DIRECT_INIT: rv = delta_lease_init(task, &sd, ls->name, max_hosts); break; case ACT_ACQUIRE_ID: rv = delta_lease_acquire(task, &space, &sd, ls->name, our_host_name, ls->host_id, &leader); break; case ACT_RENEW_ID: rv = delta_lease_leader_read(task, &sd, ls->name, ls->host_id, &leader, "direct_renew"); if (rv < 0) return rv; rv = delta_lease_renew(task, &space, &sd, ls->name, bitmap, -1, &read_result, &leader, &leader); break; case ACT_RELEASE_ID: rv = delta_lease_leader_read(task, &sd, ls->name, ls->host_id, &leader, "direct_release"); if (rv < 0) return rv; rv = delta_lease_release(task, &space, &sd, ls->name, &leader, &leader); break; case ACT_READ_ID: case ACT_READ_LEADER: rv = delta_lease_leader_read(task, &sd, ls->name, ls->host_id, &leader, "direct_read"); break; } close_disks(&sd, 1); if (rv == SANLK_OK) rv = 0; if (leader_ret) memcpy(leader_ret, &leader, sizeof(struct leader_record)); return rv; } /* * sanlock direct acquire_id|release_id|renew_id -s LOCKSPACE * * should be the equivalent of what the daemon would do for * sanlock client add_lockspace|rem_lockspace -s LOCKSPACE */ int direct_acquire_id(struct task *task, struct sanlk_lockspace *ls, char *our_host_name) { return do_delta_action(ACT_ACQUIRE_ID, task, ls, -1, our_host_name, NULL); } int direct_release_id(struct task *task, struct sanlk_lockspace *ls) { return do_delta_action(ACT_RELEASE_ID, task, ls, -1, NULL, NULL); } int direct_renew_id(struct task *task, struct sanlk_lockspace *ls) { return do_delta_action(ACT_RENEW_ID, task, ls, -1, NULL, NULL); } int 
direct_read_id(struct task *task, struct sanlk_lockspace *ls, uint64_t *timestamp, uint64_t *owner_id, uint64_t *owner_generation) { struct leader_record leader; int rv; memset(&leader, 0, sizeof(struct leader_record)); rv = do_delta_action(ACT_READ_ID, task, ls, -1, NULL, &leader); *timestamp = leader.timestamp; *owner_id = leader.owner_id; *owner_generation = leader.owner_generation; return rv; } int direct_live_id(struct task *task, struct sanlk_lockspace *ls, uint64_t *timestamp, uint64_t *owner_id, uint64_t *owner_generation, int *live) { struct leader_record leader_begin; struct leader_record leader; time_t start; int rv; rv = do_delta_action(ACT_READ_ID, task, ls, -1, NULL, &leader_begin); if (rv < 0) return rv; start = monotime(); while (1) { sleep(1); rv = do_delta_action(ACT_READ_ID, task, ls, -1, NULL, &leader); if (rv < 0) return rv; if (leader.timestamp != leader_begin.timestamp) { *live = 1; break; } if (leader.owner_id != leader_begin.owner_id) { *live = 2; break; } if (leader.owner_generation != leader_begin.owner_generation) { *live = 3; break; } if (monotime() - start > task->host_dead_seconds) { *live = 0; break; } } *timestamp = leader.timestamp; *owner_id = leader.owner_id; *owner_generation = leader.owner_generation; return 0; } int direct_align(struct sync_disk *disk) { if (disk->sector_size == 512) return 1024 * 1024; else if (disk->sector_size == 4096) return 8 * 1024 * 1024; else return -EINVAL; } /* * sanlock direct init [-s LOCKSPACE] [-r RESOURCE] * * Note: host_id not used for init, whatever is given in LOCKSPACE * is ignored */ int direct_init(struct task *task, struct sanlk_lockspace *ls, struct sanlk_resource *res, int max_hosts, int num_hosts) { int rv = -1; if (ls && ls->host_id_disk.path[0]) { rv = do_delta_action(ACT_DIRECT_INIT, task, ls, max_hosts, NULL, NULL); } else if (res) { if (!res->num_disks) return -ENODEV; if (!res->disks[0].path[0]) return -ENODEV; rv = do_paxos_action(ACT_DIRECT_INIT, task, res, max_hosts, 
num_hosts, 0, 0, NULL); } return rv; } int direct_read_leader(struct task *task, struct sanlk_lockspace *ls, struct sanlk_resource *res, struct leader_record *leader_ret) { int rv = -1; if (ls && ls->host_id_disk.path[0]) rv = do_delta_action(ACT_READ_LEADER, task, ls, -1, NULL, leader_ret); else if (res) rv = do_paxos_action(ACT_READ_LEADER, task, res, -1, -1, 0, 0, leader_ret); return rv; } int test_id_bit(int host_id, char *bitmap); int direct_dump(struct task *task, char *dump_path, int force_mode) { char *data, *bitmap; char *colon, *off_str; struct leader_record *lr; struct request_record *rr; struct sync_disk sd; char sname[NAME_ID_SIZE+1]; char rname[NAME_ID_SIZE+1]; uint64_t sector_nr; int sector_count, datalen, align_size; int i, rv, b; memset(&sd, 0, sizeof(struct sync_disk)); colon = strstr(dump_path, ":"); if (colon) { off_str = colon + 1; *colon = '\0'; sd.offset = atoll(off_str); } strncpy(sd.path, dump_path, SANLK_PATH_LEN); sd.fd = -1; rv = open_disk(&sd); if (rv < 0) return -ENODEV; rv = direct_align(&sd); if (rv < 0) goto out_close; align_size = rv; datalen = align_size; sector_count = align_size / sd.sector_size; data = malloc(datalen); if (!data) { rv = -ENOMEM; goto out_close; } printf("%8s %36s %48s %10s %4s %4s %s", "offset", "lockspace", "resource", "timestamp", "own", "gen", "lver"); if (force_mode) printf("/req/mode"); printf("\n"); sector_nr = 0; while (1) { memset(sname, 0, sizeof(rname)); memset(rname, 0, sizeof(rname)); memset(data, 0, sd.sector_size); rv = read_sectors(&sd, sector_nr, sector_count, data, datalen, task, "dump"); lr = (struct leader_record *)data; if (lr->magic == DELTA_DISK_MAGIC) { for (i = 0; i < sector_count; i++) { lr = (struct leader_record *)(data + (i * sd.sector_size)); if (!lr->magic) continue; /* has never been acquired, don't print */ if (!lr->owner_id && !lr->owner_generation) continue; strncpy(sname, lr->space_name, NAME_ID_SIZE); strncpy(rname, lr->resource_name, NAME_ID_SIZE); printf("%08llu %36s %48s 
%010llu %04llu %04llu", (unsigned long long)((sector_nr + i) * sd.sector_size), sname, rname, (unsigned long long)lr->timestamp, (unsigned long long)lr->owner_id, (unsigned long long)lr->owner_generation); if (force_mode) { bitmap = (char *)lr + LEADER_RECORD_MAX; for (b = 0; b < DEFAULT_MAX_HOSTS; b++) { if (test_id_bit(b+1, bitmap)) printf(" %d", b+1); } } printf("\n"); } } else if (lr->magic == PAXOS_DISK_MAGIC) { strncpy(sname, lr->space_name, NAME_ID_SIZE); strncpy(rname, lr->resource_name, NAME_ID_SIZE); printf("%08llu %36s %48s %010llu %04llu %04llu %llu", (unsigned long long)(sector_nr * sd.sector_size), sname, rname, (unsigned long long)lr->timestamp, (unsigned long long)lr->owner_id, (unsigned long long)lr->owner_generation, (unsigned long long)lr->lver); if (force_mode) { rr = (struct request_record *)(data + sd.sector_size); printf("/%llu/%u", (unsigned long long)rr->lver, rr->force_mode); } printf("\n"); for (i = 0; i < lr->num_hosts; i++) { char *pd = data + ((2 + i) * sd.sector_size); struct mode_block *mb = (struct mode_block *)(pd + MBLOCK_OFFSET); if (!(mb->flags & MBLOCK_SHARED)) continue; printf(" "); printf("%04u %04llu SH\n", i+1, (unsigned long long)mb->generation); } } else { break; } sector_nr += sector_count; } rv = 0; free(data); out_close: close_disks(&sd, 1); return rv; } sanlock-2.2/src/sanlock.h0000644000175100017510000000447611751766670014430 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
*/ #ifndef __SANLOCK_H__ #define __SANLOCK_H__ /* pid can own this many resources at once */ #define SANLK_MAX_RESOURCES 8 /* max resource name length */ #define SANLK_NAME_LEN 48 /* max disk path length, includes terminating \0 byte */ #define SANLK_PATH_LEN 1024 /* max disks in a single lease */ #define SANLK_MAX_DISKS 4 /* * max length of a sanlk_resource in string format * :::[::...]: * 48 SANLK_NAME_LEN * + 1 colon * + 48 SANLK_NAME_LEN * + 1 colon * + 4184 (4 MAX_DISKS * (1024 SANLK_PATH_LEN + 1 colon + 20 offset + 1 colon)) * + 20 lver * ------ * 4302 */ #define SANLK_MAX_RES_STR 4400 /* TODO: add more padding to sanlk_disk so we can extend sync_disk later without changing abi */ struct sanlk_disk { char path[SANLK_PATH_LEN]; /* must include terminating \0 */ uint64_t offset; uint32_t pad1; uint32_t pad2; }; #define SANLK_RES_LVER 0x1 /* lver field is set */ #define SANLK_RES_NUM_HOSTS 0x2 /* data32 field is new num_hosts */ #define SANLK_RES_SHARED 0x4 struct sanlk_resource { char lockspace_name[SANLK_NAME_LEN]; /* terminating \0 not required */ char name[SANLK_NAME_LEN]; /* terminating \0 not required */ uint64_t lver; /* use with SANLK_RES_LVER */ uint64_t data64; /* per-resource command-specific data */ uint32_t data32; /* per-resource command-specific data */ uint32_t unused; uint32_t flags; /* SANLK_RES_ */ uint32_t num_disks; /* followed by num_disks sanlk_disk structs */ struct sanlk_disk disks[0]; }; /* command-specific command options (can include per resource data, but that requires the extra work of segmenting it by resource name) */ struct sanlk_options { char owner_name[SANLK_NAME_LEN]; /* optional user friendly name */ uint32_t flags; uint32_t len; /* followed by len bytes (migration input will use this) */ char str[0]; }; struct sanlk_lockspace { char name[SANLK_NAME_LEN]; uint64_t host_id; uint32_t flags; struct sanlk_disk host_id_disk; }; #endif sanlock-2.2/src/cmd.c0000644000175100017510000012155211751766670013527 0ustar weberweber/* * 
Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "sanlock_admin.h" #include "sanlock_sock.h" #include "diskio.h" #include "log.h" #include "paxos_lease.h" #include "delta_lease.h" #include "lockspace.h" #include "resource.h" #include "direct.h" #include "task.h" #include "cmd.h" /* from main.c */ void client_resume(int ci); void client_free(int ci); void client_recv_all(int ci, struct sm_header *h_recv, int pos); void client_pid_dead(int ci); void send_result(int fd, struct sm_header *h_recv, int result); static uint32_t token_id_counter = 1; static void release_cl_tokens(struct task *task, struct client *cl) { struct token *token; int j; for (j = 0; j < SANLK_MAX_RESOURCES; j++) { token = cl->tokens[j]; if (!token) continue; release_token(task, token); free(token); } } static void release_new_tokens(struct task *task, struct token *new_tokens[], int alloc_count, int acquire_count) { int i; for (i = 0; i < acquire_count; i++) release_token(task, new_tokens[i]); for (i = 0; i < alloc_count; i++) free(new_tokens[i]); } /* called with both spaces_mutex and cl->mutex held */ static int check_new_tokens_space(struct client *cl, struct token *new_tokens[], int new_tokens_count) { struct space space; struct token *token; int i, rv, empty_slots = 0; for (i = 0; i < SANLK_MAX_RESOURCES; i++) { if (!cl->tokens[i]) empty_slots++; } if (empty_slots < new_tokens_count) { /* shouldn't ever happen */ return -ENOENT; } /* space may have failed while new tokens were being acquired */ for (i = 0; i 
< new_tokens_count; i++) { token = new_tokens[i]; rv = _lockspace_info(token->r.lockspace_name, &space); if (!rv && !space.killing_pids && space.host_id == token->host_id) continue; return -ENOSPC; } return 0; } static void cmd_acquire(struct task *task, struct cmd_args *ca) { struct client *cl; struct token *token = NULL; struct token *new_tokens[SANLK_MAX_RESOURCES]; struct sanlk_resource res; struct sanlk_options opt; struct space space; char *opt_str; int token_len, disks_len; int fd, rv, i, j, empty_slots, lvl; int alloc_count = 0, acquire_count = 0; int pos = 0, pid_dead = 0; int new_tokens_count; int recv_done = 0; int result = 0; int cl_ci = ca->ci_target; int cl_fd = ca->cl_fd; int cl_pid = ca->cl_pid; cl = &client[cl_ci]; fd = client[ca->ci_in].fd; new_tokens_count = ca->header.data; log_debug("cmd_acquire %d,%d,%d ci_in %d fd %d count %d", cl_ci, cl_fd, cl_pid, ca->ci_in, fd, new_tokens_count); if (new_tokens_count > SANLK_MAX_RESOURCES) { log_error("cmd_acquire %d,%d,%d new %d max %d", cl_ci, cl_fd, cl_pid, new_tokens_count, SANLK_MAX_RESOURCES); result = -E2BIG; goto done; } pthread_mutex_lock(&cl->mutex); if (cl->pid_dead) { result = -ESTALE; pthread_mutex_unlock(&cl->mutex); goto done; } empty_slots = 0; for (i = 0; i < SANLK_MAX_RESOURCES; i++) { if (!cl->tokens[i]) empty_slots++; } pthread_mutex_unlock(&cl->mutex); if (empty_slots < new_tokens_count) { log_error("cmd_acquire %d,%d,%d new %d slots %d", cl_ci, cl_fd, cl_pid, new_tokens_count, empty_slots); result = -ENOENT; goto done; } /* * read resource input and allocate tokens for each */ for (i = 0; i < new_tokens_count; i++) { /* * receive sanlk_resource, create token for it */ rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv > 0) pos += rv; if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_acquire %d,%d,%d recv res %d %d", cl_ci, cl_fd, cl_pid, rv, errno); result = -ENOTCONN; goto done; } if (!res.num_disks || res.num_disks > SANLK_MAX_DISKS) { result = -ERANGE; 
goto done; } disks_len = res.num_disks * sizeof(struct sync_disk); token_len = sizeof(struct token) + disks_len; token = malloc(token_len); if (!token) { result = -ENOMEM; goto done; } memset(token, 0, token_len); token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ token->r.num_disks = res.num_disks; memcpy(token->r.lockspace_name, res.lockspace_name, SANLK_NAME_LEN); memcpy(token->r.name, res.name, SANLK_NAME_LEN); if (res.flags & SANLK_RES_SHARED) token->r.flags |= SANLK_RES_SHARED; token->acquire_lver = res.lver; token->acquire_data64 = res.data64; token->acquire_data32 = res.data32; token->acquire_flags = res.flags; /* * receive sanlk_disk's / sync_disk's * * WARNING: as a shortcut, this requires that sync_disk and * sanlk_disk match; this is the reason for the pad fields * in sanlk_disk (TODO: let these differ?) */ rv = recv(fd, token->disks, disks_len, MSG_WAITALL); if (rv > 0) pos += rv; if (rv != disks_len) { log_error("cmd_acquire %d,%d,%d recv disks %d %d", cl_ci, cl_fd, cl_pid, rv, errno); free(token); result = -ENOTCONN; goto done; } /* zero out pad1 and pad2, see WARNING above */ for (j = 0; j < token->r.num_disks; j++) { token->disks[j].sector_size = 0; token->disks[j].fd = -1; } token->token_id = token_id_counter++; new_tokens[i] = token; alloc_count++; } rv = recv(fd, &opt, sizeof(struct sanlk_options), MSG_WAITALL); if (rv > 0) pos += rv; if (rv != sizeof(struct sanlk_options)) { log_error("cmd_acquire %d,%d,%d recv opt %d %d", cl_ci, cl_fd, cl_pid, rv, errno); result = -ENOTCONN; goto done; } strncpy(cl->owner_name, opt.owner_name, SANLK_NAME_LEN); if (opt.len) { opt_str = malloc(opt.len); if (!opt_str) { result = -ENOMEM; goto done; } rv = recv(fd, opt_str, opt.len, MSG_WAITALL); if (rv > 0) pos += rv; if (rv != opt.len) { log_error("cmd_acquire %d,%d,%d recv str %d %d", cl_ci, cl_fd, cl_pid, rv, errno); free(opt_str); result = -ENOTCONN; goto done; } } /* TODO: warn if header.length != sizeof(header) + pos ? 
*/ recv_done = 1; /* * all command input has been received, start doing the acquire */ for (i = 0; i < new_tokens_count; i++) { token = new_tokens[i]; rv = lockspace_info(token->r.lockspace_name, &space); if (rv < 0 || space.killing_pids) { log_errot(token, "cmd_acquire %d,%d,%d invalid lockspace " "found %d failed %d name %.48s", cl_ci, cl_fd, cl_pid, rv, space.killing_pids, token->r.lockspace_name); result = -ENOSPC; goto done; } token->host_id = space.host_id; token->host_generation = space.host_generation; token->pid = cl_pid; if (cl->restrict & SANLK_RESTRICT_SIGKILL) token->flags |= T_RESTRICT_SIGKILL; /* save a record of what this token_id is for later debugging */ log_level(space.space_id, token->token_id, NULL, LOG_WARNING, "resource %.48s:%.48s:%.256s:%llu%s for %d,%d,%d", token->r.lockspace_name, token->r.name, token->r.disks[0].path, (unsigned long long)token->r.disks[0].offset, (token->acquire_flags & SANLK_RES_SHARED) ? ":SH" : "", cl_ci, cl_fd, cl_pid); } for (i = 0; i < new_tokens_count; i++) { token = new_tokens[i]; rv = acquire_token(task, token); if (rv < 0) { switch (rv) { case -EEXIST: case -EAGAIN: case -EBUSY: lvl = LOG_DEBUG; break; case SANLK_ACQUIRE_IDLIVE: lvl = com.quiet_fail ? LOG_DEBUG : LOG_ERR; break; default: lvl = LOG_ERR; } log_level(0, token->token_id, NULL, lvl, "cmd_acquire %d,%d,%d acquire_token %d", cl_ci, cl_fd, cl_pid, rv); result = rv; goto done; } acquire_count++; } /* * Success acquiring the leases: * lock mutex, * 1. if pid is live, move new_tokens to cl->tokens, clear cmd_active, unlock mutex * 2. if pid is dead, clear cmd_active, unlock mutex, release new_tokens, release cl->tokens, client_free * * Failure acquiring the leases: * lock mutex, * 3. if pid is live, clear cmd_active, unlock mutex, release new_tokens * 4. if pid is dead, clear cmd_active, unlock mutex, release new_tokens, release cl->tokens, client_free * * client_pid_dead() won't touch cl->tokens while cmd_active is set. 
* As soon as we clear cmd_active and unlock the mutex, client_pid_dead * will attempt to clear cl->tokens itself. If we find client_pid_dead * has already happened when we look at pid_dead, then we know that it * won't be called again, and it's our responsibility to clear cl->tokens * and call client_free. */ /* * We hold both space_mutex and cl->mutex at once to create the crucial * linkage between the client pid and the lockspace. Once we release * these two mutexes, if the lockspace fails, this pid will be killed. * Prior to inserting the new_tokens into the client, if the lockspace * fails, kill_pids/client_using_pid would not find this pid (assuming * it doesn't already hold other tokens using the lockspace). If * the lockspace failed while we were acquring the tokens, kill_pids * has already run and not found us, so we must revert what we've done * in acquire. * * Warning: * We could deadlock if we hold cl->mutex and take spaces_mutex, * because all_pids_dead() and kill_pids() hold spaces_mutex and take * cl->mutex. So, lock spaces_mutex first, then cl->mutex to avoid the * deadlock. * * Other approaches: * A solution may be to record in each sp all the pids/cis using it * prior to starting the acquire. Then we would not need to do this * check here to see if the lockspace has been killed (if it was, the * pid for this ci would have been killed in kill_pids), and * all_pids_dead() and kill_pids() would not need to go through each cl * and each cl->token to check if it's using the sp (it would know by * just looking at sp->pids[] and killing each). 
*/ done: pthread_mutex_lock(&spaces_mutex); pthread_mutex_lock(&cl->mutex); log_debug("cmd_acquire %d,%d,%d result %d pid_dead %d", cl_ci, cl_fd, cl_pid, result, cl->pid_dead); pid_dead = cl->pid_dead; cl->cmd_active = 0; if (!result && !pid_dead) { if (check_new_tokens_space(cl, new_tokens, new_tokens_count)) { /* case 1 becomes case 3 */ log_error("cmd_acquire %d,%d,%d invalid lockspace", cl_ci, cl_fd, cl_pid); result = -ENOSPC; } } /* 1. Success acquiring leases, and pid is live */ if (!result && !pid_dead) { for (i = 0; i < new_tokens_count; i++) { for (j = 0; j < SANLK_MAX_RESOURCES; j++) { if (!cl->tokens[j]) { cl->tokens[j] = new_tokens[i]; break; } } } /* goto reply after mutex unlock */ } pthread_mutex_unlock(&cl->mutex); pthread_mutex_unlock(&spaces_mutex); /* 1. Success acquiring leases, and pid is live */ if (!result && !pid_dead) { /* work done before mutex unlock */ goto reply; } /* 2. Success acquiring leases, and pid is dead */ if (!result && pid_dead) { release_new_tokens(task, new_tokens, alloc_count, acquire_count); release_cl_tokens(task, cl); client_free(cl_ci); result = -ENOTTY; goto reply; } /* 3. Failure acquiring leases, and pid is live */ if (result && !pid_dead) { release_new_tokens(task, new_tokens, alloc_count, acquire_count); goto reply; } /* 4. 
Failure acquiring leases, and pid is dead */

	if (result && pid_dead) {
		/* pid died during the acquire: undo everything and free the client */
		release_new_tokens(task, new_tokens, alloc_count, acquire_count);
		release_cl_tokens(task, cl);
		client_free(cl_ci);
		goto reply;
	}

 reply:
	if (!recv_done)
		client_recv_all(ca->ci_in, &ca->header, pos);
	send_result(fd, &ca->header, result);
	client_resume(ca->ci_in);
}

/*
 * Release one, several, or all (SANLK_REL_ALL) resource leases held by
 * the registered client pid.  Tokens are first unlinked from cl->tokens
 * under cl->mutex, then released outside the lock.
 */
static void cmd_release(struct task *task, struct cmd_args *ca)
{
	struct client *cl;
	struct token *token;
	struct token *rem_tokens[SANLK_MAX_RESOURCES];
	struct sanlk_resource res;
	int fd, rv, i, j, found, pid_dead;
	int rem_tokens_count = 0;
	int result = 0;
	int cl_ci = ca->ci_target;
	int cl_fd = ca->cl_fd;
	int cl_pid = ca->cl_pid;

	cl = &client[cl_ci];
	fd = client[ca->ci_in].fd;

	log_debug("cmd_release %d,%d,%d ci_in %d fd %d count %d flags %x",
		  cl_ci, cl_fd, cl_pid, ca->ci_in, fd,
		  ca->header.data, ca->header.cmd_flags);

	/* caller wants to release all resources */

	if (ca->header.cmd_flags & SANLK_REL_ALL) {
		pthread_mutex_lock(&cl->mutex);
		for (j = 0; j < SANLK_MAX_RESOURCES; j++) {
			token = cl->tokens[j];
			if (!token)
				continue;
			rem_tokens[rem_tokens_count++] = token;
			cl->tokens[j] = NULL;
		}
		pthread_mutex_unlock(&cl->mutex);
		goto do_remove;
	}

	/* caller is specifying specific resources to release; one
	   sanlk_resource struct is received per header.data count */

	for (i = 0; i < ca->header.data; i++) {
		rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL);
		if (rv != sizeof(struct sanlk_resource)) {
			log_error("cmd_release %d,%d,%d recv res %d %d",
				  cl_ci, cl_fd, cl_pid, rv, errno);
			result = -ENOTCONN;
			break;
		}

		/* find the matching token by lockspace name + resource name */
		found = 0;

		pthread_mutex_lock(&cl->mutex);
		for (j = 0; j < SANLK_MAX_RESOURCES; j++) {
			token = cl->tokens[j];
			if (!token)
				continue;
			if (memcmp(token->r.lockspace_name, res.lockspace_name, NAME_ID_SIZE))
				continue;
			if (memcmp(token->r.name, res.name, NAME_ID_SIZE))
				continue;
			rem_tokens[rem_tokens_count++] = token;
			cl->tokens[j] = NULL;
			found = 1;
			break;
		}
		pthread_mutex_unlock(&cl->mutex);

		if (!found) {
			log_error("cmd_release %d,%d,%d no resource %.48s",
				  cl_ci, cl_fd, cl_pid, res.name);
			result = -1;
		}
	}

 do_remove:
	/* release on disk outside cl->mutex; remember the last failure */
	for (i = 0; i < rem_tokens_count; i++) {
		token = rem_tokens[i];
		rv = release_token(task, token);
		if (rv < 0)
			result = rv;
		free(token);
	}

	pthread_mutex_lock(&cl->mutex);
	log_debug("cmd_release %d,%d,%d result %d pid_dead %d count %d",
		  cl_ci, cl_fd, cl_pid, result, cl->pid_dead, rem_tokens_count);
	pid_dead = cl->pid_dead;
	cl->cmd_active = 0;
	pthread_mutex_unlock(&cl->mutex);

	if (pid_dead) {
		/* release any tokens not already released above */
		release_cl_tokens(task, cl);
		client_free(cl_ci);
	}

	send_result(fd, &ca->header, result);
	client_resume(ca->ci_in);
}

/*
 * Report the resource leases currently held by the client pid as a
 * single space-separated state string (see sanlock_args_to_state()),
 * sent back after the reply header.
 */
static void cmd_inquire(struct task *task, struct cmd_args *ca)
{
	struct sm_header h;
	struct token *token;
	struct client *cl;
	char *state = NULL, *str;
	int state_maxlen = 0, state_strlen = 0;
	int res_count = 0, cat_count = 0;
	int fd, i, rv, pid_dead;
	int result = 0;
	int cl_ci = ca->ci_target;
	int cl_fd = ca->cl_fd;
	int cl_pid = ca->cl_pid;

	cl = &client[cl_ci];
	fd = client[ca->ci_in].fd;

	log_debug("cmd_inquire %d,%d,%d ci_in %d fd %d",
		  cl_ci, cl_fd, cl_pid, ca->ci_in, fd);

	/* cl->mutex is held across the whole token scan so the set
	   of tokens cannot change underneath us */
	pthread_mutex_lock(&cl->mutex);

	if (cl->pid_dead) {
		result = -ESTALE;
		goto done;
	}

	for (i = 0; i < SANLK_MAX_RESOURCES; i++) {
		if (cl->tokens[i])
			res_count++;
	}

	if (!res_count) {
		result = 0;
		goto done;
	}

	state_maxlen = res_count * (SANLK_MAX_RES_STR + 1);

	state = malloc(state_maxlen);
	if (!state) {
		result = -ENOMEM;
		goto done;
	}
	memset(state, 0, state_maxlen);

	/* should match sanlock_args_to_state() */

	for (i = 0; i < SANLK_MAX_RESOURCES; i++) {
		token = cl->tokens[i];
		if (!token)
			continue;

		/* check number of tokens hasn't changed since first count */

		if (cat_count >= res_count) {
			log_error("cmd_inquire %d,%d,%d count changed %d %d",
				  cl_ci, cl_fd, cl_pid, res_count, cat_count);
			result = -ENOENT;
			goto done;
		}

		str = NULL;
		rv = sanlock_res_to_str(&token->r, &str);
		if (rv < 0 || !str) {
			log_errot(token, "cmd_inquire %d,%d,%d res_to_str %d",
				  cl_ci, cl_fd, cl_pid, rv);
			result = -ELIBACC;
			goto done;
		}

		if (strlen(str) > SANLK_MAX_RES_STR
- 1) {
			log_errot(token, "cmd_inquire %d,%d,%d strlen %zu",
				  cl_ci, cl_fd, cl_pid, strlen(str));
			free(str);
			result = -ELIBBAD;
			goto done;
		}

		/* space is str separator, so it's invalid within each str */

		if (strstr(str, " ")) {
			log_errot(token, "cmd_inquire %d,%d,%d str space",
				  cl_ci, cl_fd, cl_pid);
			free(str);
			result = -ELIBSCN;
			goto done;
		}

		if (cat_count)
			strcat(state, " ");
		strcat(state, str);
		cat_count++;
		free(str);
	}

	state[state_maxlen - 1] = '\0';
	state_strlen = strlen(state);
	result = 0;
 done:
	pid_dead = cl->pid_dead;
	cl->cmd_active = 0;
	pthread_mutex_unlock(&cl->mutex);

	log_debug("cmd_inquire %d,%d,%d result %d pid_dead %d res_count %d cat_count %d strlen %d",
		  cl_ci, cl_fd, cl_pid, result, pid_dead, res_count, cat_count, state_strlen);

	if (pid_dead) {
		/* pid died while we held the command; we own the cleanup */
		release_cl_tokens(task, cl);
		client_free(cl_ci);
	}

	/* reply header, then the state string (if any) as payload */
	memcpy(&h, &ca->header, sizeof(struct sm_header));
	h.data = result;
	h.data2 = res_count;

	if (state) {
		h.length = sizeof(h) + state_strlen + 1;
		send(fd, &h, sizeof(h), MSG_NOSIGNAL);
		send(fd, state, state_strlen + 1, MSG_NOSIGNAL);
		free(state);
	} else {
		h.length = sizeof(h);
		send(fd, &h, sizeof(h), MSG_NOSIGNAL);
	}

	client_resume(ca->ci_in);
}

/*
 * Write a request record for a resource lease (e.g. asking the current
 * owner to release it), optionally applying force_mode to the owner.
 */
static void cmd_request(struct task *task, struct cmd_args *ca)
{
	struct token *token;
	struct sanlk_resource res;
	uint64_t owner_id;
	uint32_t force_mode;
	int token_len, disks_len;
	int j, fd, rv, error, result;

	fd = client[ca->ci_in].fd;
	force_mode = ca->header.data;

	/* receiving and setting up token copied from cmd_acquire */

	rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL);
	if (rv != sizeof(struct sanlk_resource)) {
		log_error("cmd_request %d,%d recv %d %d",
			  ca->ci_in, fd, rv, errno);
		result = -ENOTCONN;
		goto reply;
	}

	if (!res.num_disks || res.num_disks > SANLK_MAX_DISKS) {
		result = -ERANGE;
		goto reply;
	}

	/* token and its disk array are allocated as one chunk */
	disks_len = res.num_disks * sizeof(struct sync_disk);
	token_len = sizeof(struct token) + disks_len;

	token = malloc(token_len);
	if (!token) {
		result = -ENOMEM;
		goto reply;
	}
	memset(token, 0, token_len);
	token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */
	token->r.num_disks = res.num_disks;
	memcpy(token->r.lockspace_name, res.lockspace_name, SANLK_NAME_LEN);
	memcpy(token->r.name, res.name, SANLK_NAME_LEN);
	token->acquire_lver = res.lver;
	token->acquire_data64 = res.data64;
	token->acquire_data32 = res.data32;
	token->acquire_flags = res.flags;

	/*
	 * receive sanlk_disk's / sync_disk's
	 *
	 * WARNING: as a shortcut, this requires that sync_disk and
	 * sanlk_disk match; this is the reason for the pad fields
	 * in sanlk_disk (TODO: let these differ?)
	 */

	rv = recv(fd, token->disks, disks_len, MSG_WAITALL);
	if (rv != disks_len) {
		result = -ENOTCONN;
		goto reply_free;
	}

	/* zero out pad1 and pad2, see WARNING above */
	for (j = 0; j < token->r.num_disks; j++) {
		token->disks[j].sector_size = 0;
		token->disks[j].fd = -1;
	}

	log_debug("cmd_request %d,%d force_mode %u %.48s:%.48s:%.256s:%llu",
		  ca->ci_in, fd, force_mode,
		  token->r.lockspace_name, token->r.name,
		  token->disks[0].path,
		  (unsigned long long)token->r.disks[0].offset);

	error = request_token(task, token, force_mode, &owner_id);
	if (error < 0) {
		result = error;
		goto reply_free;
	}

	result = 0;

	if (!token->acquire_lver && !force_mode)
		goto reply_free;

	/* flag the current owner so its host status is rechecked */
	if (owner_id)
		host_status_set_bit(token->r.lockspace_name, owner_id);
 reply_free:
	free(token);
 reply:
	log_debug("cmd_request %d,%d done %d", ca->ci_in, fd, result);
	send_result(fd, &ca->header, result);
	client_resume(ca->ci_in);
}

/*
 * Examine request records for a whole lockspace or for one resource
 * (distinguished by the command code) and act on any found.
 */
static void cmd_examine(struct task *task GNUC_UNUSED, struct cmd_args *ca)
{
	union {
		struct sanlk_resource r;
		struct sanlk_lockspace s;
	} buf;
	struct sanlk_resource *res = NULL;
	struct sanlk_lockspace *ls = NULL;
	char *space_name = NULL;
	char *res_name = NULL;
	int fd, rv, result, count = 0, datalen;

	fd = client[ca->ci_in].fd;

	/* payload type depends on which examine command was sent */
	if (ca->header.cmd == SM_CMD_EXAMINE_RESOURCE) {
		datalen = sizeof(struct sanlk_resource);
		res = &buf.r;
	} else {
		datalen = sizeof(struct sanlk_lockspace);
		ls = &buf.s;
	}

	rv = recv(fd, &buf, datalen, MSG_WAITALL);
	if (rv
!= datalen) {
		log_error("cmd_examine %d,%d recv %d %d",
			  ca->ci_in, fd, rv, errno);
		result = -ENOTCONN;
		goto reply;
	}

	if (res) {
		space_name = res->lockspace_name;
		res_name = res->name;
	} else {
		space_name = ls->name;
	}

	log_debug("cmd_examine %d,%d %.48s %.48s",
		  ca->ci_in, fd, space_name, res_name ? res_name : "");

	count = set_resource_examine(space_name, res_name);

	result = 0;
 reply:
	log_debug("cmd_examine %d,%d done %d", ca->ci_in, fd, count);
	send_result(fd, &ca->header, result);
	client_resume(ca->ci_in);
}

/*
 * Join a lockspace: acquire our delta lease (host_id) on its disk.
 * With SANLK_ADD_ASYNC the reply is sent after the start step and the
 * thread then waits for completion without the caller.
 */
static void cmd_add_lockspace(struct cmd_args *ca)
{
	struct sanlk_lockspace lockspace;
	struct space *sp;
	int async = ca->header.cmd_flags & SANLK_ADD_ASYNC;
	int fd, rv, result;

	fd = client[ca->ci_in].fd;

	rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL);
	if (rv != sizeof(struct sanlk_lockspace)) {
		log_error("cmd_add_lockspace %d,%d recv %d %d",
			  ca->ci_in, fd, rv, errno);
		result = -ENOTCONN;
		goto reply;
	}

	log_debug("cmd_add_lockspace %d,%d %.48s:%llu:%s:%llu flags %x",
		  ca->ci_in, fd, lockspace.name,
		  (unsigned long long)lockspace.host_id,
		  lockspace.host_id_disk.path,
		  (unsigned long long)lockspace.host_id_disk.offset,
		  ca->header.cmd_flags);

	rv = add_lockspace_start(&lockspace, &sp);
	if (rv < 0) {
		result = rv;
		goto reply;
	}

	if (async) {
		/* reply immediately, then finish the join on this thread */
		result = rv;
		log_debug("cmd_add_lockspace %d,%d async done %d",
			  ca->ci_in, fd, result);
		send_result(fd, &ca->header, result);
		client_resume(ca->ci_in);
		add_lockspace_wait(sp);
		return;
	}

	result = add_lockspace_wait(sp);
 reply:
	log_debug("cmd_add_lockspace %d,%d done %d", ca->ci_in, fd, result);
	send_result(fd, &ca->header, result);
	client_resume(ca->ci_in);
}

/*
 * Query whether a lockspace is currently joined on this host.
 */
static void cmd_inq_lockspace(struct cmd_args *ca)
{
	struct sanlk_lockspace lockspace;
	int fd, rv, result;

	fd = client[ca->ci_in].fd;

	rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL);
	if (rv != sizeof(struct sanlk_lockspace)) {
		log_error("cmd_inq_lockspace %d,%d recv %d %d",
			  ca->ci_in, fd, rv, errno);
		result = -ENOTCONN;
		goto reply;
	}

	log_debug("cmd_inq_lockspace %d,%d %.48s:%llu:%s:%llu",
		  ca->ci_in, fd, lockspace.name,
		  (unsigned long long)lockspace.host_id,
		  lockspace.host_id_disk.path,
		  (unsigned long long)lockspace.host_id_disk.offset);

	result = inq_lockspace(&lockspace);
 reply:
	log_debug("cmd_inq_lockspace %d,%d done %d", ca->ci_in, fd, result);
	send_result(fd, &ca->header, result);
	client_resume(ca->ci_in);
}

/*
 * TODO: rem_lockspace works like a renewal failure would, and abandons
 * resource leases (tokens) without releasing them.  Unlike the renewal
 * failure case, rem_lockspace most likely releases the host_id.
 *
 * What might be nice is an option where rem_lockspace would try to
 * release resource leases before releasing the lockspace host_id.
 * (We don't really want to be releasing tokens after we've released
 * our host_id for the token's lockspace.)
 *
 * - kill all pids (by looking at struct resource pid?)
 * - wait for all pids to exit
 *   o have us or other thread release their tokens/resources
 *   o wait for tokens/resources to be released, although the release
 *     may fail or time out, we don't want to wait too long
 * - set sp->external_remove
 * - main_loop sets sp->thread_stop (should find no pids)
 * - main_loop unlinks watchdog
 * - lockspace_thread releases host_id
 *
 * The aim is that we kill pids and wait for resources to be released
 * before main_loop gets involved and before the lockspace_thread is
 * told to stop.
 *
 * An alternative messy is to add another condition to the current
 * main_loop checks:
 *
 * if (sp->killing_pids && all_pids_dead(sp) && all_tokens_released(sp)) {
 *	sp->thread_stop = 1;
 *	unlink_watchdog_file(sp);
 *	list_move(spaces_rem);
 * }
 *
 * all_tokens_released would just return 1 in case we're not doing
 * the releases
 *
 * release_token_async would need to learn to put the resources onto
 * dispose list in this case
 *
 * consider using the resources/dispose_resources list for all_pids_dead
 * and kill_pids?  instead of the clients[].tokens[] loops?
actually,
 * could we remove tokens and cl->tokens altogether and just use the
 * resources list? */

/*
 * Leave a lockspace: stop renewing our host_id delta lease.  Pids with
 * tokens in the lockspace are abandoned (see TODO above); with
 * SANLK_REM_UNUSED the removal is refused while the lockspace is used.
 */
static void cmd_rem_lockspace(struct cmd_args *ca)
{
	struct sanlk_lockspace lockspace;
	int async = ca->header.cmd_flags & SANLK_REM_ASYNC;
	int fd, rv, result;
	unsigned int space_id;

	fd = client[ca->ci_in].fd;

	rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL);
	if (rv != sizeof(struct sanlk_lockspace)) {
		log_error("cmd_rem_lockspace %d,%d recv %d %d",
			  ca->ci_in, fd, rv, errno);
		result = -ENOTCONN;
		goto reply;
	}

	log_debug("cmd_rem_lockspace %d,%d %.48s flags %x",
		  ca->ci_in, fd, lockspace.name, ca->header.cmd_flags);

	if (ca->header.cmd_flags & SANLK_REM_UNUSED) {
		if (lockspace_is_used(&lockspace)) {
			result = -EBUSY;
			goto reply;
		}
	}

	rv = rem_lockspace_start(&lockspace, &space_id);
	if (rv < 0) {
		result = rv;
		goto reply;
	}

	if (async) {
		/* reply immediately, then finish the removal on this thread */
		result = rv;
		log_debug("cmd_rem_lockspace %d,%d async done %d",
			  ca->ci_in, fd, result);
		send_result(fd, &ca->header, result);
		client_resume(ca->ci_in);
		rem_lockspace_wait(&lockspace, space_id);
		return;
	}

	result = rem_lockspace_wait(&lockspace, space_id);
 reply:
	log_debug("cmd_rem_lockspace %d,%d done %d", ca->ci_in, fd, result);
	send_result(fd, &ca->header, result);
	client_resume(ca->ci_in);
}

/*
 * Report the required lease alignment for a disk (direct_align()).
 */
static void cmd_align(struct task *task GNUC_UNUSED, struct cmd_args *ca)
{
	struct sanlk_disk disk;
	struct sync_disk sd;
	int fd, rv, result;

	fd = client[ca->ci_in].fd;

	rv = recv(fd, &disk, sizeof(struct sanlk_disk), MSG_WAITALL);
	if (rv != sizeof(struct sanlk_disk)) {
		log_error("cmd_align %d,%d recv %d %d",
			  ca->ci_in, fd, rv, errno);
		result = -ENOTCONN;
		goto reply;
	}

	log_debug("cmd_align %d,%d", ca->ci_in, fd);

	if (!disk.path[0]) {
		result = -ENODEV;
		goto reply;
	}

	/* copy the sanlk_disk over the start of sync_disk; relies on
	   matching layouts, same shortcut as noted in cmd_request */
	memset(&sd, 0, sizeof(struct sync_disk));
	memcpy(&sd, &disk, sizeof(struct sanlk_disk));
	sd.fd = -1;

	rv = open_disk(&sd);
	if (rv < 0) {
		result = -ENODEV;
		goto reply;
	}

	result = direct_align(&sd);

	close_disks(&sd, 1);
 reply:
	log_debug("cmd_align %d,%d done %d", ca->ci_in, fd, result);
	send_result(fd, &ca->header, result);
	client_resume(ca->ci_in);
}

/*
 * Initialize on-disk delta lease area for a lockspace
 * (delta_lease_init(); header.data carries max_hosts).
 */
static void cmd_init_lockspace(struct task *task, struct cmd_args *ca)
{
	struct sanlk_lockspace lockspace;
	struct sync_disk sd;
	int fd, rv, result;

	fd = client[ca->ci_in].fd;

	rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL);
	if (rv != sizeof(struct sanlk_lockspace)) {
		log_error("cmd_init_lockspace %d,%d recv %d %d",
			  ca->ci_in, fd, rv, errno);
		result = -ENOTCONN;
		goto reply;
	}

	log_debug("cmd_init_lockspace %d,%d %.48s:%llu:%s:%llu",
		  ca->ci_in, fd, lockspace.name,
		  (unsigned long long)lockspace.host_id,
		  lockspace.host_id_disk.path,
		  (unsigned long long)lockspace.host_id_disk.offset);

	if (!lockspace.host_id_disk.path[0]) {
		result = -ENODEV;
		goto reply;
	}

	memset(&sd, 0, sizeof(struct sync_disk));
	memcpy(&sd, &lockspace.host_id_disk, sizeof(struct sanlk_disk));
	sd.fd = -1;

	rv = open_disk(&sd);
	if (rv < 0) {
		result = -ENODEV;
		goto reply;
	}

	result = delta_lease_init(task, &sd, lockspace.name, ca->header.data);

	close_disks(&sd, 1);
 reply:
	log_debug("cmd_init_lockspace %d,%d done %d", ca->ci_in, fd, result);
	send_result(fd, &ca->header, result);
	client_resume(ca->ci_in);
}

/*
 * Initialize on-disk paxos lease area for a resource
 * (paxos_lease_init(); header.data/data2 carry init parameters).
 */
static void cmd_init_resource(struct task *task, struct cmd_args *ca)
{
	struct token *token = NULL;
	struct sanlk_resource res;
	int token_len, disks_len;
	int j, fd, rv, result;

	fd = client[ca->ci_in].fd;

	/* receiving and setting up token copied from cmd_acquire */

	rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL);
	if (rv != sizeof(struct sanlk_resource)) {
		log_error("cmd_init_resource %d,%d recv %d %d",
			  ca->ci_in, fd, rv, errno);
		result = -ENOTCONN;
		goto reply;
	}

	if (!res.num_disks || res.num_disks > SANLK_MAX_DISKS) {
		result = -ERANGE;
		goto reply;
	}

	disks_len = res.num_disks * sizeof(struct sync_disk);
	token_len = sizeof(struct token) + disks_len;

	token = malloc(token_len);
	if (!token) {
		result = -ENOMEM;
		goto reply;
	}
	memset(token, 0, token_len);
	token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */
	token->r.num_disks = res.num_disks;
	memcpy(token->r.lockspace_name, res.lockspace_name, SANLK_NAME_LEN);
	memcpy(token->r.name, res.name, SANLK_NAME_LEN);

	/*
	 * receive sanlk_disk's / sync_disk's
	 *
	 * WARNING: as a shortcut, this requires that sync_disk and
	 * sanlk_disk match; this is the reason for the pad fields
	 * in sanlk_disk (TODO: let these differ?)
	 */

	rv = recv(fd, token->disks, disks_len, MSG_WAITALL);
	if (rv != disks_len) {
		result = -ENOTCONN;
		goto reply;
	}

	/* zero out pad1 and pad2, see WARNING above */
	for (j = 0; j < token->r.num_disks; j++) {
		token->disks[j].sector_size = 0;
		token->disks[j].fd = -1;
	}

	log_debug("cmd_init_resource %d,%d %.48s:%.48s:%.256s:%llu",
		  ca->ci_in, fd,
		  token->r.lockspace_name, token->r.name,
		  token->disks[0].path,
		  (unsigned long long)token->r.disks[0].offset);

	rv = open_disks(token->disks, token->r.num_disks);
	if (rv < 0) {
		result = rv;
		goto reply;
	}

	result = paxos_lease_init(task, token, ca->header.data, ca->header.data2);

	close_disks(token->disks, token->r.num_disks);
 reply:
	if (token)
		free(token);
	log_debug("cmd_init_resource %d,%d done %d", ca->ci_in, fd, result);
	send_result(fd, &ca->header, result);
	client_resume(ca->ci_in);
}

/*
 * Entry point for worker threads: dispatch a queued command to its
 * handler.  For lockspace commands the owner_name is set here for
 * status reporting.
 */
void call_cmd_thread(struct task *task, struct cmd_args *ca)
{
	switch (ca->header.cmd) {
	case SM_CMD_ACQUIRE:
		cmd_acquire(task, ca);
		break;
	case SM_CMD_RELEASE:
		cmd_release(task, ca);
		break;
	case SM_CMD_INQUIRE:
		cmd_inquire(task, ca);
		break;
	case SM_CMD_REQUEST:
		cmd_request(task, ca);
		break;
	case SM_CMD_ADD_LOCKSPACE:
		strcpy(client[ca->ci_in].owner_name, "add_lockspace");
		cmd_add_lockspace(ca);
		break;
	case SM_CMD_INQ_LOCKSPACE:
		strcpy(client[ca->ci_in].owner_name, "inq_lockspace");
		cmd_inq_lockspace(ca);
		break;
	case SM_CMD_REM_LOCKSPACE:
		strcpy(client[ca->ci_in].owner_name, "rem_lockspace");
		cmd_rem_lockspace(ca);
		break;
	case SM_CMD_ALIGN:
		cmd_align(task, ca);
		break;
	case SM_CMD_INIT_LOCKSPACE:
		cmd_init_lockspace(task, ca);
		break;
	case SM_CMD_INIT_RESOURCE:
cmd_init_resource(task, ca);
		break;
	case SM_CMD_EXAMINE_LOCKSPACE:
	case SM_CMD_EXAMINE_RESOURCE:
		cmd_examine(task, ca);
		break;
	};
}

/*
 * sanlock client status
 *
 * 1. send_state_daemon
 *
 * 2. for each cl in clients
 *	send_state_client() [sanlk_state + str_len]
 *
 * 3. for each sp in spaces, spaces_add, spaces_rem
 *	send_state_lockspace() [sanlk_state + str_len + sanlk_lockspace]
 *
 * 4. for each r in resources, dispose_resources
 *	send_state_resource() [sanlk_state + str_len + sanlk_resource + sanlk_disk * num_disks]
 *
 * sanlock client host_status
 *
 * 1. for each hs in sp->host_status
 *	send_state_host()
 */

/* Format the daemon's global settings into str; returns strlen+1
   (the byte count sent over the socket, including the NUL). */
static int print_state_daemon(char *str)
{
	memset(str, 0, SANLK_STATE_MAXSTR);

	snprintf(str, SANLK_STATE_MAXSTR-1,
		 "our_host_name=%s "
		 "use_aio=%d "
		 "io_timeout=%d "
		 "id_renewal=%d "
		 "id_renewal_fail=%d "
		 "id_renewal_warn=%d "
		 "monotime=%llu",
		 our_host_name_global,
		 main_task.use_aio,
		 main_task.io_timeout_seconds,
		 main_task.id_renewal_seconds,
		 main_task.id_renewal_fail_seconds,
		 main_task.id_renewal_warn_seconds,
		 (unsigned long long)monotime());

	return strlen(str) + 1;
}

/* Format one client slot's state into str; returns strlen+1. */
static int print_state_client(struct client *cl, int ci, char *str)
{
	memset(str, 0, SANLK_STATE_MAXSTR);

	snprintf(str, SANLK_STATE_MAXSTR-1,
		 "ci=%d "
		 "fd=%d "
		 "pid=%d "
		 "restrict=%x "
		 "cmd_active=%d "
		 "cmd_last=%d "
		 "pid_dead=%d "
		 "kill_count=%d "
		 "kill_last=%llu "
		 "suspend=%d "
		 "need_free=%d",
		 ci,
		 cl->fd,
		 cl->pid,
		 cl->restrict,
		 cl->cmd_active,
		 cl->cmd_last,
		 cl->pid_dead,
		 cl->kill_count,
		 (unsigned long long)cl->kill_last,
		 cl->suspend,
		 cl->need_free);

	return strlen(str) + 1;
}

/* Format one lockspace's state into str; list_name tags which list
   (spaces/spaces_add/spaces_rem) the space is on.  Returns strlen+1. */
static int print_state_lockspace(struct space *sp, char *str, const char *list_name)
{
	memset(str, 0, SANLK_STATE_MAXSTR);

	snprintf(str, SANLK_STATE_MAXSTR-1,
		 "list=%s "
		 "space_id=%u "
		 "host_generation=%llu "
		 "space_dead=%d "
		 "killing_pids=%d "
		 "corrupt_result=%d "
		 "acquire_last_result=%d "
		 "renewal_last_result=%d "
		 "acquire_last_attempt=%llu "
		 "acquire_last_success=%llu "
		 "renewal_last_attempt=%llu "
		 "renewal_last_success=%llu",
		 list_name,
		 sp->space_id,
		 (unsigned long long)sp->host_generation,
		 sp->space_dead,
		 sp->killing_pids,
		 sp->lease_status.corrupt_result,
		 sp->lease_status.acquire_last_result,
		 sp->lease_status.renewal_last_result,
		 (unsigned long long)sp->lease_status.acquire_last_attempt,
		 (unsigned long long)sp->lease_status.acquire_last_success,
		 (unsigned long long)sp->lease_status.renewal_last_attempt,
		 (unsigned long long)sp->lease_status.renewal_last_success);

	return strlen(str) + 1;
}

/* Format one resource's state into str; returns strlen+1. */
static int print_state_resource(struct resource *r, char *str, const char *list_name, uint32_t token_id)
{
	memset(str, 0, SANLK_STATE_MAXSTR);

	snprintf(str, SANLK_STATE_MAXSTR-1,
		 "list=%s "
		 "flags=%x "
		 "lver=%llu "
		 "token_id=%u",
		 list_name,
		 r->flags,
		 (unsigned long long)r->leader.lver,
		 token_id);

	return strlen(str) + 1;
}

/* Format one host's delta-lease status into str; returns strlen+1. */
static int print_state_host(struct host_status *hs, char *str)
{
	memset(str, 0, SANLK_STATE_MAXSTR);

	snprintf(str, SANLK_STATE_MAXSTR-1,
		 "last_check=%llu "
		 "last_live=%llu "
		 "last_req=%llu "
		 "owner_id=%llu "
		 "owner_generation=%llu "
		 "timestamp=%llu",
		 (unsigned long long)hs->last_check,
		 (unsigned long long)hs->last_live,
		 (unsigned long long)hs->last_req,
		 (unsigned long long)hs->owner_id,
		 (unsigned long long)hs->owner_generation,
		 (unsigned long long)hs->timestamp);

	return strlen(str) + 1;
}

/* Send a SANLK_STATE_DAEMON record: sanlk_state header then the
   state string (layout described in the comment block above). */
static void send_state_daemon(int fd)
{
	struct sanlk_state st;
	char str[SANLK_STATE_MAXSTR];
	int str_len;

	memset(&st, 0, sizeof(st));
	strncpy(st.name, our_host_name_global, NAME_ID_SIZE);
	st.type = SANLK_STATE_DAEMON;

	str_len = print_state_daemon(str);

	st.str_len = str_len;

	send(fd, &st, sizeof(st), MSG_NOSIGNAL);
	if (str_len)
		send(fd, str, str_len, MSG_NOSIGNAL);
}

/* Send a SANLK_STATE_CLIENT record for one client slot. */
static void send_state_client(int fd, struct client *cl, int ci)
{
	struct sanlk_state st;
	char str[SANLK_STATE_MAXSTR];
	int str_len;

	memset(&st, 0, sizeof(st));
	st.type = SANLK_STATE_CLIENT;
	st.data32 = cl->pid;
	strncpy(st.name, cl->owner_name, NAME_ID_SIZE);

	str_len = print_state_client(cl, ci, str);

	st.str_len = str_len;

	send(fd, &st, sizeof(st), MSG_NOSIGNAL);
	if (str_len)
		send(fd, str, str_len, MSG_NOSIGNAL);
}

/* Send a SANLK_STATE_LOCKSPACE record followed by the raw
   sanlk_lockspace struct. */
static void send_state_lockspace(int fd, struct space *sp, const char *list_name)
{
	struct sanlk_state st;
	struct sanlk_lockspace lockspace;
	char str[SANLK_STATE_MAXSTR];
	int str_len;

	memset(&st, 0, sizeof(st));
	st.type = SANLK_STATE_LOCKSPACE;
	st.data64 = sp->host_id;
	strncpy(st.name, sp->space_name, NAME_ID_SIZE);

	str_len = print_state_lockspace(sp, str, list_name);

	st.str_len = str_len;

	send(fd, &st, sizeof(st), MSG_NOSIGNAL);
	if (str_len)
		send(fd, str, str_len, MSG_NOSIGNAL);

	memset(&lockspace, 0, sizeof(struct sanlk_lockspace));
	strncpy(lockspace.name, sp->space_name, NAME_ID_SIZE);
	lockspace.host_id = sp->host_id;
	memcpy(&lockspace.host_id_disk, &sp->host_id_disk, sizeof(struct sanlk_disk));

	send(fd, &lockspace, sizeof(lockspace), MSG_NOSIGNAL);
}

/* Non-static: called back from resource.c's send_state_resources()
   for each resource on its private lists. */
void send_state_resource(int fd, struct resource *r, const char *list_name, int pid, uint32_t token_id);

void send_state_resource(int fd, struct resource *r, const char *list_name, int pid, uint32_t token_id)
{
	struct sanlk_state st;
	char str[SANLK_STATE_MAXSTR];
	int str_len;
	int i;

	memset(&st, 0, sizeof(st));
	st.type = SANLK_STATE_RESOURCE;
	st.data32 = pid;
	st.data64 = r->leader.lver;
	strncpy(st.name, r->r.name, NAME_ID_SIZE);

	str_len = print_state_resource(r, str, list_name, token_id);

	st.str_len = str_len;

	send(fd, &st, sizeof(st), MSG_NOSIGNAL);
	if (str_len)
		send(fd, str, str_len, MSG_NOSIGNAL);

	send(fd, &r->r, sizeof(struct sanlk_resource), MSG_NOSIGNAL);

	for (i = 0; i < r->r.num_disks; i++) {
		send(fd, &r->r.disks[i], sizeof(struct sanlk_disk), MSG_NOSIGNAL);
	}
}

/* Send a SANLK_STATE_HOST record for one host_id slot. */
static void send_state_host(int fd, struct host_status *hs, int host_id)
{
	struct sanlk_state st;
	char str[SANLK_STATE_MAXSTR];
	int str_len;

	memset(&st, 0, sizeof(st));
	st.type = SANLK_STATE_HOST;
	st.data32 = host_id;
	st.data64 = hs->timestamp;

	str_len = print_state_host(hs, str);

	st.str_len = str_len;

	send(fd, &st, sizeof(st),
MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); } static void cmd_status(int fd, struct sm_header *h_recv, int client_maxi) { struct sm_header h; struct client *cl; struct space *sp; int ci; memset(&h, 0, sizeof(h)); memcpy(&h, h_recv, sizeof(struct sm_header)); h.length = sizeof(h); h.data = 0; send(fd, &h, sizeof(h), MSG_NOSIGNAL); send_state_daemon(fd); if (h_recv->data == SANLK_STATE_DAEMON) return; for (ci = 0; ci <= client_maxi; ci++) { cl = &client[ci]; if (!cl->used) continue; send_state_client(fd, cl, ci); } if (h_recv->data == SANLK_STATE_CLIENT) return; pthread_mutex_lock(&spaces_mutex); list_for_each_entry(sp, &spaces, list) send_state_lockspace(fd, sp, "spaces"); list_for_each_entry(sp, &spaces_rem, list) send_state_lockspace(fd, sp, "spaces_rem"); list_for_each_entry(sp, &spaces_rem, list) send_state_lockspace(fd, sp, "spaces_add"); pthread_mutex_unlock(&spaces_mutex); if (h_recv->data == SANLK_STATE_LOCKSPACE) return; /* resource.c will iterate through private lists and call back here for each r */ send_state_resources(fd); } static void cmd_host_status(int fd, struct sm_header *h_recv) { struct sm_header h; struct sanlk_lockspace lockspace; struct space *sp; struct host_status *hs, *status = NULL; int status_len; int i, rv; memset(&h, 0, sizeof(h)); memcpy(&h, h_recv, sizeof(struct sm_header)); h.length = sizeof(h); h.data = 0; status_len = sizeof(struct host_status) * DEFAULT_MAX_HOSTS; status = malloc(status_len); if (!status) { h.data = -ENOMEM; goto fail; } rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { h.data = -ENOTCONN; goto fail; } pthread_mutex_lock(&spaces_mutex); sp = find_lockspace(lockspace.name); if (sp) memcpy(status, &sp->host_status, status_len); pthread_mutex_unlock(&spaces_mutex); if (!sp) { h.data = -ENOSPC; goto fail; } send(fd, &h, sizeof(h), MSG_NOSIGNAL); for (i = 0; i < DEFAULT_MAX_HOSTS; i++) { hs = &status[i]; if (!hs->last_live && 
!hs->owner_id)
			continue;
		/* host_ids are 1-based; slot i holds host_id i+1 */
		send_state_host(fd, hs, i+1);
	}

	if (status)
		free(status);
	return;
 fail:
	send(fd, &h, sizeof(h), MSG_NOSIGNAL);
	if (status)
		free(status);
}

/* static buffer so the dump doesn't need a per-request allocation;
   only used from the daemon thread handling SM_CMD_LOG_DUMP */
static char send_log_dump[LOG_DUMP_SIZE];

/* Send the in-memory debug log buffer to the client. */
static void cmd_log_dump(int fd, struct sm_header *h_recv)
{
	int len;

	copy_log_dump(send_log_dump, &len);

	h_recv->data = len;

	send(fd, h_recv, sizeof(struct sm_header), MSG_NOSIGNAL);
	send(fd, send_log_dump, len, MSG_NOSIGNAL);
}

/* Record restriction flags (e.g. SANLK_RESTRICT_SIGKILL) for this
   registered client connection. */
static void cmd_restrict(int ci, int fd, struct sm_header *h_recv)
{
	log_debug("cmd_restrict ci %d fd %d pid %d flags %x",
		  ci, fd, client[ci].pid, h_recv->cmd_flags);

	client[ci].restrict = h_recv->cmd_flags;

	send_result(fd, h_recv, 0);
}

/* Look up the pid on the other end of a unix socket via SO_PEERCRED.
   Returns 0 on success, -1 on failure. */
static int get_peer_pid(int fd, int *pid)
{
	struct ucred cred;
	unsigned int len = sizeof(cred);

	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) != 0)
		return -1;

	*pid = cred.pid;
	return 0;
}

/*
 * Handle commands processed directly on the daemon/main thread (as
 * opposed to call_cmd_thread() on workers).  Connections are closed
 * after the command unless the command keeps the connection open
 * (REGISTER, RESTRICT set auto_close = 0).
 */
void call_cmd_daemon(int ci, struct sm_header *h_recv, int client_maxi)
{
	int rv, pid, auto_close = 1;
	int fd = client[ci].fd;

	switch (h_recv->cmd) {
	case SM_CMD_REGISTER:
		rv = get_peer_pid(fd, &pid);
		if (rv < 0) {
			log_error("cmd_register ci %d fd %d get pid failed", ci, fd);
			break;
		}
		log_debug("cmd_register ci %d fd %d pid %d", ci, fd, pid);
		snprintf(client[ci].owner_name, SANLK_NAME_LEN, "%d", pid);
		client[ci].pid = pid;
		client[ci].deadfn = client_pid_dead;
		auto_close = 0;
		break;
	case SM_CMD_RESTRICT:
		cmd_restrict(ci, fd, h_recv);
		auto_close = 0;
		break;
	case SM_CMD_SHUTDOWN:
		strcpy(client[ci].owner_name, "shutdown");
		if (h_recv->data) {
			/* force */
			external_shutdown = 2;
		} else {
			/* graceful: refuse while any lockspace exists */
			pthread_mutex_lock(&spaces_mutex);
			if (list_empty(&spaces) &&
			    list_empty(&spaces_rem) &&
			    list_empty(&spaces_add))
				external_shutdown = 1;
			else
				log_debug("ignore shutdown, lockspace exists");
			pthread_mutex_unlock(&spaces_mutex);
		}
		break;
	case SM_CMD_STATUS:
		strcpy(client[ci].owner_name, "status");
		cmd_status(fd, h_recv, client_maxi);
		break;
	case SM_CMD_HOST_STATUS:
		strcpy(client[ci].owner_name, "host_status");
		cmd_host_status(fd,
h_recv); break; case SM_CMD_LOG_DUMP: strcpy(client[ci].owner_name, "log_dump"); cmd_log_dump(fd, h_recv); break; }; if (auto_close) close(fd); } sanlock-2.2/src/task.c0000644000175100017510000001350711751766670013726 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "log.h" #include "task.h" void setup_task_timeouts(struct task *task, int io_timeout_arg) { int io_timeout_seconds = io_timeout_arg; int id_renewal_seconds = 2 * io_timeout_seconds; int id_renewal_fail_seconds = 8 * io_timeout_seconds; int id_renewal_warn_seconds = 6 * io_timeout_seconds; /* those above are chosen by us, the rest are based on them */ int host_dead_seconds = id_renewal_fail_seconds + WATCHDOG_FIRE_TIMEOUT; int delta_large_delay = id_renewal_seconds + (6 * io_timeout_seconds); int delta_short_delay = 2 * io_timeout_seconds; int max = host_dead_seconds; if (delta_large_delay > max) max = delta_large_delay; int delta_acquire_held_max = max + delta_short_delay + (4 * io_timeout_seconds); int delta_acquire_held_min = max; int delta_acquire_free_max = delta_short_delay + (3 * io_timeout_seconds); int delta_acquire_free_min = delta_short_delay; int delta_renew_max = 2 * io_timeout_seconds; int delta_renew_min = 0; int paxos_acquire_held_max = host_dead_seconds + (7 * io_timeout_seconds); int paxos_acquire_held_min = host_dead_seconds; int paxos_acquire_free_max = 6 * io_timeout_seconds; int paxos_acquire_free_min = 0; int request_finish_seconds = 3 * id_renewal_seconds; /* random */ task->io_timeout_seconds = io_timeout_seconds; task->id_renewal_seconds = id_renewal_seconds; 
task->id_renewal_fail_seconds = id_renewal_fail_seconds;
	task->id_renewal_warn_seconds = id_renewal_warn_seconds;

	task->host_dead_seconds = host_dead_seconds;
	task->request_finish_seconds = request_finish_seconds;

	/* interval between each kill count is approx 1 sec, so we
	   spend about 10 seconds sending 10 SIGTERMs to a pid, then
	   send SIGKILLs to it.  after 60 attempts the watchdog should
	   have fired if the kills are due to failed renewal; otherwise
	   we just give up at that point */
	task->kill_count_term = 10;
	task->kill_count_max = 60;

	/* the rest are calculated as needed in place */

	/* hack to make just main thread log this info */
	if (strcmp(task->name, "main"))
		return;

	log_debug("io_timeout_seconds %d", io_timeout_seconds);
	log_debug("id_renewal_seconds %d", id_renewal_seconds);
	log_debug("id_renewal_fail_seconds %d", id_renewal_fail_seconds);
	log_debug("id_renewal_warn_seconds %d", id_renewal_warn_seconds);
	log_debug("host_dead_seconds %d", host_dead_seconds);
	log_debug("delta_large_delay %d", delta_large_delay);
	log_debug("delta_short_delay %d", delta_short_delay);
	log_debug("delta_acquire_held_max %d", delta_acquire_held_max);
	log_debug("delta_acquire_held_min %d", delta_acquire_held_min);
	log_debug("delta_acquire_free_max %d", delta_acquire_free_max);
	log_debug("delta_acquire_free_min %d", delta_acquire_free_min);
	log_debug("delta_renew_max %d", delta_renew_max);
	log_debug("delta_renew_min %d", delta_renew_min);
	log_debug("paxos_acquire_held_max %d", paxos_acquire_held_max);
	log_debug("paxos_acquire_held_min %d", paxos_acquire_held_min);
	log_debug("paxos_acquire_free_max %d", paxos_acquire_free_max);
	log_debug("paxos_acquire_free_min %d", paxos_acquire_free_min);
	log_debug("request_finish_seconds %d", request_finish_seconds);
}

/*
 * Initialize a task's async-io context (libaio) with cb_size
 * concurrent callback slots; on any failure falls back to
 * synchronous io by clearing task->use_aio.
 */
void setup_task_aio(struct task *task, int use_aio, int cb_size)
{
	int rv;

	task->use_aio = use_aio;

	memset(&task->aio_ctx, 0, sizeof(task->aio_ctx));

	/* main task doesn't actually do disk io so it passes in,
	 * cb_size 0, but it still wants use_aio set for other
	 * tasks to copy */

	if (!use_aio)
		return;

	if (!cb_size)
		return;

	rv = io_setup(cb_size, &task->aio_ctx);
	if (rv < 0)
		goto fail;

	task->cb_size = cb_size;
	task->callbacks = malloc(cb_size * sizeof(struct aicb));
	if (!task->callbacks) {
		rv = -ENOMEM;
		goto fail_setup;
	}
	memset(task->callbacks, 0, cb_size * sizeof(struct aicb));
	return;

 fail_setup:
	io_destroy(task->aio_ctx);
 fail:
	task->use_aio = 0;
}

/*
 * Tear down a task's aio context: reap any still-outstanding events
 * (logging a warning roughly every io_timeout_seconds while waiting),
 * then destroy the context and free the callback slots and buffers.
 */
void close_task_aio(struct task *task)
{
	struct timespec ts;
	struct io_event event;
	uint64_t last_warn;
	int rv, i, used, warn;

	if (!task->use_aio)
		goto skip_aio;

	memset(&ts, 0, sizeof(struct timespec));
	ts.tv_sec = task->io_timeout_seconds;

	last_warn = time(NULL);

	/* wait for all outstanding aio to complete before
	   destroying aio context, freeing iocb and buffers */

	while (1) {
		warn = 0;

		if (time(NULL) - last_warn >= task->io_timeout_seconds) {
			last_warn = time(NULL);
			warn = 1;
		}

		used = 0;

		for (i = 0; i < task->cb_size; i++) {
			if (!task->callbacks[i].used)
				continue;
			used++;

			if (!warn)
				continue;

			log_taske(task, "close_task_aio %d %p busy",
				  i, &task->callbacks[i]);
		}

		if (!used)
			break;

		memset(&event, 0, sizeof(event));

		rv = io_getevents(task->aio_ctx, 1, 1, &event, &ts);
		if (rv == -EINTR)
			continue;
		if (rv < 0)
			break;
		if (rv == 1) {
			/* recover the aicb that owns this completed iocb
			   and release its buffer */
			struct iocb *ev_iocb = event.obj;
			struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb);

			if (ev_aicb->buf == task->iobuf)
				task->iobuf = NULL;

			log_taske(task, "aio collect %p:%p:%p result %ld:%ld close free",
				  ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2);

			ev_aicb->used = 0;
			free(ev_aicb->buf);
			ev_aicb->buf = NULL;
		}
	}
	io_destroy(task->aio_ctx);

	if (task->iobuf)
		free(task->iobuf);

 skip_aio:
	if (task->callbacks)
		free(task->callbacks);
	task->callbacks = NULL;
}
sanlock-2.2/src/main.c0000644000175100017510000013660011751766670013710 0ustar  weberweber/* * Copyright 2010-2011 Red Hat, Inc.
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define EXTERN #include "sanlock_internal.h" #include "sanlock_sock.h" #include "sanlock_resource.h" #include "sanlock_admin.h" #include "diskio.h" #include "log.h" #include "lockspace.h" #include "resource.h" #include "direct.h" #include "lockfile.h" #include "watchdog.h" #include "task.h" #include "client_cmd.h" #include "cmd.h" #define RELEASE_VERSION "2.2" struct thread_pool { int num_workers; int max_workers; int free_workers; int quit; struct list_head work_data; pthread_mutex_t mutex; pthread_cond_t cond; pthread_cond_t quit_wait; }; /* priorities are LOG_* from syslog.h */ int log_logfile_priority = LOG_WARNING; int log_syslog_priority = LOG_ERR; int log_stderr_priority = -1; /* -D sets this to LOG_DEBUG */ #define CLIENT_NALLOC 1024 static int client_maxi; static int client_size = 0; static struct pollfd *pollfd; static char command[COMMAND_MAX]; static int cmd_argc; static char **cmd_argv; static struct thread_pool pool; static struct random_data rand_data; static char rand_state[32]; static pthread_mutex_t rand_mutex = PTHREAD_MUTEX_INITIALIZER; /* FIXME: add a mutex for client array so we don't try to expand it while a cmd thread is using it. Or, with a thread pool we know when cmd threads are running and can expand when none are. 
*/ static int client_alloc(void) { int i; client = malloc(CLIENT_NALLOC * sizeof(struct client)); pollfd = malloc(CLIENT_NALLOC * sizeof(struct pollfd)); if (!client || !pollfd) { log_error("can't alloc for client or pollfd array"); return -ENOMEM; } for (i = 0; i < CLIENT_NALLOC; i++) { memset(&client[i], 0, sizeof(struct client)); memset(&pollfd[i], 0, sizeof(struct pollfd)); pthread_mutex_init(&client[i].mutex, NULL); client[i].fd = -1; client[i].pid = -1; pollfd[i].fd = -1; pollfd[i].events = 0; } client_size = CLIENT_NALLOC; return 0; } static void _client_free(int ci) { struct client *cl = &client[ci]; if (!cl->used) { /* should never happen */ log_error("client_free ci %d not used", ci); goto out; } if (cl->pid != -1) { /* client_pid_dead() should have set pid to -1 */ /* should never happen */ log_error("client_free ci %d live pid %d", ci, cl->pid); goto out; } if (cl->fd == -1) { /* should never happen */ log_error("client_free ci %d is free", ci); goto out; } if (cl->suspend) { log_debug("client_free ci %d is suspended", ci); cl->need_free = 1; goto out; } if (cl->fd != -1) close(cl->fd); cl->used = 0; cl->fd = -1; cl->pid = -1; cl->cmd_active = 0; cl->pid_dead = 0; cl->suspend = 0; cl->need_free = 0; cl->kill_count = 0; cl->kill_last = 0; cl->restrict = 0; memset(cl->owner_name, 0, sizeof(cl->owner_name)); cl->workfn = NULL; cl->deadfn = NULL; memset(cl->tokens, 0, sizeof(struct token *) * SANLK_MAX_RESOURCES); /* make poll() ignore this connection */ pollfd[ci].fd = -1; pollfd[ci].events = 0; out: return; } void client_free(int ci); void client_free(int ci) { struct client *cl = &client[ci]; pthread_mutex_lock(&cl->mutex); _client_free(ci); pthread_mutex_unlock(&cl->mutex); } /* the connection that we suspend and resume may or may not be the same connection as the target client where we set cmd_active */ static int client_suspend(int ci) { struct client *cl = &client[ci]; int rv = 0; pthread_mutex_lock(&cl->mutex); if (!cl->used) { /* should never 
happen */ log_error("client_suspend ci %d not used", ci); rv = -1; goto out; } if (cl->fd == -1) { /* should never happen */ log_error("client_suspend ci %d is free", ci); rv = -1; goto out; } if (cl->suspend) { /* should never happen */ log_error("client_suspend ci %d is suspended", ci); rv = -1; goto out; } cl->suspend = 1; /* make poll() ignore this connection */ pollfd[ci].fd = -1; pollfd[ci].events = 0; out: pthread_mutex_unlock(&cl->mutex); return rv; } void client_resume(int ci); void client_resume(int ci) { struct client *cl = &client[ci]; pthread_mutex_lock(&cl->mutex); if (!cl->used) { /* should never happen */ log_error("client_resume ci %d not used", ci); goto out; } if (cl->fd == -1) { /* should never happen */ log_error("client_resume ci %d is free", ci); goto out; } if (!cl->suspend) { /* should never happen */ log_error("client_resume ci %d not suspended", ci); goto out; } cl->suspend = 0; if (cl->need_free) { log_debug("client_resume ci %d need_free", ci); _client_free(ci); } else { /* make poll() watch this connection */ pollfd[ci].fd = cl->fd; pollfd[ci].events = POLLIN; } out: pthread_mutex_unlock(&cl->mutex); } static int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci)) { struct client *cl; int i; for (i = 0; i < client_size; i++) { cl = &client[i]; pthread_mutex_lock(&cl->mutex); if (!cl->used) { cl->used = 1; cl->fd = fd; cl->workfn = workfn; cl->deadfn = deadfn ? 
deadfn : client_free; /* make poll() watch this connection */ pollfd[i].fd = fd; pollfd[i].events = POLLIN; if (i > client_maxi) client_maxi = i; pthread_mutex_unlock(&cl->mutex); return i; } pthread_mutex_unlock(&cl->mutex); } return -1; } /* clear the unreceived portion of an aborted command */ void client_recv_all(int ci, struct sm_header *h_recv, int pos); void client_recv_all(int ci, struct sm_header *h_recv, int pos) { char trash[64]; int rem = h_recv->length - sizeof(struct sm_header) - pos; int rv, error = 0, total = 0; if (!rem) return; while (1) { rv = recv(client[ci].fd, trash, sizeof(trash), MSG_DONTWAIT); if (rv == -1) error = errno; if (rv <= 0) break; total += rv; if (total >= rem) break; } log_debug("recv_all %d,%d,%d pos %d rv %d error %d rem %d total %d", ci, client[ci].fd, client[ci].pid, pos, rv, error, rem, total); } void send_result(int fd, struct sm_header *h_recv, int result); void send_result(int fd, struct sm_header *h_recv, int result) { struct sm_header h; memcpy(&h, h_recv, sizeof(struct sm_header)); h.length = sizeof(h); h.data = result; h.data2 = 0; send(fd, &h, sizeof(h), MSG_NOSIGNAL); } void client_pid_dead(int ci); void client_pid_dead(int ci) { struct client *cl = &client[ci]; int cmd_active; int i, pid; /* cmd_acquire_thread may still be waiting for the tokens to be acquired. if it is, cl->pid_dead tells it to release them when finished. 
Similarly, cmd_release_thread, cmd_inquire_thread are accessing cl->tokens */ pthread_mutex_lock(&cl->mutex); if (!cl->used || cl->fd == -1 || cl->pid == -1) { /* should never happen */ pthread_mutex_unlock(&cl->mutex); log_error("client_pid_dead %d,%d,%d u %d a %d s %d bad state", ci, cl->fd, cl->pid, cl->used, cl->cmd_active, cl->suspend); return; } log_debug("client_pid_dead %d,%d,%d cmd_active %d suspend %d", ci, cl->fd, cl->pid, cl->cmd_active, cl->suspend); cmd_active = cl->cmd_active; pid = cl->pid; cl->pid = -1; cl->pid_dead = 1; /* when cmd_active is set and cmd_a,r,i_thread is done and takes cl->mutex to set cl->cmd_active to 0, it will see cl->pid_dead is 1 and know they need to release cl->tokens and call client_free */ /* make poll() ignore this connection */ pollfd[ci].fd = -1; pollfd[ci].events = 0; pthread_mutex_unlock(&cl->mutex); kill(pid, SIGKILL); if (cmd_active) { log_debug("client_pid_dead %d,%d,%d defer to cmd %d", ci, cl->fd, pid, cmd_active); return; } /* use async release here because this is the main thread that we don't want to block doing disk lease i/o */ pthread_mutex_lock(&cl->mutex); for (i = 0; i < SANLK_MAX_RESOURCES; i++) { if (cl->tokens[i]) { release_token_async(cl->tokens[i]); free(cl->tokens[i]); } } _client_free(ci); pthread_mutex_unlock(&cl->mutex); } /* At some point we may want to keep a record of each pid using a lockspace in the sp struct to avoid walking through each client's cl->tokens to see if it's using the lockspace. It should be the uncommon situation where a lockspace renewal fails and we need to walk through all client tokens like this. i.e. we'd probably not want to optimize for this case at the expense of the more common case where a pid exits, but we do want it to be robust. The locking is also made a bit ugly by these three routines that need to correlate which clients are using which lockspaces. 
(client_using_space, kill_pids, all_pids_dead) spaces_mutex is held when they are called, and they need to take cl->mutex. This means that cmd_acquire_thread has to lock both spaces_mutex and cl->mutex when adding new tokens to the client. (It needs to check that the lockspace for the new tokens hasn't failed while the tokens were being acquired.) In kill_pids and all_pids_dead could we check cl->pid <= 0 without taking cl->mutex, since client_pid_dead in the main thread is the only place that changes that? */ static int client_using_space(struct client *cl, struct space *sp) { struct token *token; int i, rv = 0; for (i = 0; i < SANLK_MAX_RESOURCES; i++) { token = cl->tokens[i]; if (!token) continue; if (strncmp(token->r.lockspace_name, sp->space_name, NAME_ID_SIZE)) continue; if (!cl->kill_count) log_spoke(sp, token, "client_using_space pid %d", cl->pid); if (sp->space_dead) token->flags |= T_LS_DEAD; rv = 1; } return rv; } /* TODO: try killscript first if one is provided */ static void kill_pids(struct space *sp) { struct client *cl; uint64_t now; int ci, fd, pid, sig; int do_kill; /* * all remaining pids using sp are stuck, we've made max attempts to * kill all, don't bother cycling through them */ if (sp->killing_pids > 1) return; now = monotime(); for (ci = 0; ci <= client_maxi; ci++) { do_kill = 0; cl = &client[ci]; pthread_mutex_lock(&cl->mutex); if (!cl->used) goto unlock; if (cl->pid <= 0) goto unlock; /* NB this cl may not be using sp, but trying to avoid the expensive client_using_space check */ if (cl->kill_count >= main_task.kill_count_max) goto unlock; if (cl->kill_count && (now - cl->kill_last < 1)) goto unlock; if (!client_using_space(cl, sp)) goto unlock; cl->kill_last = now; cl->kill_count++; fd = cl->fd; pid = cl->pid; if (cl->restrict & SANLK_RESTRICT_SIGKILL) sig = SIGTERM; else if (cl->restrict & SANLK_RESTRICT_SIGTERM) sig = SIGKILL; else if (cl->kill_count <= main_task.kill_count_term) sig = SIGTERM; else sig = SIGKILL; do_kill = 1; unlock: 
pthread_mutex_unlock(&cl->mutex); if (!do_kill) continue; if (cl->kill_count == main_task.kill_count_max) { log_erros(sp, "kill %d,%d,%d sig %d count %d final attempt", ci, fd, pid, sig, cl->kill_count); } else { log_space(sp, "kill %d,%d,%d sig %d count %d", ci, fd, pid, sig, cl->kill_count); } kill(pid, sig); } } static int all_pids_dead(struct space *sp) { struct client *cl; int stuck = 0, check = 0; int ci; for (ci = 0; ci <= client_maxi; ci++) { cl = &client[ci]; pthread_mutex_lock(&cl->mutex); if (!cl->used) goto unlock; if (cl->pid <= 0) goto unlock; if (!client_using_space(cl, sp)) goto unlock; if (cl->kill_count >= main_task.kill_count_max) stuck++; else check++; unlock: pthread_mutex_unlock(&cl->mutex); } if (stuck && !check && sp->killing_pids < 2) { log_erros(sp, "killing pids stuck %d", stuck); /* cause kill_pids to give up */ sp->killing_pids = 2; } if (stuck || check) return 0; log_space(sp, "used by no pids"); return 1; } static unsigned int time_diff(struct timeval *begin, struct timeval *end) { struct timeval result; timersub(end, begin, &result); return (result.tv_sec * 1000) + (result.tv_usec / 1000); } #define STANDARD_CHECK_INTERVAL 1000 /* milliseconds */ #define RECOVERY_CHECK_INTERVAL 200 /* milliseconds */ static int main_loop(void) { void (*workfn) (int ci); void (*deadfn) (int ci); struct space *sp, *safe; struct timeval now, last_check; int poll_timeout, check_interval; unsigned int ms; int i, rv, empty, check_all; char *check_buf = NULL; int check_buf_len = 0; gettimeofday(&last_check, NULL); poll_timeout = STANDARD_CHECK_INTERVAL; check_interval = STANDARD_CHECK_INTERVAL; while (1) { rv = poll(pollfd, client_maxi + 1, poll_timeout); if (rv == -1 && errno == EINTR) continue; if (rv < 0) { /* not sure */ } for (i = 0; i <= client_maxi; i++) { if (client[i].fd < 0) continue; if (pollfd[i].revents & POLLIN) { workfn = client[i].workfn; if (workfn) workfn(i); } if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) { deadfn = 
client[i].deadfn; if (deadfn) deadfn(i); } } gettimeofday(&now, NULL); ms = time_diff(&last_check, &now); if (ms < check_interval) { poll_timeout = check_interval - ms; continue; } last_check = now; check_interval = STANDARD_CHECK_INTERVAL; /* * check the condition of each lockspace, * if pids are being killed, have pids all exited? * is its host_id being renewed?, if not kill pids */ pthread_mutex_lock(&spaces_mutex); list_for_each_entry_safe(sp, safe, &spaces, list) { if (sp->killing_pids && all_pids_dead(sp)) { /* * move sp to spaces_rem so main_loop * will no longer see it. */ log_space(sp, "set thread_stop"); pthread_mutex_lock(&sp->mutex); sp->thread_stop = 1; unlink_watchdog_file(sp); pthread_mutex_unlock(&sp->mutex); list_move(&sp->list, &spaces_rem); continue; } if (sp->killing_pids) { /* * continue to kill the pids with increasing * levels of severity until they all exit */ kill_pids(sp); check_interval = RECOVERY_CHECK_INTERVAL; continue; } /* * check host_id lease renewal */ if (sp->align_size > check_buf_len) { if (check_buf) free(check_buf); check_buf_len = sp->align_size; check_buf = malloc(check_buf_len); } if (check_buf) memset(check_buf, 0, check_buf_len); check_all = 0; rv = check_our_lease(&main_task, sp, &check_all, check_buf); if (rv || sp->external_remove || (external_shutdown > 1)) { log_space(sp, "set killing_pids check %d remove %d", rv, sp->external_remove); sp->space_dead = 1; sp->killing_pids = 1; kill_pids(sp); check_interval = RECOVERY_CHECK_INTERVAL; } else if (check_all) { check_other_leases(&main_task, sp, check_buf); } } empty = list_empty(&spaces); pthread_mutex_unlock(&spaces_mutex); if (external_shutdown && empty) break; if (external_shutdown == 1) { log_debug("ignore shutdown, lockspace exists"); external_shutdown = 0; } free_lockspaces(0); gettimeofday(&now, NULL); ms = time_diff(&last_check, &now); if (ms < check_interval) poll_timeout = check_interval - ms; else poll_timeout = 1; } free_lockspaces(1); return 0; } static 
void *thread_pool_worker(void *data) { struct task task; struct cmd_args *ca; memset(&task, 0, sizeof(struct task)); setup_task_timeouts(&task, main_task.io_timeout_seconds); setup_task_aio(&task, main_task.use_aio, WORKER_AIO_CB_SIZE); snprintf(task.name, NAME_ID_SIZE, "worker%ld", (long)data); pthread_mutex_lock(&pool.mutex); while (1) { while (!pool.quit && list_empty(&pool.work_data)) { pool.free_workers++; pthread_cond_wait(&pool.cond, &pool.mutex); pool.free_workers--; } while (!list_empty(&pool.work_data)) { ca = list_first_entry(&pool.work_data, struct cmd_args, list); list_del(&ca->list); pthread_mutex_unlock(&pool.mutex); call_cmd_thread(&task, ca); free(ca); pthread_mutex_lock(&pool.mutex); } if (pool.quit) break; } pool.num_workers--; if (!pool.num_workers) pthread_cond_signal(&pool.quit_wait); pthread_mutex_unlock(&pool.mutex); close_task_aio(&task); return NULL; } static int thread_pool_add_work(struct cmd_args *ca) { pthread_t th; int rv; pthread_mutex_lock(&pool.mutex); if (pool.quit) { pthread_mutex_unlock(&pool.mutex); return -1; } list_add_tail(&ca->list, &pool.work_data); if (!pool.free_workers && pool.num_workers < pool.max_workers) { rv = pthread_create(&th, NULL, thread_pool_worker, (void *)(long)pool.num_workers); if (rv < 0) { list_del(&ca->list); pthread_mutex_unlock(&pool.mutex); return rv; } pool.num_workers++; } pthread_cond_signal(&pool.cond); pthread_mutex_unlock(&pool.mutex); return 0; } static void thread_pool_free(void) { pthread_mutex_lock(&pool.mutex); pool.quit = 1; if (pool.num_workers > 0) { pthread_cond_broadcast(&pool.cond); pthread_cond_wait(&pool.quit_wait, &pool.mutex); } pthread_mutex_unlock(&pool.mutex); } static int thread_pool_create(int min_workers, int max_workers) { pthread_t th; int i, rv; memset(&pool, 0, sizeof(pool)); INIT_LIST_HEAD(&pool.work_data); pthread_mutex_init(&pool.mutex, NULL); pthread_cond_init(&pool.cond, NULL); pthread_cond_init(&pool.quit_wait, NULL); pool.max_workers = max_workers; for (i = 0; i 
< min_workers; i++) { rv = pthread_create(&th, NULL, thread_pool_worker, (void *)(long)i); if (rv < 0) break; pool.num_workers++; } if (rv < 0) thread_pool_free(); return rv; } /* cmd comes from a transient client/fd set up just to pass the cmd, and is not being done on behalf of another registered client/fd */ static void process_cmd_thread_unregistered(int ci_in, struct sm_header *h_recv) { struct cmd_args *ca; int rv; ca = malloc(sizeof(struct cmd_args)); if (!ca) { rv = -ENOMEM; goto fail; } ca->ci_in = ci_in; memcpy(&ca->header, h_recv, sizeof(struct sm_header)); snprintf(client[ci_in].owner_name, SANLK_NAME_LEN, "cmd%d", h_recv->cmd); rv = thread_pool_add_work(ca); if (rv < 0) goto fail_free; return; fail_free: free(ca); fail: send_result(client[ci_in].fd, h_recv, rv); close(client[ci_in].fd); } /* cmd either comes from a registered client/fd, or is targeting a registered client/fd */ static void process_cmd_thread_registered(int ci_in, struct sm_header *h_recv) { struct cmd_args *ca; struct client *cl; int result = 0; int rv, i, ci_target; ca = malloc(sizeof(struct cmd_args)); if (!ca) { result = -ENOMEM; goto fail; } if (h_recv->data2 != -1) { /* lease for another registered client with pid specified by data2 */ ci_target = -1; for (i = 0; i < client_size; i++) { cl = &client[i]; pthread_mutex_lock(&cl->mutex); if (cl->pid != h_recv->data2) { pthread_mutex_unlock(&cl->mutex); continue; } ci_target = i; break; } if (ci_target < 0) { result = -ESRCH; goto fail; } } else { /* lease for this registered client */ ci_target = ci_in; cl = &client[ci_target]; pthread_mutex_lock(&cl->mutex); } if (!cl->used) { log_error("cmd %d %d,%d,%d not used", h_recv->cmd, ci_target, cl->fd, cl->pid); result = -EBUSY; goto out; } if (cl->pid <= 0) { log_error("cmd %d %d,%d,%d no pid", h_recv->cmd, ci_target, cl->fd, cl->pid); result = -EBUSY; goto out; } if (cl->pid_dead) { log_error("cmd %d %d,%d,%d pid_dead", h_recv->cmd, ci_target, cl->fd, cl->pid); result = -EBUSY; goto out; 
} if (cl->need_free) { log_error("cmd %d %d,%d,%d need_free", h_recv->cmd, ci_target, cl->fd, cl->pid); result = -EBUSY; goto out; } if (cl->kill_count) { log_error("cmd %d %d,%d,%d kill_count %d", h_recv->cmd, ci_target, cl->fd, cl->pid, cl->kill_count); result = -EBUSY; goto out; } if (cl->cmd_active) { if (com.quiet_fail && cl->cmd_active == SM_CMD_ACQUIRE) { result = -EBUSY; goto out; } log_error("cmd %d %d,%d,%d cmd_active %d", h_recv->cmd, ci_target, cl->fd, cl->pid, cl->cmd_active); result = -EBUSY; goto out; } cl->cmd_active = h_recv->cmd; /* once cmd_active is set, client_pid_dead() will not clear cl->tokens or call client_free, so it's the responsiblity of cmd_a,r,i_thread to check if pid_dead when clearing cmd_active, and doing the cleanup if pid is dead */ out: pthread_mutex_unlock(&cl->mutex); if (result < 0) goto fail; ca->ci_in = ci_in; ca->ci_target = ci_target; ca->cl_pid = cl->pid; ca->cl_fd = cl->fd; memcpy(&ca->header, h_recv, sizeof(struct sm_header)); rv = thread_pool_add_work(ca); if (rv < 0) { /* we don't have to worry about client_pid_dead having been called while mutex was unlocked with cmd_active set, because client_pid_dead is called from the main thread which is running this function */ log_error("create cmd thread failed"); pthread_mutex_lock(&cl->mutex); cl->cmd_active = 0; pthread_mutex_unlock(&cl->mutex); result = rv; goto fail; } return; fail: client_recv_all(ci_in, h_recv, 0); send_result(client[ci_in].fd, h_recv, result); client_resume(ci_in); if (ca) free(ca); } static void process_connection(int ci) { struct sm_header h; void (*deadfn)(int ci); int rv; memset(&h, 0, sizeof(h)); rv = recv(client[ci].fd, &h, sizeof(h), MSG_WAITALL); if (!rv) return; if (rv < 0) { log_error("ci %d fd %d pid %d recv errno %d", ci, client[ci].fd, client[ci].pid, errno); goto dead; } if (rv != sizeof(h)) { log_error("ci %d fd %d pid %d recv size %d", ci, client[ci].fd, client[ci].pid, rv); goto dead; } if (h.magic != SM_MAGIC) { log_error("ci %d recv 
%d magic %x vs %x", ci, rv, h.magic, SM_MAGIC); goto dead; } if (client[ci].restrict & SANLK_RESTRICT_ALL) { log_error("ci %d fd %d pid %d cmd %d restrict all", ci, client[ci].fd, client[ci].pid, h.cmd); goto dead; } client[ci].cmd_last = h.cmd; switch (h.cmd) { case SM_CMD_REGISTER: case SM_CMD_RESTRICT: case SM_CMD_SHUTDOWN: case SM_CMD_STATUS: case SM_CMD_HOST_STATUS: case SM_CMD_LOG_DUMP: call_cmd_daemon(ci, &h, client_maxi); break; case SM_CMD_ADD_LOCKSPACE: case SM_CMD_INQ_LOCKSPACE: case SM_CMD_REM_LOCKSPACE: case SM_CMD_REQUEST: case SM_CMD_EXAMINE_RESOURCE: case SM_CMD_EXAMINE_LOCKSPACE: case SM_CMD_ALIGN: case SM_CMD_INIT_LOCKSPACE: case SM_CMD_INIT_RESOURCE: rv = client_suspend(ci); if (rv < 0) return; process_cmd_thread_unregistered(ci, &h); break; case SM_CMD_ACQUIRE: case SM_CMD_RELEASE: case SM_CMD_INQUIRE: /* the main_loop needs to ignore this connection while the thread is working on it */ rv = client_suspend(ci); if (rv < 0) return; process_cmd_thread_registered(ci, &h); break; default: log_error("ci %d cmd %d unknown", ci, h.cmd); }; return; dead: deadfn = client[ci].deadfn; if (deadfn) deadfn(ci); } static void process_listener(int ci GNUC_UNUSED) { int fd; int on = 1; fd = accept(client[ci].fd, NULL, NULL); if (fd < 0) return; setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on)); client_add(fd, process_connection, NULL); } static int setup_listener(void) { struct sockaddr_un addr; int rv, fd, ci; rv = sanlock_socket_address(&addr); if (rv < 0) return rv; fd = socket(AF_LOCAL, SOCK_STREAM, 0); if (fd < 0) return fd; unlink(addr.sun_path); rv = bind(fd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); if (rv < 0) goto exit_fail; rv = chmod(addr.sun_path, DEFAULT_SOCKET_MODE); if (rv < 0) goto exit_fail; rv = chown(addr.sun_path, com.uid, com.gid); if (rv < 0) { log_error("could not set socket %s permissions: %s", addr.sun_path, strerror(errno)); goto exit_fail; } rv = listen(fd, 5); if (rv < 0) goto exit_fail; fcntl(fd, F_SETFL, 
fcntl(fd, F_GETFL, 0) | O_NONBLOCK); ci = client_add(fd, process_listener, NULL); if (ci < 0) goto exit_fail; strcpy(client[ci].owner_name, "listener"); return 0; exit_fail: close(fd); return -1; } static void sigterm_handler(int sig GNUC_UNUSED) { external_shutdown = 1; } static void setup_priority(void) { struct sched_param sched_param; int rv; if (!com.high_priority) return; rv = mlockall(MCL_CURRENT | MCL_FUTURE); if (rv < 0) { log_error("mlockall failed: %s", strerror(errno)); } rv = sched_get_priority_max(SCHED_RR); if (rv < 0) { log_error("could not get max scheduler priority err %d", errno); return; } sched_param.sched_priority = rv; rv = sched_setscheduler(0, SCHED_RR|SCHED_RESET_ON_FORK, &sched_param); if (rv < 0) { log_error("set scheduler RR|RESET_ON_FORK priority %d failed: %s", sched_param.sched_priority, strerror(errno)); } } /* return a random int between a and b inclusive */ int get_rand(int a, int b); int get_rand(int a, int b) { int32_t val; int rv; pthread_mutex_lock(&rand_mutex); rv = random_r(&rand_data, &val); pthread_mutex_unlock(&rand_mutex); if (rv < 0) return rv; return a + (int) (((float)(b - a + 1)) * val / (RAND_MAX+1.0)); } static void setup_host_name(void) { struct utsname name; char uuid[37]; uuid_t uu; memset(rand_state, 0, sizeof(rand_state)); memset(&rand_data, 0, sizeof(rand_data)); initstate_r(time(NULL), rand_state, sizeof(rand_state), &rand_data); /* use host name from command line */ if (com.our_host_name[0]) { memcpy(our_host_name_global, com.our_host_name, SANLK_NAME_LEN); return; } /* make up something that's likely to be different among hosts */ memset(&our_host_name_global, 0, sizeof(our_host_name_global)); memset(&name, 0, sizeof(name)); memset(&uuid, 0, sizeof(uuid)); uname(&name); uuid_generate(uu); uuid_unparse_lower(uu, uuid); snprintf(our_host_name_global, NAME_ID_SIZE, "%s.%s", uuid, name.nodename); } static int do_daemon(void) { struct sigaction act; int fd, rv; /* TODO: copy comprehensive daemonization method 
from libvirtd */ if (!com.debug) { if (daemon(0, 0) < 0) { log_tool("cannot fork daemon\n"); exit(EXIT_FAILURE); } umask(0); } /* main task never does disk io, so we don't really need to set * it up, but other tasks get their use_aio value by copying * the main_task settings */ sprintf(main_task.name, "%s", "main"); setup_task_timeouts(&main_task, com.io_timeout_arg); setup_task_aio(&main_task, com.aio_arg, 0); rv = client_alloc(); if (rv < 0) return rv; memset(&act, 0, sizeof(act)); act.sa_handler = sigterm_handler; rv = sigaction(SIGTERM, &act, NULL); if (rv < 0) return rv; fd = lockfile(SANLK_RUN_DIR, SANLK_LOCKFILE_NAME); if (fd < 0) return fd; setup_logging(); setup_host_name(); log_error("sanlock daemon started %s aio %d %d renew %d %d host %s time %llu", RELEASE_VERSION, main_task.use_aio, main_task.io_timeout_seconds, main_task.id_renewal_seconds, main_task.id_renewal_fail_seconds, our_host_name_global, (unsigned long long)time(NULL)); setup_priority(); rv = thread_pool_create(DEFAULT_MIN_WORKER_THREADS, com.max_worker_threads); if (rv < 0) goto out_logging; rv = setup_watchdog(); if (rv < 0) goto out_threads; rv = setup_listener(); if (rv < 0) goto out_threads; setup_token_manager(); if (rv < 0) goto out_threads; main_loop(); close_token_manager(); close_watchdog(); out_threads: thread_pool_free(); out_logging: close_logging(); unlink_lockfile(fd, SANLK_RUN_DIR, SANLK_LOCKFILE_NAME); return rv; } static int user_to_uid(char *arg) { struct passwd *pw; pw = getpwnam(arg); if (pw == NULL) { log_error("user '%s' not found, " "using uid: %i", arg, DEFAULT_SOCKET_UID); return DEFAULT_SOCKET_UID; } return pw->pw_uid; } static int group_to_gid(char *arg) { struct group *gr; gr = getgrnam(arg); if (gr == NULL) { log_error("group '%s' not found, " "using uid: %i", arg, DEFAULT_SOCKET_GID); return DEFAULT_SOCKET_GID; } return gr->gr_gid; } static int parse_arg_lockspace(char *arg) { sanlock_str_to_lockspace(arg, &com.lockspace); log_debug("lockspace %s host_id %llu 
path %s offset %llu", com.lockspace.name, (unsigned long long)com.lockspace.host_id, com.lockspace.host_id_disk.path, (unsigned long long)com.lockspace.host_id_disk.offset); return 0; } static int parse_arg_resource(char *arg) { struct sanlk_resource *res; int rv, i; if (com.res_count >= SANLK_MAX_RESOURCES) { log_tool("resource args over max %d", SANLK_MAX_RESOURCES); return -1; } rv = sanlock_str_to_res(arg, &res); if (rv < 0) { log_tool("resource arg parse error %d\n", rv); return rv; } com.res_args[com.res_count] = res; com.res_count++; log_debug("resource %s %s num_disks %d flags %x lver %llu", res->lockspace_name, res->name, res->num_disks, res->flags, (unsigned long long)res->lver); for (i = 0; i < res->num_disks; i++) { log_debug("resource disk %s %llu", res->disks[i].path, (unsigned long long)res->disks[i].offset); } return 0; } /* * daemon: acquires leases for the local host_id, associates them with a local * pid, and releases them when the associated pid exits. * * client: ask daemon to acquire/release leases associated with a given pid. * * direct: acquires and releases leases directly for the local host_id by * reading and writing storage directly. 
*/ static void print_usage(void) { printf("Usage:\n"); printf("sanlock ...\n\n"); printf("commands:\n"); printf(" daemon start daemon\n"); printf(" client send request to daemon (default type if none given)\n"); printf(" direct access storage directly (no coordination with daemon)\n"); printf(" help print this usage (defaults in parens)\n"); printf(" version print version\n"); printf("\n"); printf("sanlock daemon [options]\n"); printf(" -D no fork and print all logging to stderr\n"); printf(" -Q 0|1 quiet error messages for common lock contention (0)\n"); printf(" -R 0|1 renewal debugging, log debug info about renewals (0)\n"); printf(" -L write logging at priority level and up to logfile (3 LOG_ERR))\n"); printf(" (use -1 for none)\n"); printf(" -S write logging at priority level and up to syslog (3 LOG_ERR)\n"); printf(" (use -1 for none)\n"); printf(" -U user id\n"); printf(" -G group id\n"); printf(" -t max worker threads (%d)\n", DEFAULT_MAX_WORKER_THREADS); printf(" -w 0|1 use watchdog through wdmd (%d)\n", DEFAULT_USE_WATCHDOG); printf(" -h 0|1 use high priority features (%d)\n", DEFAULT_HIGH_PRIORITY); printf(" (realtime scheduling, mlockall)\n"); printf(" -a 0|1 use async io (%d)\n", DEFAULT_USE_AIO); printf(" -o 0|1 io timeout in seconds (%d)\n", DEFAULT_IO_TIMEOUT); printf("\n"); printf("sanlock client [options]\n"); printf("sanlock client status [-D] [-o p|s]\n"); printf("sanlock client host_status -s LOCKSPACE [-D]\n"); printf("sanlock client log_dump\n"); printf("sanlock client shutdown [-f 0|1]\n"); printf("sanlock client init -s LOCKSPACE | -r RESOURCE\n"); printf("sanlock client align -s LOCKSPACE\n"); printf("sanlock client add_lockspace -s LOCKSPACE\n"); printf("sanlock client inq_lockspace -s LOCKSPACE\n"); printf("sanlock client rem_lockspace -s LOCKSPACE\n"); printf("sanlock client command -r RESOURCE -c \n"); printf("sanlock client acquire -r RESOURCE -p \n"); printf("sanlock client release -r RESOURCE -p \n"); printf("sanlock client inquire 
-p \n"); printf("sanlock client request -r RESOURCE -f \n"); printf("sanlock client examine -r RESOURCE | -s LOCKSPACE\n"); printf("\n"); printf("sanlock direct [-a 0|1] [-o 0|1]\n"); printf("sanlock direct init -s LOCKSPACE | -r RESOURCE\n"); printf("sanlock direct read_leader -s LOCKSPACE | -r RESOURCE\n"); printf("sanlock direct read_id -s LOCKSPACE\n"); printf("sanlock direct live_id -s LOCKSPACE\n"); printf("sanlock direct dump [:]\n"); printf("\n"); printf("LOCKSPACE = :::\n"); printf(" name of lockspace\n"); printf(" local host identifier in lockspace\n"); printf(" disk to storage reserved for leases\n"); printf(" offset on path (bytes)\n"); printf("\n"); printf("RESOURCE = :::[:]\n"); printf(" name of lockspace\n"); printf(" name of resource\n"); printf(" disk to storage reserved for leases\n"); printf(" offset on path (bytes)\n"); printf(" optional leader version or SH for shared lease\n"); printf("\n"); printf("Limits:\n"); printf("offset alignment with 512 byte sectors: %d (1MB)\n", 1024 * 1024); printf("offset alignment with 4096 byte sectors: %d (8MB)\n", 1024 * 1024 * 8); printf("maximum name length for lockspaces and resources: %d\n", SANLK_NAME_LEN); printf("maximum path length: %d\n", SANLK_PATH_LEN); printf("maximum host_id: %d\n", DEFAULT_MAX_HOSTS); printf("maximum client process connections: 1000\n"); /* NALLOC */ printf("\n"); } static int read_command_line(int argc, char *argv[]) { char optchar; char *optionarg; char *p; char *arg1 = argv[1]; char *act; int i, j, len, begin_command = 0; if (argc < 2 || !strcmp(arg1, "help") || !strcmp(arg1, "--help") || !strcmp(arg1, "-h")) { print_usage(); exit(EXIT_SUCCESS); } if (!strcmp(arg1, "version") || !strcmp(arg1, "--version") || !strcmp(arg1, "-V")) { printf("%s %s (built %s %s)\n", argv[0], RELEASE_VERSION, __DATE__, __TIME__); exit(EXIT_SUCCESS); } if (!strcmp(arg1, "daemon")) { com.type = COM_DAEMON; i = 2; } else if (!strcmp(arg1, "direct")) { com.type = COM_DIRECT; if (argc < 3) { 
print_usage(); exit(EXIT_FAILURE); } act = argv[2]; i = 3; } else if (!strcmp(arg1, "client")) { com.type = COM_CLIENT; if (argc < 3) { print_usage(); exit(EXIT_FAILURE); } act = argv[2]; i = 3; } else { com.type = COM_CLIENT; act = argv[1]; i = 2; } switch (com.type) { case COM_DAEMON: break; case COM_CLIENT: if (!strcmp(act, "status")) com.action = ACT_STATUS; else if (!strcmp(act, "host_status")) com.action = ACT_HOST_STATUS; else if (!strcmp(act, "log_dump")) com.action = ACT_LOG_DUMP; else if (!strcmp(act, "shutdown")) com.action = ACT_SHUTDOWN; else if (!strcmp(act, "add_lockspace")) com.action = ACT_ADD_LOCKSPACE; else if (!strcmp(act, "inq_lockspace")) com.action = ACT_INQ_LOCKSPACE; else if (!strcmp(act, "rem_lockspace")) com.action = ACT_REM_LOCKSPACE; else if (!strcmp(act, "command")) com.action = ACT_COMMAND; else if (!strcmp(act, "acquire")) com.action = ACT_ACQUIRE; else if (!strcmp(act, "release")) com.action = ACT_RELEASE; else if (!strcmp(act, "inquire")) com.action = ACT_INQUIRE; else if (!strcmp(act, "request")) com.action = ACT_REQUEST; else if (!strcmp(act, "examine")) com.action = ACT_EXAMINE; else if (!strcmp(act, "align")) com.action = ACT_CLIENT_ALIGN; else if (!strcmp(act, "init")) com.action = ACT_CLIENT_INIT; else { log_tool("client action \"%s\" is unknown", act); exit(EXIT_FAILURE); } break; case COM_DIRECT: if (!strcmp(act, "init")) com.action = ACT_DIRECT_INIT; else if (!strcmp(act, "dump")) com.action = ACT_DUMP; else if (!strcmp(act, "read_leader")) com.action = ACT_READ_LEADER; else if (!strcmp(act, "acquire")) com.action = ACT_ACQUIRE; else if (!strcmp(act, "release")) com.action = ACT_RELEASE; else if (!strcmp(act, "acquire_id")) com.action = ACT_ACQUIRE_ID; else if (!strcmp(act, "release_id")) com.action = ACT_RELEASE_ID; else if (!strcmp(act, "renew_id")) com.action = ACT_RENEW_ID; else if (!strcmp(act, "read_id")) com.action = ACT_READ_ID; else if (!strcmp(act, "live_id")) com.action = ACT_LIVE_ID; else { log_tool("direct 
action \"%s\" is unknown", act); exit(EXIT_FAILURE); } break; }; /* the only action that has an option without dash-letter prefix */ if (com.action == ACT_DUMP) { if (argc < 4) exit(EXIT_FAILURE); optionarg = argv[i++]; com.dump_path = strdup(optionarg); } for (; i < argc; ) { p = argv[i]; if ((p[0] != '-') || (strlen(p) != 2)) { log_tool("unknown option %s", p); log_tool("space required before option value"); exit(EXIT_FAILURE); } optchar = p[1]; i++; /* the only option that does not have optionarg */ if (optchar == 'D') { com.debug = 1; log_stderr_priority = LOG_DEBUG; continue; } if (i >= argc) { log_tool("option '%c' requires arg", optchar); exit(EXIT_FAILURE); } optionarg = argv[i]; switch (optchar) { case 'Q': com.quiet_fail = atoi(optionarg); break; case 'R': com.debug_renew = atoi(optionarg); break; case 'L': log_logfile_priority = atoi(optionarg); break; case 'S': log_syslog_priority = atoi(optionarg); break; case 'a': com.aio_arg = atoi(optionarg); if (com.aio_arg && com.aio_arg != 1) com.aio_arg = 1; break; case 't': com.max_worker_threads = atoi(optionarg); if (com.max_worker_threads < DEFAULT_MIN_WORKER_THREADS) com.max_worker_threads = DEFAULT_MIN_WORKER_THREADS; break; case 'w': com.use_watchdog = atoi(optionarg); break; case 'h': com.high_priority = atoi(optionarg); break; case 'o': if (com.action == ACT_STATUS) { com.sort_arg = *optionarg; } else { com.io_timeout_arg = atoi(optionarg); if (!com.io_timeout_arg) com.io_timeout_arg = DEFAULT_IO_TIMEOUT; } break; case 'n': com.num_hosts = atoi(optionarg); break; case 'm': com.max_hosts = atoi(optionarg); break; case 'p': com.pid = atoi(optionarg); break; case 'e': strncpy(com.our_host_name, optionarg, NAME_ID_SIZE); break; case 'i': com.local_host_id = atoll(optionarg); break; case 'g': com.local_host_generation = atoll(optionarg); break; case 'f': com.force_mode = strtoul(optionarg, NULL, 0); break; case 's': parse_arg_lockspace(optionarg); /* com.lockspace */ break; case 'r': 
parse_arg_resource(optionarg); /* com.res_args[] */ break; case 'U': com.uid = user_to_uid(optionarg); break; case 'G': com.gid = group_to_gid(optionarg); break; case 'c': begin_command = 1; break; default: log_tool("unknown option: %c", optchar); exit(EXIT_FAILURE); }; if (begin_command) break; i++; } /* * the remaining args are for the command * * sanlock -r foo -n 2 -d bar:0 -c /bin/cmd -X -Y -Z * argc = 12 * loop above breaks with i = 8, argv[8] = "/bin/cmd" * * cmd_argc = 4 = argc (12) - i (8) * cmd_argv[0] = "/bin/cmd" * cmd_argv[1] = "-X" * cmd_argv[2] = "-Y" * cmd_argv[3] = "-Z" * cmd_argv[4] = NULL (required by execv) */ if (begin_command) { cmd_argc = argc - i; if (cmd_argc < 1) { log_tool("command option (-c) requires an arg"); return -EINVAL; } len = (cmd_argc + 1) * sizeof(char *); /* +1 for final NULL */ cmd_argv = malloc(len); if (!cmd_argv) return -ENOMEM; memset(cmd_argv, 0, len); for (j = 0; j < cmd_argc; j++) { cmd_argv[j] = strdup(argv[i++]); if (!cmd_argv[j]) return -ENOMEM; } strncpy(command, cmd_argv[0], COMMAND_MAX - 1); } return 0; } static int do_client(void) { struct sanlk_resource **res_args = NULL; struct sanlk_resource *res; char *res_state = NULL; int i, fd, rv = 0; if (com.action == ACT_COMMAND || com.action == ACT_ACQUIRE) { if (com.num_hosts) { for (i = 0; i < com.res_count; i++) { res = com.res_args[i]; res->flags |= SANLK_RES_NUM_HOSTS; res->data32 = com.num_hosts; } } } switch (com.action) { case ACT_STATUS: rv = sanlock_status(com.debug, com.sort_arg); break; case ACT_HOST_STATUS: rv = sanlock_host_status(com.debug, com.lockspace.name); break; case ACT_LOG_DUMP: rv = sanlock_log_dump(LOG_DUMP_SIZE); break; case ACT_SHUTDOWN: log_tool("shutdown"); rv = sanlock_shutdown(com.force_mode); log_tool("shutdown done %d", rv); break; case ACT_COMMAND: log_tool("register"); fd = sanlock_register(); log_tool("register done %d", fd); if (fd < 0) goto out; log_tool("acquire fd %d", fd); rv = sanlock_acquire(fd, -1, 0, com.res_count, 
com.res_args, NULL); log_tool("acquire done %d", rv); if (rv < 0) goto out; if (!command[0]) { while (1) sleep(10); } execv(command, cmd_argv); perror("execv failed"); /* release happens automatically when pid exits and daemon detects POLLHUP on registered connection */ break; case ACT_ADD_LOCKSPACE: log_tool("add_lockspace"); rv = sanlock_add_lockspace(&com.lockspace, 0); log_tool("add_lockspace done %d", rv); break; case ACT_INQ_LOCKSPACE: log_tool("inq_lockspace"); rv = sanlock_inq_lockspace(&com.lockspace, 0); log_tool("inq_lockspace done %d", rv); break; case ACT_REM_LOCKSPACE: log_tool("rem_lockspace"); rv = sanlock_rem_lockspace(&com.lockspace, 0); log_tool("rem_lockspace done %d", rv); break; case ACT_ACQUIRE: log_tool("acquire pid %d", com.pid); rv = sanlock_acquire(-1, com.pid, 0, com.res_count, com.res_args, NULL); log_tool("acquire done %d", rv); break; case ACT_RELEASE: log_tool("release pid %d", com.pid); rv = sanlock_release(-1, com.pid, 0, com.res_count, com.res_args); log_tool("release done %d", rv); break; case ACT_INQUIRE: log_tool("inquire pid %d", com.pid); rv = sanlock_inquire(-1, com.pid, 0, &com.res_count, &res_state); log_tool("inquire done %d res_count %d", rv, com.res_count); if (rv < 0) break; log_tool("\"%s\"", res_state); if (!com.debug) break; com.res_count = 0; rv = sanlock_state_to_args(res_state, &com.res_count, &res_args); log_tool("\nstate_to_args done %d res_count %d", rv, com.res_count); if (rv < 0) break; free(res_state); res_state = NULL; for (i = 0; i < com.res_count; i++) { res = res_args[i]; log_tool("\"%s:%s:%s:%llu:%llu\"", res->lockspace_name, res->name, res->disks[0].path, (unsigned long long)res->disks[0].offset, (unsigned long long)res->lver); } rv = sanlock_args_to_state(com.res_count, res_args, &res_state); log_tool("\nargs_to_state done %d", rv); if (rv < 0) break; log_tool("\"%s\"", res_state); break; case ACT_REQUEST: log_tool("request"); rv = sanlock_request(0, com.force_mode, com.res_args[0]); 
log_tool("request done %d", rv); break; case ACT_EXAMINE: log_tool("examine"); if (com.lockspace.host_id_disk.path[0]) rv = sanlock_examine(0, &com.lockspace, NULL); else rv = sanlock_examine(0, NULL, com.res_args[0]); log_tool("examine done %d", rv); break; case ACT_CLIENT_ALIGN: log_tool("align"); rv = sanlock_align(&com.lockspace.host_id_disk); log_tool("align done %d", rv); break; case ACT_CLIENT_INIT: log_tool("init"); if (com.lockspace.host_id_disk.path[0]) rv = sanlock_init(&com.lockspace, NULL, com.max_hosts, com.num_hosts); else rv = sanlock_init(NULL, com.res_args[0], com.max_hosts, com.num_hosts); log_tool("init done %d", rv); break; default: log_tool("action not implemented"); rv = -1; } out: return rv; } static int do_direct(void) { struct leader_record leader; uint64_t timestamp, owner_id, owner_generation; int live; int rv; setup_task_timeouts(&main_task, com.io_timeout_arg); setup_task_aio(&main_task, com.aio_arg, DIRECT_AIO_CB_SIZE); sprintf(main_task.name, "%s", "main_direct"); switch (com.action) { case ACT_DIRECT_INIT: rv = direct_init(&main_task, &com.lockspace, com.res_args[0], com.max_hosts, com.num_hosts); log_tool("init done %d", rv); break; case ACT_DUMP: rv = direct_dump(&main_task, com.dump_path, com.force_mode); break; case ACT_READ_LEADER: rv = direct_read_leader(&main_task, &com.lockspace, com.res_args[0], &leader); log_tool("read_leader done %d", rv); log_tool("magic 0x%0x", leader.magic); log_tool("version 0x%x", leader.version); log_tool("flags 0x%x", leader.flags); log_tool("sector_size %u", leader.sector_size); log_tool("num_hosts %llu", (unsigned long long)leader.num_hosts); log_tool("max_hosts %llu", (unsigned long long)leader.max_hosts); log_tool("owner_id %llu", (unsigned long long)leader.owner_id); log_tool("owner_generation %llu", (unsigned long long)leader.owner_generation); log_tool("lver %llu", (unsigned long long)leader.lver); log_tool("space_name %.48s", leader.space_name); log_tool("resource_name %.48s", 
leader.resource_name); log_tool("timestamp %llu", (unsigned long long)leader.timestamp); log_tool("checksum 0x%0x", leader.checksum); log_tool("write_id %llu", (unsigned long long)leader.write_id); log_tool("write_generation %llu", (unsigned long long)leader.write_generation); log_tool("write_timestamp %llu", (unsigned long long)leader.write_timestamp); break; case ACT_ACQUIRE: rv = direct_acquire(&main_task, com.res_args[0], com.num_hosts, com.local_host_id, com.local_host_generation, &leader); log_tool("acquire done %d", rv); break; case ACT_RELEASE: rv = direct_release(&main_task, com.res_args[0], &leader); log_tool("release done %d", rv); break; case ACT_ACQUIRE_ID: setup_host_name(); rv = direct_acquire_id(&main_task, &com.lockspace, our_host_name_global); log_tool("acquire_id done %d", rv); break; case ACT_RELEASE_ID: rv = direct_release_id(&main_task, &com.lockspace); log_tool("release_id done %d", rv); break; case ACT_RENEW_ID: rv = direct_renew_id(&main_task, &com.lockspace); log_tool("rewew_id done %d", rv); break; case ACT_READ_ID: rv = direct_read_id(&main_task, &com.lockspace, ×tamp, &owner_id, &owner_generation); log_tool("read_id done %d timestamp %llu owner_id %llu owner_generation %llu", rv, (unsigned long long)timestamp, (unsigned long long)owner_id, (unsigned long long)owner_generation); break; case ACT_LIVE_ID: rv = direct_live_id(&main_task, &com.lockspace, ×tamp, &owner_id, &owner_generation, &live); log_tool("live_id done %d live %d timestamp %llu owner_id %llu owner_generation %llu", rv, live, (unsigned long long)timestamp, (unsigned long long)owner_id, (unsigned long long)owner_generation); break; default: log_tool("direct action %d not known", com.action); rv = -1; } close_task_aio(&main_task); return rv; } int main(int argc, char *argv[]) { int rv; BUILD_BUG_ON(sizeof(struct sanlk_disk) != sizeof(struct sync_disk)); BUILD_BUG_ON(sizeof(struct leader_record) > LEADER_RECORD_MAX); /* initialize global variables */ 
pthread_mutex_init(&spaces_mutex, NULL); INIT_LIST_HEAD(&spaces); INIT_LIST_HEAD(&spaces_rem); INIT_LIST_HEAD(&spaces_add); memset(&com, 0, sizeof(com)); com.use_watchdog = DEFAULT_USE_WATCHDOG; com.high_priority = DEFAULT_HIGH_PRIORITY; com.max_worker_threads = DEFAULT_MAX_WORKER_THREADS; com.io_timeout_arg = DEFAULT_IO_TIMEOUT; com.aio_arg = DEFAULT_USE_AIO; com.uid = DEFAULT_SOCKET_UID; com.gid = DEFAULT_SOCKET_GID; com.pid = -1; com.sh_retries = DEFAULT_SH_RETRIES; memset(&main_task, 0, sizeof(main_task)); rv = read_command_line(argc, argv); if (rv < 0) goto out; switch (com.type) { case COM_DAEMON: rv = do_daemon(); break; case COM_CLIENT: rv = do_client(); break; case COM_DIRECT: rv = do_direct(); break; }; out: return rv; } sanlock-2.2/src/delta_lease.h0000644000175100017510000000317511751766670015233 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #ifndef __DELTA_LEASE_H__ #define __DELTA_LEASE_H__ int delta_lease_leader_read(struct task *task, struct sync_disk *disk, char *space_name, uint64_t host_id, struct leader_record *leader_ret, const char *caller); int delta_lease_acquire(struct task *task, struct space *sp, struct sync_disk *disk, char *space_name, char *our_host_name, uint64_t host_id, struct leader_record *leader_ret); int delta_lease_renew(struct task *task, struct space *sp, struct sync_disk *disk, char *space_name, char *bitmap, int prev_result, int *read_result, struct leader_record *leader_last, struct leader_record *leader_ret); int delta_lease_release(struct task *task, struct space *sp, struct sync_disk *disk, char *space_name GNUC_UNUSED, struct leader_record *leader_last, struct leader_record *leader_ret); int delta_lease_init(struct task *task, struct sync_disk *disk, char *space_name, int max_hosts); #endif sanlock-2.2/src/logrotate.sanlock0000644000175100017510000000025711751766670016172 0ustar weberweber/var/log/sanlock.log { rotate 3 missingok copytruncate size 10M compress compresscmd /usr/bin/xz uncompresscmd /usr/bin/unxz compressext .xz } sanlock-2.2/src/sanlock_rv.h0000644000175100017510000000337011751766670015127 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
*/ #ifndef __SANLOCK_RV_H__ #define __SANLOCK_RV_H__ #define SANLK_OK 1 #define SANLK_NONE 0 /* unused */ #define SANLK_ERROR -201 #define SANLK_AIO_TIMEOUT -202 /* run_ballot */ #define SANLK_DBLOCK_READ -210 #define SANLK_DBLOCK_WRITE -211 #define SANLK_DBLOCK_LVER -212 #define SANLK_DBLOCK_MBAL -213 #define SANLK_DBLOCK_CHECKSUM -214 /* verify_leader, leader_read, leader_write (paxos or delta) (when adding to list, check if it should be a corrupt_result()) */ #define SANLK_LEADER_READ -220 #define SANLK_LEADER_WRITE -221 #define SANLK_LEADER_DIFF -222 #define SANLK_LEADER_MAGIC -223 #define SANLK_LEADER_VERSION -224 #define SANLK_LEADER_SECTORSIZE -225 #define SANLK_LEADER_LOCKSPACE -226 #define SANLK_LEADER_RESOURCE -227 #define SANLK_LEADER_NUMHOSTS -228 #define SANLK_LEADER_CHECKSUM -229 /* paxos_lease_acquire, paxos_lease_release */ #define SANLK_ACQUIRE_LVER -240 #define SANLK_ACQUIRE_LOCKSPACE -241 #define SANLK_ACQUIRE_IDDISK -242 #define SANLK_ACQUIRE_IDLIVE -243 #define SANLK_ACQUIRE_OWNED -244 #define SANLK_ACQUIRE_OTHER -245 #define SANLK_ACQUIRE_SHRETRY -246 #define SANLK_RELEASE_LVER -250 #define SANLK_RELEASE_OWNER -251 /* delta_lease_renew, delta_lease_acquire */ #define SANLK_RENEW_OWNER -260 #define SANLK_RENEW_DIFF -261 #define SANLK_HOSTID_BUSY -262 /* request_token */ #define SANLK_REQUEST_MAGIC -270 #define SANLK_REQUEST_VERSION -271 #define SANLK_REQUEST_OLD -272 #define SANLK_REQUEST_LVER -273 #endif sanlock-2.2/src/direct.h0000644000175100017510000000356711751766670014250 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #ifndef __DIRECT_H__ #define __DIRECT_H__ int direct_acquire(struct task *task, struct sanlk_resource *res, int num_hosts, uint64_t local_host_id, uint64_t local_host_generation, struct leader_record *leader_ret); int direct_release(struct task *task, struct sanlk_resource *res, struct leader_record *leader_ret); int direct_acquire_id(struct task *task, struct sanlk_lockspace *ls, char *our_host_name); int direct_release_id(struct task *task, struct sanlk_lockspace *ls); int direct_renew_id(struct task *task, struct sanlk_lockspace *ls); int direct_read_id(struct task *task, struct sanlk_lockspace *ls, uint64_t *timestamp, uint64_t *owner_id, uint64_t *owner_generation); int direct_live_id(struct task *task, struct sanlk_lockspace *ls, uint64_t *timestamp, uint64_t *owner_id, uint64_t *owner_generation, int *live); int direct_align(struct sync_disk *disk); int direct_init(struct task *task, struct sanlk_lockspace *ls, struct sanlk_resource *res, int max_hosts, int num_hosts); int direct_read_leader(struct task *task, struct sanlk_lockspace *ls, struct sanlk_resource *res, struct leader_record *leader_ret); int direct_dump(struct task *task, char *dump_path, int force_mode); #endif sanlock-2.2/src/sanlock_sock.h0000644000175100017510000000325011751766670015434 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
*/ #ifndef __SANLOCK_SOCK_H__ #define __SANLOCK_SOCK_H__ #define SANLK_RUN_DIR "/var/run/sanlock" #define SANLK_SOCKET_NAME "sanlock.sock" #define SM_MAGIC 0x04282010 #define MAX_CLIENT_MSG (1024 * 1024) /* TODO: this is random */ enum { SM_CMD_REGISTER = 1, SM_CMD_ADD_LOCKSPACE = 2, SM_CMD_REM_LOCKSPACE = 3, SM_CMD_SHUTDOWN = 4, SM_CMD_STATUS = 5, SM_CMD_LOG_DUMP = 6, SM_CMD_ACQUIRE = 7, SM_CMD_RELEASE = 8, SM_CMD_INQUIRE = 9, SM_CMD_RESTRICT = 10, SM_CMD_REQUEST = 11, SM_CMD_ALIGN = 12, SM_CMD_INIT_LOCKSPACE = 13, SM_CMD_INIT_RESOURCE = 14, SM_CMD_EXAMINE_LOCKSPACE = 15, SM_CMD_EXAMINE_RESOURCE = 16, SM_CMD_HOST_STATUS = 17, SM_CMD_INQ_LOCKSPACE = 18, }; struct sm_header { uint32_t magic; uint32_t version; uint32_t cmd; /* SM_CMD_ */ uint32_t cmd_flags; uint32_t length; uint32_t seq; uint32_t data; uint32_t data2; }; #define SANLK_STATE_MAXSTR 4096 #define SANLK_STATE_DAEMON 1 #define SANLK_STATE_CLIENT 2 #define SANLK_STATE_LOCKSPACE 3 #define SANLK_STATE_RESOURCE 4 #define SANLK_STATE_HOST 5 struct sanlk_state { uint32_t type; /* SANLK_STATE_ */ uint32_t flags; uint32_t data32; /* pid (for client) */ uint64_t data64; char name[SANLK_NAME_LEN]; /* client name or resource name */ uint32_t str_len; char str[0]; /* string of internal state */ }; int sanlock_socket_address(struct sockaddr_un *addr); #endif sanlock-2.2/src/diskio.c0000644000175100017510000004417311751766670014251 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* linux aio */ #include /* posix aio */ #include "sanlock_internal.h" #include "diskio.h" #include "direct.h" #include "log.h" static int set_disk_properties(struct sync_disk *disk) { blkid_probe probe; blkid_topology topo; uint32_t sector_size, ss_logical, ss_physical; probe = blkid_new_probe_from_filename(disk->path); if (!probe) { log_error("cannot get blkid probe %s", disk->path); return -1; } topo = blkid_probe_get_topology(probe); if (!topo) { log_error("cannot get blkid topology %s", disk->path); blkid_free_probe(probe); return -1; } sector_size = blkid_probe_get_sectorsize(probe); ss_logical = blkid_topology_get_logical_sector_size(topo); ss_physical = blkid_topology_get_physical_sector_size(topo); blkid_free_probe(probe); if ((sector_size != ss_logical) || (sector_size % 512)) { log_error("invalid disk sector size %u logical %u " "physical %u %s", sector_size, ss_logical, ss_physical, disk->path); return -1; } disk->sector_size = sector_size; return 0; } void close_disks(struct sync_disk *disks, int num_disks) { int d; for (d = 0; d < num_disks; d++) { if (disks[d].fd == -1) continue; close(disks[d].fd); disks[d].fd = -1; } } int majority_disks(int num_disks, int num) { if (num_disks == 1 && !num) return 0; /* odd number of disks */ if (num_disks % 2) return num >= ((num_disks / 2) + 1); /* even number of disks */ if (num > (num_disks / 2)) return 1; if (num < (num_disks / 2)) return 0; /* TODO: half of disks are majority if tiebreaker disk is present */ return 0; } /* * set fd in each disk * returns 0 if majority of disks were opened successfully, -EXXX otherwise */ int open_disks_fd(struct sync_disk *disks, int num_disks) { struct sync_disk *disk; int num_opens = 0; int d, fd, rv = -1; for (d = 0; d < num_disks; d++) { disk = &disks[d]; if (disk->fd != -1) { log_error("open fd %d exists %s", 
disk->fd, disk->path); rv = -1; goto fail; } fd = open(disk->path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { rv = -errno; log_error("open error %d %s", fd, disk->path); continue; } disk->fd = fd; num_opens++; } if (!majority_disks(num_disks, num_opens)) { /* rv is open errno */ goto fail; } return 0; fail: close_disks(disks, num_disks); return rv; } /* * set fd and sector_size * verify offset is correctly aligned * returns 0 for success or -EXXX */ int open_disk(struct sync_disk *disk) { struct stat st; int align_size; int fd, rv; fd = open(disk->path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { rv = -errno; log_error("open error %d %s", rv, disk->path); goto fail; } if (fstat(fd, &st) < 0) { rv = -errno; log_error("fstat error %d %s", rv, disk->path); close(fd); goto fail; } if (S_ISREG(st.st_mode)) { disk->sector_size = 512; } else { rv = set_disk_properties(disk); if (rv < 0) { close(fd); goto fail; } } align_size = direct_align(disk); if (align_size < 0) { rv = align_size; close(fd); goto fail; } if (disk->offset % align_size) { rv = -EBADSLT; log_error("invalid offset %llu align size %u %s", (unsigned long long)disk->offset, align_size, disk->path); close(fd); goto fail; } disk->fd = fd; return 0; fail: if (rv >= 0) rv = -1; return rv; } /* * set fd and sector_size in each disk * verify all sector_size's match * returns 0 if majority of disks were opened successfully, -EXXX otherwise */ int open_disks(struct sync_disk *disks, int num_disks) { struct sync_disk *disk; int num_opens = 0; int d, err, rv = -1; uint32_t ss = 0; for (d = 0; d < num_disks; d++) { disk = &disks[d]; if (disk->fd != -1) { log_error("open fd %d exists %s", disk->fd, disk->path); rv = -ENOTEMPTY; goto fail; } err = open_disk(disk); if (err < 0) { rv = err; continue; } if (!ss) { ss = disk->sector_size; } else if (ss != disk->sector_size) { log_error("inconsistent sector sizes %u %u %s", ss, disk->sector_size, disk->path); goto fail; } num_opens++; } if (!majority_disks(num_disks, 
num_opens)) { /* rv is from open err */ goto fail; } return 0; fail: close_disks(disks, num_disks); return rv; } static int do_write(int fd, uint64_t offset, const char *buf, int len, struct task *task) { off_t ret; int rv; int pos = 0; if (task) task->io_count++; ret = lseek(fd, offset, SEEK_SET); if (ret != offset) return -1; retry: rv = write(fd, buf + pos, len); if (rv == -1 && errno == EINTR) goto retry; if (rv < 0) return -1; /* if (rv != len && len == sector_size) return error? partial sector writes should not happen AFAIK, and some uses depend on atomic single sector writes */ if (rv != len) { len -= rv; pos += rv; goto retry; } return 0; } static int do_read(int fd, uint64_t offset, char *buf, int len, struct task *task) { off_t ret; int rv, pos = 0; if (task) task->io_count++; ret = lseek(fd, offset, SEEK_SET); if (ret != offset) return -1; while (pos < len) { rv = read(fd, buf + pos, len - pos); if (rv == 0) return -1; if (rv == -1 && errno == EINTR) continue; if (rv < 0) return -1; pos += rv; } return 0; } static struct aicb *find_callback_slot(struct task *task) { struct timespec ts; struct io_event event; int cleared = 0; int rv; int i; find: for (i = 0; i < task->cb_size; i++) { if (task->callbacks[i].used) continue; return &task->callbacks[i]; } if (cleared++) return NULL; memset(&ts, 0, sizeof(struct timespec)); ts.tv_sec = task->io_timeout_seconds; retry: memset(&event, 0, sizeof(event)); rv = io_getevents(task->aio_ctx, 1, 1, &event, &ts); if (rv == -EINTR) goto retry; if (rv < 0) return NULL; if (rv == 1) { struct iocb *ev_iocb = event.obj; struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb); log_taske(task, "aio collect %p:%p:%p result %ld:%ld old free", ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); ev_aicb->used = 0; free(ev_aicb->buf); ev_aicb->buf = NULL; goto find; } return NULL; } /* * If this function returns SANLK_AIO_TIMEOUT, it means the io has timed out * and the event for the timed out io has not been reaped; 
the caller cannot * free the buf it passed in. It will be freed by a subsequent call when the * event is reaped. (Using my own error value here because I'm not certain * what values we might return from event.res.) */ static int do_linux_aio(int fd, uint64_t offset, char *buf, int len, struct task *task, int cmd) { struct timespec ts; struct aicb *aicb; struct iocb *iocb; struct io_event event; int rv; /* I expect this pre-emptively catches the io_submit EAGAIN case */ aicb = find_callback_slot(task); if (!aicb) return -ENOENT; iocb = &aicb->iocb; memset(iocb, 0, sizeof(struct iocb)); iocb->aio_fildes = fd; iocb->aio_lio_opcode = cmd; iocb->u.c.buf = buf; iocb->u.c.nbytes = len; iocb->u.c.offset = offset; rv = io_submit(task->aio_ctx, 1, &iocb); if (rv < 0) { log_taske(task, "aio submit %p:%p:%p rv %d fd %d cmd %d", aicb, iocb, buf, rv, fd, cmd); goto out; } task->io_count++; /* don't reuse aicb->iocb or free the buf until we reap the event */ aicb->used = 1; aicb->buf = buf; memset(&ts, 0, sizeof(struct timespec)); ts.tv_sec = task->io_timeout_seconds; retry: memset(&event, 0, sizeof(event)); rv = io_getevents(task->aio_ctx, 1, 1, &event, &ts); if (rv == -EINTR) goto retry; if (rv < 0) { log_taske(task, "aio getevent %p:%p:%p rv %d", aicb, iocb, buf, rv); goto out; } if (rv == 1) { struct iocb *ev_iocb = event.obj; struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb); ev_aicb->used = 0; if (ev_iocb != iocb) { log_taske(task, "aio collect %p:%p:%p result %ld:%ld other free", ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); free(ev_aicb->buf); ev_aicb->buf = NULL; goto retry; } if ((int)event.res < 0) { log_taske(task, "aio collect %p:%p:%p result %ld:%ld match res", ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); rv = event.res; goto out; } if (event.res != len) { log_taske(task, "aio collect %p:%p:%p result %ld:%ld match len %d", ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2, len); rv = -EMSGSIZE; goto out; } /* standard success 
case */ rv = 0; goto out; } /* Timed out waiting for result. If cancel fails, we could try retry io_getevents indefinately, but that removes the whole point of using aio, which is the timeout. So, we need to be prepared to reap the event the next time we call io_getevents for a different i/o. We can't reuse the iocb for this timed out io until we get an event for it because we need to compare the iocb to event.obj to distinguish events for separate submissions. dct: io_cancel doesn't work, in general. you are very likely going to get -EINVAL from that call */ task->to_count++; log_taske(task, "aio timeout %p:%p:%p sec %d to_count %d", aicb, iocb, buf, task->io_timeout_seconds, task->to_count); rv = io_cancel(task->aio_ctx, iocb, &event); if (!rv) { aicb->used = 0; rv = -ECANCELED; } else { /* aicb->used and aicb->buf both remain set */ rv = SANLK_AIO_TIMEOUT; if (cmd == IO_CMD_PREAD) task->read_iobuf_timeout_aicb = aicb; } out: return rv; } static int do_write_aio_linux(int fd, uint64_t offset, char *buf, int len, struct task *task) { return do_linux_aio(fd, offset, buf, len, task, IO_CMD_PWRITE); } static int do_read_aio_linux(int fd, uint64_t offset, char *buf, int len, struct task *task) { return do_linux_aio(fd, offset, buf, len, task, IO_CMD_PREAD); } static int do_write_aio_posix(int fd, uint64_t offset, char *buf, int len, struct task *task) { struct timespec ts; struct aiocb cb; struct aiocb const *p_cb; int rv; memset(&ts, 0, sizeof(struct timespec)); ts.tv_sec = task->io_timeout_seconds; memset(&cb, 0, sizeof(struct aiocb)); p_cb = &cb; cb.aio_fildes = fd; cb.aio_buf = buf; cb.aio_nbytes = len; cb.aio_offset = offset; rv = aio_write(&cb); if (rv < 0) return -1; rv = aio_suspend(&p_cb, 1, &ts); if (!rv) return 0; /* the write timed out, try to cancel it... 
*/ rv = aio_cancel(fd, &cb); if (rv < 0) return -1; if (rv == AIO_ALLDONE) return 0; if (rv == AIO_CANCELED) return -EIO; /* Functions that depend on the timeout might consider * the action failed even if it will complete if that * happened after the alloted time frame */ if (rv == AIO_NOTCANCELED) return -EIO; /* undefined error condition */ return -1; } static int do_read_aio_posix(int fd, uint64_t offset, char *buf, int len, struct task *task) { struct timespec ts; struct aiocb cb; struct aiocb const *p_cb; int rv; memset(&ts, 0, sizeof(struct timespec)); ts.tv_sec = task->io_timeout_seconds; memset(&cb, 0, sizeof(struct aiocb)); p_cb = &cb; cb.aio_fildes = fd; cb.aio_buf = buf; cb.aio_nbytes = len; cb.aio_offset = offset; rv = aio_read(&cb); if (rv < 0) return -1; rv = aio_suspend(&p_cb, 1, &ts); if (!rv) return 0; /* the read timed out, try to cancel it... */ rv = aio_cancel(fd, &cb); if (rv < 0) return -1; if (rv == AIO_ALLDONE) return 0; if (rv == AIO_CANCELED) return -EIO; if (rv == AIO_NOTCANCELED) /* Functions that depend on the timeout might consider * the action failed even if it will complete if that * happened apter the alloted time frame */ return -EIO; /* undefined error condition */ return -1; } /* write aligned io buffer */ int write_iobuf(int fd, uint64_t offset, char *iobuf, int iobuf_len, struct task *task) { if (task && task->use_aio == 1) return do_write_aio_linux(fd, offset, iobuf, iobuf_len, task); else if (task && task->use_aio == 2) return do_write_aio_posix(fd, offset, iobuf, iobuf_len, task); else return do_write(fd, offset, iobuf, iobuf_len, task); } static int _write_sectors(const struct sync_disk *disk, uint64_t sector_nr, uint32_t sector_count GNUC_UNUSED, const char *data, int data_len, int iobuf_len, struct task *task, const char *blktype) { char *iobuf, **p_iobuf; uint64_t offset; int rv; if (!disk->sector_size) return -EINVAL; offset = disk->offset + (sector_nr * disk->sector_size); p_iobuf = &iobuf; rv = posix_memalign((void 
*)p_iobuf, getpagesize(), iobuf_len); if (rv) { log_error("write_sectors %s posix_memalign rv %d %s", blktype, rv, disk->path); rv = -ENOMEM; goto out; } memset(iobuf, 0, iobuf_len); memcpy(iobuf, data, data_len); rv = write_iobuf(disk->fd, offset, iobuf, iobuf_len, task); if (rv < 0) { log_error("write_sectors %s offset %llu rv %d %s", blktype, (unsigned long long)offset, rv, disk->path); } if (rv != SANLK_AIO_TIMEOUT) free(iobuf); out: return rv; } /* sector_nr is logical sector number within the sync_disk. the sync_disk itself begins at disk->offset (in bytes) from the start of the block device identified by disk->path, data_len must be <= sector_size */ int write_sector(const struct sync_disk *disk, uint64_t sector_nr, const char *data, int data_len, struct task *task, const char *blktype) { int iobuf_len = disk->sector_size; if (data_len > iobuf_len) { log_error("write_sector %s data_len %d max %d %s", blktype, data_len, iobuf_len, disk->path); return -1; } return _write_sectors(disk, sector_nr, 1, data, data_len, iobuf_len, task, blktype); } /* write multiple complete sectors, data_len must be multiple of sector size */ int write_sectors(const struct sync_disk *disk, uint64_t sector_nr, uint32_t sector_count, const char *data, int data_len, struct task *task, const char *blktype) { int iobuf_len = data_len; if (data_len != sector_count * disk->sector_size) { log_error("write_sectors %s data_len %d sector_count %d %s", blktype, data_len, sector_count, disk->path); return -1; } return _write_sectors(disk, sector_nr, sector_count, data, data_len, iobuf_len, task, blktype); } /* read aligned io buffer */ int read_iobuf(int fd, uint64_t offset, char *iobuf, int iobuf_len, struct task *task) { if (task && task->use_aio == 1) return do_read_aio_linux(fd, offset, iobuf, iobuf_len, task); else if (task && task->use_aio == 2) return do_read_aio_posix(fd, offset, iobuf, iobuf_len, task); else return do_read(fd, offset, iobuf, iobuf_len, task); } /* read sector_count 
sectors starting
   with sector_nr, where sector_nr is a logical sector number within the
   sync_disk.
   the caller will generally want to look at the first N bytes of each
   sector.
   when reading multiple sectors, data_len will generally equal iobuf_len,
   but when reading one sector, data_len may be less than iobuf_len. */

int read_sectors(const struct sync_disk *disk, uint64_t sector_nr,
		 uint32_t sector_count, char *data, int data_len,
		 struct task *task,
		 const char *blktype)
{
	char *iobuf, **p_iobuf;
	uint64_t offset;
	int iobuf_len;
	int rv;

	if (!disk->sector_size) {
		log_error("read_sectors %s zero sector_size", blktype);
		return -EINVAL;
	}

	iobuf_len = sector_count * disk->sector_size;
	offset = disk->offset + (sector_nr * disk->sector_size);

	p_iobuf = &iobuf;

	/* io buffer must be page aligned for the io paths used here */
	rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len);
	if (rv) {
		log_error("read_sectors %s posix_memalign rv %d %s",
			  blktype, rv, disk->path);
		rv = -ENOMEM;
		goto out;
	}

	memset(iobuf, 0, iobuf_len);

	rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task);
	if (!rv) {
		/* only the first data_len bytes are copied back to the caller */
		memcpy(data, iobuf, data_len);
	} else {
		log_error("read_sectors %s offset %llu rv %d %s",
			  blktype, (unsigned long long)offset, rv, disk->path);
	}

	/* on aio timeout the io may still be in flight and could write
	 * into iobuf later, so the buffer is deliberately not freed here */
	if (rv != SANLK_AIO_TIMEOUT)
		free(iobuf);
 out:
	return rv;
}

/* Try to reap the event of a previously timed out read_iobuf.
   The aicb used in a task's last timed out read_iobuf is
   task->read_iobuf_timeout_aicb .
*/ int read_iobuf_reap(int fd, uint64_t offset, char *iobuf, int iobuf_len, struct task *task) { struct timespec ts; struct aicb *aicb; struct iocb *iocb; struct io_event event; int rv; aicb = task->read_iobuf_timeout_aicb; iocb = &aicb->iocb; if (!aicb->used) return -EINVAL; if (iocb->aio_fildes != fd) return -EINVAL; if (iocb->u.c.buf != iobuf) return -EINVAL; if (iocb->u.c.nbytes != iobuf_len) return -EINVAL; if (iocb->u.c.offset != offset) return -EINVAL; if (iocb->aio_lio_opcode != IO_CMD_PREAD) return -EINVAL; memset(&ts, 0, sizeof(struct timespec)); ts.tv_nsec = 500000000; /* half a second */ retry: memset(&event, 0, sizeof(event)); rv = io_getevents(task->aio_ctx, 1, 1, &event, &ts); if (rv == -EINTR) goto retry; if (rv < 0) { log_taske(task, "aio getevent %p:%p:%p rv %d r", aicb, iocb, iobuf, rv); goto out; } if (rv == 1) { struct iocb *ev_iocb = event.obj; struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb); ev_aicb->used = 0; if (ev_iocb != iocb) { log_taske(task, "aio collect %p:%p:%p result %ld:%ld other free r", ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); free(ev_aicb->buf); ev_aicb->buf = NULL; goto retry; } if ((int)event.res < 0) { log_taske(task, "aio collect %p:%p:%p result %ld:%ld match res r", ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); rv = event.res; goto out; } if (event.res != iobuf_len) { log_taske(task, "aio collect %p:%p:%p result %ld:%ld match len %d r", ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2, iobuf_len); rv = -EMSGSIZE; goto out; } log_taske(task, "aio collect %p:%p:%p result %ld:%ld match reap", ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); rv = 0; goto out; } /* timed out again */ rv = SANLK_AIO_TIMEOUT; out: return rv; } sanlock-2.2/src/sanlock_internal.h0000644000175100017510000004473311751766670016324 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __SANLOCK_INTERNAL_H__ #define __SANLOCK_INTERNAL_H__ #ifndef GNUC_UNUSED #define GNUC_UNUSED __attribute__((__unused__)) #endif #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) #ifndef EXTERN #define EXTERN extern #else #undef EXTERN #define EXTERN #endif #include "sanlock.h" #include "sanlock_rv.h" #include "sanlock_resource.h" #include "leader.h" #include "list.h" #include "monotime.h" #include /* default max number of hosts supported */ #define DEFAULT_MAX_HOSTS 2000 #define LOG_DUMP_SIZE (1024*1024) /* this is just the path to the executable, not full command line */ #define COMMAND_MAX 4096 #define SANLK_LOG_DIR "/var/log" #define SANLK_LOGFILE_NAME "sanlock.log" #define SANLK_LOCKFILE_NAME "sanlock.pid" #define DAEMON_NAME "sanlock" /* for paxos_lease sync_disk + offset: points to 1 leader_record + 1 request_record + MAX_HOSTS paxos_dblock's = 256 blocks = 128KB, ref: lease_item_record */ /* must mirror external sanlk_disk */ struct sync_disk { char path[SANLK_PATH_LEN]; uint64_t offset; uint32_t sector_size; /* sanlk_disk pad1 */ int fd; /* sanlk_disk pad2 */ }; /* * There are two different wrappers around a sanlk_resource: * 'struct token' keeps track of resources per-client, client.tokens[] * 'struct resource' keeps track of resources globally, resources list */ #define T_RESTRICT_SIGKILL 0x00000001 /* inherited from client->restrict */ #define T_LS_DEAD 0x00000002 /* don't bother trying to release if ls is dead */ struct token { /* values copied from acquire res arg */ uint64_t acquire_lver; uint64_t acquire_data64; uint32_t acquire_data32; uint32_t acquire_flags; /* copied from the sp with r.lockspace_name */ uint64_t host_id; uint64_t host_generation; /* internal */ struct list_head list; /* 
resource->tokens */ struct resource *resource; int pid; uint32_t flags; uint32_t token_id; /* used to refer to this token instance in log messages */ int shared_count; char shared_bitmap[HOSTID_BITMAP_SIZE]; /* bit set for host_id with SH */ struct sync_disk *disks; /* shorthand, points to r.disks[0] */ struct sanlk_resource r; }; #define R_SHARED 0x00000001 #define R_THREAD_EXAMINE 0x00000002 #define R_THREAD_RELEASE 0x00000004 #define R_RESTRICT_SIGKILL 0x00000008 /* inherited from token */ struct resource { struct list_head list; struct list_head tokens; /* only one token when ex, multiple sh */ uint64_t host_id; uint64_t host_generation; int pid; /* copied from token when ex */ uint32_t flags; uint32_t release_token_id; /* copy to temp token (tt) for log messages */ struct leader_record leader; /* copy of last leader_record we wrote */ struct sanlk_resource r; }; struct lease_status { int corrupt_result; int acquire_last_result; int renewal_last_result; uint64_t acquire_last_attempt; uint64_t acquire_last_success; uint64_t renewal_last_attempt; uint64_t renewal_last_success; uint32_t renewal_read_count; uint32_t renewal_read_check; char *renewal_read_buf; }; struct host_status { uint64_t first_check; /* local monotime */ uint64_t last_check; /* local monotime */ uint64_t last_live; /* local monotime */ uint64_t last_req; /* local monotime */ uint64_t owner_id; uint64_t owner_generation; uint64_t timestamp; /* remote monotime */ uint64_t set_bit_time; }; struct space { struct list_head list; char space_name[NAME_ID_SIZE]; uint32_t space_id; /* used to refer to this space instance in log messages */ uint64_t host_id; uint64_t host_generation; struct sync_disk host_id_disk; int align_size; int space_dead; int killing_pids; int external_remove; int thread_stop; int wd_fd; pthread_t thread; pthread_mutex_t mutex; /* protects lease_status, thread_stop */ struct lease_status lease_status; struct host_status host_status[DEFAULT_MAX_HOSTS]; }; /* * Example of watchdog 
behavior when host_id renewals fail, assuming * that sanlock cannot successfully kill the pids it is supervising that * depend on the given host_id. * * * Using these values in the example * watchdog_fire_timeout = 60 (constant) * io_timeout_seconds = 2 (defined by us) * id_renewal_seconds = 10 (defined by us) * id_renewal_fail_seconds = 30 (defined by us) * host_dead_seconds = 90 (derived below) * * (FIXME: 2/10/30 is not a combination we'd actually create, * but the example still works) * * T time in seconds * * 0: sanlock renews host_id on disk * sanlock calls wdmd_test_live(0, 30) * wdmd test_client sees now 0 < expire 30 ok * wdmd /dev/watchdog keepalive * * 10: sanlock renews host_id on disk ok * sanlock calls wdmd_test_live(10, 40) * wdmd test_client sees now 10 < expire 30 or 40 ok * wdmd /dev/watchdog keepalive * * 20: sanlock fails to renew host_id on disk * sanlock does not call wdmd_test_live * wdmd test_client sees now 20 < expire 40 ok * wdmd /dev/watchdog keepalive * * 30: sanlock fails to renew host_id on disk * sanlock does not call wdmd_test_live * wdmd test_client sees now 30 < expire 40 ok * wdmd /dev/watchdog keepalive * * 40: sanlock fails to renew host_id on disk * sanlock does not call wdmd_test_live * wdmd test_client sees now 40 >= expire 40 fail * wdmd no keepalive * * . /dev/watchdog will fire at last keepalive + watchdog_fire_timeout = * T30 + 60 = T90 * . host_id will expire at * last disk renewal ok + id_renewal_fail_seconds + watchdog_fire_timeout * T10 + 30 + 60 = T100 * (aka last disk renewal ok + host_dead_seconds) * . 
the wdmd test at T30 could have been at T39, so wdmd would have * seen the client unexpired/ok just before the expiry time at T40, * which would lead to /dev/watchdog firing at 99 instead of 90 * * 50: sanlock fails to renew host_id on disk -> does not call wdmd_test_live * wdmd test_client sees now 50 > expire 40 fail -> no keepalive * 60: sanlock fails to renew host_id on disk -> does not call wdmd_test_live * wdmd test_client sees now 60 > expire 40 fail -> no keepalive * 70: sanlock fails to renew host_id on disk -> does not call wdmd_test_live * wdmd test_client sees now 70 > expire 40 fail -> no keepalive * 80: sanlock fails to renew host_id on disk -> does not call wdmd_test_live * wdmd test_client sees now 80 > expire 40 fail -> no keepalive * 90: sanlock fails to renew host_id on disk -> does not call wdmd_test_live * wdmd test_client sees now 90 > expire 40 fail -> no keepalive * /dev/watchdog fires, machine reset * 100: another host takes over leases held by host_id * * * A more likely recovery scenario when a host_id cannot be renewed * (probably caused by loss of storage connection): * * The sanlock daemon fails to renew its host_id from T20 to T40. * At T40, after failing to renew within id_renewal_fail_seconds (30), * the sanlock daemon begins trying to kill all pids that were using * leases under this host_id. As soon as all those pids exit, the sanlock * daemon will call wdmd_test_live(0, 0) to disable the wdmd testing for * this client/host_id. If it's able to call wdmd_test_live(0, 0) before T90, * the wdmd test will no longer see this client's expiry time of 40, * so the wdmd tests will succeed, wdmd will immediately go back to * /dev/watchdog keepalive's, and the machine will not be reset. * */ /* * "delta" refers to timed based leases described in Chockler/Malkhi that * we use for host_id ownership. * * "paxos" refers to disk paxos based leases described in Lamport that * we use for resource (vm) ownership. 
* * "free" refers to a lease (either type) that is not owned by anyone * * "held" refers to a lease (either type) that was owned by a host that * failed, so it was not released/freed. . (if a renewal fails we always attempt another renewal immediately) * * "max" refers to the maximum time that a successful acquire/renew can * take, assuming that every io operation takes the max allowable time * (io_timeout_seconds) * * "min" refers to the minimum time that a successful acquire/renew can * take, assuming that every io operation completes immediately, in * effectively zero time * * * io_timeout_seconds: defined by us * * id_renewal_seconds: defined by us * * id_renewal_fail_seconds: defined by us * * watchdog_fire_timeout: /dev/watchdog will fire without being petted this long * = 60 constant * * host_dead_seconds: the length of time from the last successful host_id * renewal until that host is killed by its watchdog. * = id_renewal_fail_seconds + watchdog_fire_timeout * * delta_large_delay: from the algorithm * = id_renewal_seconds + (6 * io_timeout_seconds) * * delta_short_delay: from the algorithm * = 2 * io_timeout_seconds * * delta_acquire_held_max: max time it can take to successfully * acquire a non-free delta lease * = io_timeout_seconds (read) + * max(delta_large_delay, host_dead_seconds) + * io_timeout_seconds (read) + * io_timeout_seconds (write) + * delta_short_delay + * io_timeout_seconds (read) * * delta_acquire_held_min: min time it can take to successfully * acquire a non-free delta lease * = max(delta_large_delay, host_dead_seconds) * * delta_acquire_free_max: max time it can take to successfully * acquire a free delta lease. * = io_timeout_seconds (read) + * io_timeout_seconds (write) + * delta_short_delay + * io_timeout_seconds (read) * * delta_acquire_free_min: min time it can take to successfully * acquire a free delta lease. * = delta_short_delay * * delta_renew_max: max time it can take to successfully * renew a delta lease. 
* = io_timeout_seconds (read) + * io_timeout_seconds (write) * * delta_renew_min: min time it can take to successfully * renew a delta lease. * = 0 * * paxos_acquire_held_max: max time it can take to successfully * acquire a non-free paxos lease, uncontended. * = io_timeout_seconds (read leader) + * host_dead_seconds + * io_timeout_seconds (read leader) + * io_timeout_seconds (write dblock) + * io_timeout_seconds (read dblocks) + * io_timeout_seconds (write dblock) + * io_timeout_seconds (read dblocks) + * io_timeout_seconds (write leader) * * paxos_acquire_held_min: min time it can take to successfully * acquire a non-free paxos lease, uncontended. * = host_dead_seconds * * paxos_acquire_free_max: max time it can take to successfully * acquire a free paxos lease, uncontended. * = io_timeout_seconds (read leader) + * io_timeout_seconds (write dblock) + * io_timeout_seconds (read dblocks) + * io_timeout_seconds (write dblock) + * io_timeout_seconds (read dblocks) + * io_timeout_seconds (write leader) * * paxos_acquire_free_min: min time it can take to successfully * acquire a free paxos lease, uncontended. * = 0 * * * How to configure the combination of related timeouts defined by us: * io_timeout_seconds * id_renewal_seconds * id_renewal_fail_seconds * * Here's one approach that seems to produce sensible sets of numbers: * * io_timeout_seconds = N * . max time one io can take * * delta_renew_max = 2N * . max time one renewal can take * * id_renewal_seconds = delta_renew_max (2N) * . delay this long after renewal success before next renew attempt begins * . this will be the difference between two successive renewal timestamps * when io times are effectively 0 * . there's no particular reason for it to be 2N exactly * . if a successful renewal takes the max possible time (delta_renew_max), * then the next renewal attempt will begin right away * . 
(if a renewal fails we always attempt another renewal immediately)
 *
 * id_renewal_fail_seconds = 4 * delta_renew_max (8N)
 * . time from last successful renewal until recovery begins
 * . allows for three consecutive max len renewal failures, i.e.
 *   id_renewal_seconds + (3 * delta_renew_max)
 *
 * id_renewal_warn_seconds = 3 * delta_renew_max (6N)
 * . time from last successful renewal until warning about renewal length
 * . allows for two consecutive max len renewal failures
 *
 * T  time in seconds
 * 0  renewal ok
 * 2N renewal attempt begin
 * 4N renewal attempt fail1 (each io takes max time)
 * 4N renewal attempt begin
 * 6N renewal attempt fail2 (each io takes max time)
 * 6N renewal attempt begin
 * 8N renewal attempt fail3 (each io takes max time)
 * 8N recovery begins (pids killed)
 *
 * If ios don't take the max len (delta_renew_max), this just
 * gives us more attempts to renew before recovery begins.
 *
 * io_timeout_seconds      N    5  10  20
 * id_renewal_seconds      2N  10  20  40
 * id_renewal_fail_seconds 8N  40  80 160
 *
 *  5 sec io timeout: fast storage io perf
 * 10 sec io timeout: normal storage io perf
 * 20 sec io timeout: slow storage io perf
 *
 * [We could break down these computations further by adding a variable
 * F = number of full len renewal failures allowed before recovery
 * begins.  Above F is fixed at 3, but we may want to vary it to be
 * 2 or 4.]
* * fast norm slow * watchdog_fire_timeout 60 60 60 * * io_timeout_seconds 5 10 20 * id_renewal_seconds 10 20 40 * id_renewal_fail_seconds 40 80 160 * id_renewal_warn_seconds 30 60 120 * * host_dead_seconds 100 140 220 * delta_large_delay 40 80 160 * delta_short_delay 10 20 40 * delta_acquire_held_max 130 200 340 * delta_acquire_held_min 100 140 220 * delta_acquire_free_max 25 50 100 * delta_acquire_free_min 10 20 40 * delta_renew_max 10 20 40 * delta_renew_min 0 0 0 * paxos_acquire_held_max 135 210 360 * paxos_acquire_held_min 100 140 220 * paxos_acquire_free_max 30 60 120 * paxos_acquire_free_min 0 0 0 */ /* * Why does delta_acquire use max(delta_large_delay, host_dead_seconds) instead * of just delta_large_delay as specified in the algorithm? * * 1. the time based lease algorithm uses delta_large_delay to determine that a * host is failed, but we want to be more certain the host is dead based on its * watchdog firing, and we know the watchdog has fired after host_dead_seconds. * * 2. if a delta lease can be acquired and released (freed) before * host_dead_seconds, that could allow the paxos leases of a failed host to be * acquired by someone else before host_dead_seconds (and before the failed * host is really dead), because acquiring a held paxos lease depends on the * delta lease of the failed owner not changing for host_dead_seconds. * We cannot allow a host to acquire another failed host's paxos lease before * host_dead_seconds. * * 3. ios can't be reliably canceled and never really time out; an io is only * really dead when the machine is dead/reset or storage access is cut off. * The delta lease algorithm expects real io timeouts. * * So, the delay is really meant to represent the time until we are certain a * host is safely gone and will no longer write, and for sanlock that means * until the watchdog has reset it. 
*/ #define HOSTID_AIO_CB_SIZE 4 #define WORKER_AIO_CB_SIZE 2 #define DIRECT_AIO_CB_SIZE 1 #define RESOURCE_AIO_CB_SIZE 2 #define LIB_AIO_CB_SIZE 1 struct aicb { int used; char *buf; struct iocb iocb; }; struct task { char name[NAME_ID_SIZE+1]; /* for log messages */ int io_timeout_seconds; /* configured */ int id_renewal_seconds; /* configured */ int id_renewal_fail_seconds; /* configured */ int id_renewal_warn_seconds; /* configured */ int host_dead_seconds; /* calculated */ int request_finish_seconds; /* calculated */ int kill_count_term; /* constant */ int kill_count_max; /* constant */ unsigned int io_count; /* stats */ unsigned int to_count; /* stats */ int use_aio; int cb_size; char *iobuf; io_context_t aio_ctx; struct aicb *read_iobuf_timeout_aicb; struct aicb *callbacks; }; EXTERN struct task main_task; struct client { int used; int fd; /* unset is -1 */ int pid; /* unset is -1 */ int cmd_active; int cmd_last; int pid_dead; int suspend; int need_free; int kill_count; uint32_t restrict; uint64_t kill_last; char owner_name[SANLK_NAME_LEN+1]; pthread_mutex_t mutex; void *workfn; void *deadfn; struct token *tokens[SANLK_MAX_RESOURCES]; }; /* * client array is only touched by main_loop, there is no lock for it. 
* individual cl structs are accessed by worker threads using cl->mutex */ EXTERN struct client *client; #define WATCHDOG_FIRE_TIMEOUT 60 #define DEFAULT_USE_AIO 1 #define DEFAULT_IO_TIMEOUT 10 #define DEFAULT_USE_WATCHDOG 1 #define DEFAULT_HIGH_PRIORITY 1 #define DEFAULT_SOCKET_UID 0 #define DEFAULT_SOCKET_GID 0 #define DEFAULT_SOCKET_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) #define DEFAULT_MIN_WORKER_THREADS 2 #define DEFAULT_MAX_WORKER_THREADS 8 #define DEFAULT_SH_RETRIES 8 struct command_line { int type; /* COM_ */ int action; /* ACT_ */ int debug; int debug_renew; int quiet_fail; int use_watchdog; int high_priority; int max_worker_threads; int aio_arg; int io_timeout_arg; int uid; /* -U */ int gid; /* -G */ int pid; /* -p */ char sort_arg; uint64_t local_host_id; /* -i */ uint64_t local_host_generation; /* -g */ int num_hosts; /* -n */ int max_hosts; /* -m */ int res_count; int sh_retries; uint32_t force_mode; char our_host_name[SANLK_NAME_LEN+1]; char *dump_path; struct sanlk_lockspace lockspace; /* -s LOCKSPACE */ struct sanlk_resource *res_args[SANLK_MAX_RESOURCES]; /* -r RESOURCE */ }; EXTERN struct command_line com; /* command line types and actions */ #define COM_DAEMON 1 #define COM_CLIENT 2 #define COM_DIRECT 3 enum { ACT_STATUS = 1, ACT_HOST_STATUS, ACT_LOG_DUMP, ACT_SHUTDOWN, ACT_ADD_LOCKSPACE, ACT_INQ_LOCKSPACE, ACT_REM_LOCKSPACE, ACT_COMMAND, ACT_ACQUIRE, ACT_RELEASE, ACT_INQUIRE, ACT_REQUEST, ACT_ACQUIRE_ID, ACT_RELEASE_ID, ACT_RENEW_ID, ACT_READ_ID, ACT_LIVE_ID, ACT_DIRECT_INIT, ACT_DUMP, ACT_READ_LEADER, ACT_CLIENT_INIT, ACT_CLIENT_ALIGN, ACT_EXAMINE, }; EXTERN int external_shutdown; EXTERN char our_host_name_global[SANLK_NAME_LEN+1]; EXTERN struct list_head spaces; EXTERN struct list_head spaces_rem; EXTERN struct list_head spaces_add; EXTERN pthread_mutex_t spaces_mutex; #endif sanlock-2.2/src/lockfile.h0000644000175100017510000000067111751766670014557 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __LOCKFILE_H__ #define __LOCKFILE_H__ int lockfile(const char *dir, const char *name); void unlink_lockfile(int fd, const char *dir, const char *name); #endif sanlock-2.2/src/sanlock_direct.h0000644000175100017510000000271211751766670015751 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __SANLOCK_DIRECT_H__ #define __SANLOCK_DIRECT_H__ /* * Use io_timeout_sec = 0 for default value */ int sanlock_direct_read_id(struct sanlk_lockspace *ls, uint64_t *timestamp, uint64_t *owner_id, uint64_t *owner_generation, int use_aio, int io_timeout_sec); int sanlock_direct_live_id(struct sanlk_lockspace *ls, uint64_t *timestamp, uint64_t *owner_id, uint64_t *owner_generation, int *live, int use_aio, int io_timeout_sec); /* * Use max_hosts = 0 for default value. * Use num_hosts = 0 for default value. * Provide either lockspace or resource, not both */ int sanlock_direct_init(struct sanlk_lockspace *ls, struct sanlk_resource *res, int max_hosts, int num_hosts, int use_aio); /* * Returns the alignment in bytes required by sanlock_direct_init() * (1MB for disks with 512 sectors, 8MB for disks with 4096 sectors) */ int sanlock_direct_align(struct sanlk_disk *disk); #endif sanlock-2.2/src/diskio.h0000644000175100017510000000311011751766670014240 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #ifndef __DISKIO_H__ #define __DISKIO_H__ void close_disks(struct sync_disk *disks, int num_disks); int open_disk(struct sync_disk *disks); int open_disks(struct sync_disk *disks, int num_disks); int open_disks_fd(struct sync_disk *disks, int num_disks); int majority_disks(int num_disks, int num); /* * iobuf functions require the caller to allocate iobuf using posix_memalign * and pass it into the function */ int write_iobuf(int fd, uint64_t offset, char *iobuf, int iobuf_len, struct task *task); int read_iobuf(int fd, uint64_t offset, char *iobuf, int iobuf_len, struct task *task); int read_iobuf_reap(int fd, uint64_t offset, char *iobuf, int iobuf_len, struct task *task); /* * sector functions allocate an iobuf themselves, copy into it for read, use it * for io, copy out of it for write, and free it */ int write_sector(const struct sync_disk *disk, uint64_t sector_nr, const char *data, int data_len, struct task *task, const char *blktype); int write_sectors(const struct sync_disk *disk, uint64_t sector_nr, uint32_t sector_count, const char *data, int data_len, struct task *task, const char *blktype); int read_sectors(const struct sync_disk *disk, uint64_t sector_nr, uint32_t sector_count, char *data, int data_len, struct task *task, const char *blktype); #endif sanlock-2.2/src/delta_lease.c0000644000175100017510000004157211751766670015231 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/

/* NOTE(review): the system header names below were lost when this file
 * was extracted from the archive (each should read "#include <...>");
 * restore them from the upstream delta_lease.c before building. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "sanlock_internal.h"
#include "diskio.h"
#include "direct.h"
#include "log.h"
#include "paxos_lease.h"
#include "delta_lease.h"

/* Based on "Light-Weight Leases for Storage-Centric Coordination"
   by Gregory Chockler and Dahlia Malkhi */

/* delta_leases are a series max_hosts leader_records, one leader per sector,
   host N's delta_lease is the leader_record in sectors N-1 */

/*
 * variable names:
 * rv: success is 0, failure is < 0
 * error: success is 1 (SANLK_OK), failure is < 0
 */

/* Dump every field of a leader_record that failed verification to the
 * error log, split across four "leaderN" lines. */

static void log_leader_error(int result,
			     char *space_name,
			     uint64_t host_id,
			     struct sync_disk *disk,
			     struct leader_record *lr,
			     const char *caller)
{
	log_error("leader1 %s error %d lockspace %.48s host_id %llu",
		  caller ? caller : "unknown",
		  result,
		  space_name,
		  (unsigned long long)host_id);

	log_error("leader2 path %s offset %llu",
		  disk->path,
		  (unsigned long long)disk->offset);

	log_error("leader3 m %x v %x ss %u nh %llu mh %llu oi %llu og %llu lv %llu",
		  lr->magic, lr->version, lr->sector_size,
		  (unsigned long long)lr->num_hosts,
		  (unsigned long long)lr->max_hosts,
		  (unsigned long long)lr->owner_id,
		  (unsigned long long)lr->owner_generation,
		  (unsigned long long)lr->lver);

	log_error("leader4 sn %.48s rn %.48s ts %llu cs %x",
		  lr->space_name,
		  lr->resource_name,
		  (unsigned long long)lr->timestamp,
		  lr->checksum);
}

/* Sanity check a leader_record read from disk: magic, major version,
 * sector size, lockspace name, and checksum, in that order.  Returns
 * SANLK_OK on success or a negative SANLK_LEADER_* code for the first
 * field that does not match. */

static int verify_leader(struct sync_disk *disk,
			 char *space_name,
			 uint64_t host_id,
			 struct leader_record *lr,
			 const char *caller)
{
	uint32_t sum;
	int result;

	if (lr->magic != DELTA_DISK_MAGIC) {
		log_error("verify_leader %llu wrong magic %x %s",
			  (unsigned long long)host_id,
			  lr->magic, disk->path);
		result = SANLK_LEADER_MAGIC;
		goto fail;
	}

	/* only the major part of the version must match */
	if ((lr->version & 0xFFFF0000) != DELTA_DISK_VERSION_MAJOR) {
		log_error("verify_leader %llu wrong version %x %s",
			  (unsigned long long)host_id,
			  lr->version, disk->path);
		result = SANLK_LEADER_VERSION;
		goto fail;
	}

	if (lr->sector_size != disk->sector_size) {
		log_error("verify_leader %llu wrong sector size %d %d %s",
			  (unsigned long long)host_id,
			  lr->sector_size, disk->sector_size, disk->path);
		result = SANLK_LEADER_SECTORSIZE;
		goto fail;
	}

	if (strncmp(lr->space_name, space_name, NAME_ID_SIZE)) {
		log_error("verify_leader %llu wrong space name %.48s %.48s %s",
			  (unsigned long long)host_id,
			  lr->space_name, space_name, disk->path);
		result = SANLK_LEADER_LOCKSPACE;
		goto fail;
	}

	sum = leader_checksum(lr);

	if (lr->checksum != sum) {
		log_error("verify_leader %llu wrong checksum %x %x %s",
			  (unsigned long long)host_id,
			  lr->checksum, sum, disk->path);
		result = SANLK_LEADER_CHECKSUM;
		goto fail;
	}

	return SANLK_OK;

 fail:
	log_leader_error(result, space_name, host_id, disk, lr, caller);

	/*
	struct leader_record leader_rr;
	int rv;

	memset(&leader_rr, 0, sizeof(leader_rr));
	rv = read_sectors(disk, host_id - 1, 1,
			  (char *)&leader_rr,
			  sizeof(struct leader_record),
			  NULL, "delta_verify");

	log_leader_error(rv, space_name, host_id, disk, &leader_rr, "delta_verify");
	*/

	return result;
}

/* Read host_id's leader_record (stored in sector host_id - 1) and verify
 * it.  Returns a negative code if the read fails; otherwise returns the
 * verify_leader() result (SANLK_OK or a negative SANLK_LEADER_* code)
 * and fills in leader_ret with whatever was read. */

int delta_lease_leader_read(struct task *task, struct sync_disk *disk,
			    char *space_name, uint64_t host_id,
			    struct leader_record *leader_ret,
			    const char *caller)
{
	struct leader_record leader;
	int rv, error;

	/* host_id N is block offset N-1 */

	memset(&leader, 0, sizeof(struct leader_record));
	memset(leader_ret, 0, sizeof(struct leader_record));

	rv = read_sectors(disk, host_id - 1, 1, (char *)&leader,
			  sizeof(struct leader_record),
			  task, "delta_leader");
	if (rv < 0)
		return rv;

	error = verify_leader(disk, space_name, host_id, &leader, caller);

	memcpy(leader_ret, &leader, sizeof(struct leader_record));
	return error;
}

/*
 * delta_lease_acquire:
 * set the owner of host_id to our_host_name.
 *
 * paxos_lease_acquire:
 * set the owner of resource_name to host_id.
 *
 * our_host_name is a unique host identifier used to detect when two different
 * hosts are trying to acquire the same host_id (since both will be using the
 * same host_id, that host_id won't work to distinguish between them.)  We copy
 * our_host_name into leader.resource_name, so in a sense the owner_id and
 * resource_name fields of the leader_record switch functions: the common
 * resource is the owner_id, and the distinguishing id is the resource_name.
 */

int delta_lease_acquire(struct task *task,
			struct space *sp,
			struct sync_disk *disk,
			char *space_name,
			char *our_host_name,
			uint64_t host_id,
			struct leader_record *leader_ret)
{
	struct leader_record leader;
	struct leader_record leader1;
	uint64_t new_ts;
	int i, error, rv, delay, delta_large_delay;

	log_space(sp, "delta_acquire %llu begin", (unsigned long long)host_id);

	error = delta_lease_leader_read(task, disk, space_name, host_id, &leader,
					"delta_acquire_begin");
	if (error < 0)
		return error;

	/* a free lease can be written immediately */
	if (leader.timestamp == LEASE_FREE)
		goto write_new;

	/* our own name in resource_name means we held this host_id before,
	 * so it can be reacquired without the long wait below */
	if (!strncmp(leader.resource_name, our_host_name, NAME_ID_SIZE)) {
		log_space(sp, "delta_acquire %llu fast reacquire",
			  (unsigned long long)host_id);
		goto write_new;
	}

	/* we need to ensure that a host_id cannot be acquired and released
	 * sooner than host_dead_seconds because the change in host_id
	 * ownership affects the host_id "liveness" determination used by paxos
	 * leases, and the ownership of paxos leases cannot change until after
	 * host_dead_seconds to ensure that the watchdog has fired.  So, I
	 * think we want the delay here to be the max of host_dead_seconds and
	 * the D+6d delay.
	 *
	 * Per the algorithm in the paper, a delta lease can change ownership
	 * in the while loop below after the delta_delay of D+6d.  However,
	 * because we use the change of delta lease ownership to directly
	 * determine the change in paxos lease ownership, we need the delta
	 * delay to also meet the delay requirements of the paxos leases.  The
	 * paxos leases cannot change ownership until a min of
	 * host_dead_seconds to ensure the watchdog has fired.  So, the timeout
	 * we use here must be the max of the delta delay (D+6d) and
	 * host_dead_seconds */

	delay = task->host_dead_seconds;
	delta_large_delay = task->id_renewal_seconds + (6 * task->io_timeout_seconds);
	if (delta_large_delay > delay)
		delay = delta_large_delay;

	while (1) {
		memcpy(&leader1, &leader, sizeof(struct leader_record));

		log_space(sp, "delta_acquire %llu delta_large_delay %d delay %d",
			  (unsigned long long)host_id, delta_large_delay, delay);

		/* TODO: we could reread every several seconds to see if
		   it has changed, so we can abort more quickly if so */

		for (i = 0; i < delay; i++) {
			if (sp->external_remove || external_shutdown)
				return SANLK_ERROR;
			sleep(1);
		}

		error = delta_lease_leader_read(task, disk, space_name, host_id,
						&leader, "delta_acquire_wait");
		if (error < 0)
			return error;

		/* record unchanged for the whole delay: safe to take over */
		if (!memcmp(&leader1, &leader, sizeof(struct leader_record)))
			break;

		if (leader.timestamp == LEASE_FREE)
			break;

		/* the record changed during the delay, so the owner is alive */
		log_erros(sp, "delta_acquire %llu busy %llu %llu %llu %.48s",
			  (unsigned long long)host_id,
			  (unsigned long long)leader.owner_id,
			  (unsigned long long)leader.owner_generation,
			  (unsigned long long)leader.timestamp,
			  leader.resource_name);
		return SANLK_HOSTID_BUSY;
	}

 write_new:
	new_ts = monotime();
	leader.timestamp = new_ts;
	leader.owner_id = host_id;
	leader.owner_generation++;
	snprintf(leader.resource_name, NAME_ID_SIZE, "%s", our_host_name);
	leader.checksum = leader_checksum(&leader);

	log_space(sp, "delta_acquire %llu write %llu %llu %llu %.48s",
		  (unsigned long long)host_id,
		  (unsigned long long)leader.owner_id,
		  (unsigned long long)leader.owner_generation,
		  (unsigned long long)leader.timestamp,
		  leader.resource_name);

	rv = write_sector(disk, host_id - 1, (char *)&leader,
			  sizeof(struct leader_record),
			  task, "delta_leader");
	if (rv < 0)
		return rv;

	memcpy(&leader1, &leader, sizeof(struct leader_record));

	/* short delay (2d), then reread to confirm nobody overwrote our
	 * record while we were writing it */
	delay = 2 * task->io_timeout_seconds;
	log_space(sp, "delta_acquire %llu delta_short_delay %d",
		  (unsigned long long)host_id, delay);

	for (i = 0; i < delay; i++) {
		if (sp->external_remove || external_shutdown)
			return SANLK_ERROR;
		sleep(1);
	}

	error = delta_lease_leader_read(task, disk, space_name, host_id, &leader,
					"delta_acquire_check");
	if (error < 0)
		return error;

	if (memcmp(&leader1, &leader, sizeof(struct leader_record))) {
		log_erros(sp, "delta_acquire %llu busy %llu %llu %llu %.48s",
			  (unsigned long long)host_id,
			  (unsigned long long)leader.owner_id,
			  (unsigned long long)leader.owner_generation,
			  (unsigned long long)leader.timestamp,
			  leader.resource_name);
		return SANLK_HOSTID_BUSY;
	}

	memcpy(leader_ret, &leader, sizeof(struct leader_record));
	return SANLK_OK;
}

/* Renew the delta lease identified by leader_last->owner_id.
 * prev_result is the result of the previous renewal attempt; when it is
 * SANLK_AIO_TIMEOUT the previously timed out read may be reaped and its
 * result reused instead of issuing a new read. */

int delta_lease_renew(struct task *task,
		      struct space *sp,
		      struct sync_disk *disk,
		      char *space_name,
		      char *bitmap,
		      int prev_result,
		      int *read_result,
		      struct leader_record *leader_last,
		      struct leader_record *leader_ret)
{
	struct leader_record leader;
	char **p_iobuf;
	char **p_wbuf;
	char *wbuf;
	uint64_t host_id, id_offset, new_ts;
	int rv, iobuf_len, sector_size, io_timeout_save;

	if (!leader_last)
		return -EINVAL;

	*read_result = SANLK_ERROR;

	host_id = leader_last->owner_id;

	iobuf_len = sp->align_size;

	sector_size = disk->sector_size;

	/* offset of our leader_record */
	id_offset = (host_id - 1) * sector_size;
	if (id_offset > iobuf_len)
		return -EINVAL;

	/* if the previous renew timed out in this initial read, and that
	   read is now complete, we can use that result here instead of
	   discarding it and doing another.
*/ if (prev_result == SANLK_AIO_TIMEOUT) { if (!task->read_iobuf_timeout_aicb) { /* shouldn't happen, when do_linux_aio returned AIO_TIMEOUT it should have set read_iobuf_timeout_aicb */ log_erros(sp, "delta_renew reap no aicb"); goto skip_reap; } if (!task->iobuf) { /* shouldn't happen */ log_erros(sp, "delta_renew reap no iobuf"); goto skip_reap; } rv = read_iobuf_reap(disk->fd, disk->offset, task->iobuf, iobuf_len, task); log_space(sp, "delta_renew reap %d", rv); if (!rv) { task->read_iobuf_timeout_aicb = NULL; goto read_done; } skip_reap: /* abandon the previous timed out read and try a new one from scratch. the current task->iobuf mem will freed when timeout_aicb completes sometime */ task->read_iobuf_timeout_aicb = NULL; task->iobuf = NULL; } if (task->read_iobuf_timeout_aicb) { /* this could happen get here if there was another read between renewal reads, which timed out and caused read_iobuf_timeout_aicb to be set; I don't think there are any cases where that would happen, though. we could avoid this confusion by passing back the timed out aicb along with SANLK_AIO_TIMEOUT, and only save the timed out aicb when we want to try to reap it later. */ log_space(sp, "delta_renew timeout_aicb is unexpectedly %p iobuf %p", task->read_iobuf_timeout_aicb, task->iobuf); task->read_iobuf_timeout_aicb = NULL; task->iobuf = NULL; } if (!task->iobuf) { /* this will happen the first time renew is called, and after a timed out renewal read fails to be reaped (see task->iobuf = NULL above) */ p_iobuf = &task->iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) { log_erros(sp, "dela_renew memalign rv %d", rv); rv = -ENOMEM; } } rv = read_iobuf(disk->fd, disk->offset, task->iobuf, iobuf_len, task); if (rv) { /* the next time delta_lease_renew() is called, prev_result will be this rv. 
If this rv is SANLK_AIO_TIMEOUT, we'll try to reap the event */ log_erros(sp, "delta_renew read rv %d offset %llu %s", rv, (unsigned long long)disk->offset, disk->path); return rv; } read_done: *read_result = SANLK_OK; memcpy(&leader, task->iobuf+id_offset, sizeof(struct leader_record)); rv = verify_leader(disk, space_name, host_id, &leader, "delta_renew"); if (rv < 0) return rv; /* We can't always memcmp(&leader, leader_last) because previous writes may have timed out and we don't know if they were actually written or not. We can definately verify that we're still the owner, though, which is the main thing we need to know. */ if (leader.owner_id != leader_last->owner_id || leader.owner_generation != leader_last->owner_generation || memcmp(leader.resource_name, leader_last->resource_name, NAME_ID_SIZE)) { log_erros(sp, "delta_renew not owner"); log_leader_error(0, space_name, host_id, disk, leader_last, "delta_renew_last"); log_leader_error(0, space_name, host_id, disk, &leader, "delta_renew_read"); return SANLK_RENEW_OWNER; } if (prev_result == SANLK_OK && memcmp(&leader, leader_last, sizeof(struct leader_record))) { log_erros(sp, "delta_renew reread mismatch"); log_leader_error(0, space_name, host_id, disk, leader_last, "delta_renew_last"); log_leader_error(0, space_name, host_id, disk, &leader, "delta_renew_read"); return SANLK_RENEW_DIFF; } new_ts = monotime(); if (leader.timestamp >= new_ts) { log_erros(sp, "delta_renew timestamp too small"); } leader.timestamp = new_ts; leader.checksum = leader_checksum(&leader); p_wbuf = &wbuf; rv = posix_memalign((void *)p_wbuf, getpagesize(), sector_size); if (rv) { log_erros(sp, "dela_renew write memalign rv %d", rv); return -ENOMEM; } memset(wbuf, 0, sector_size); memcpy(wbuf, &leader, sizeof(struct leader_record)); memcpy(wbuf+LEADER_RECORD_MAX, bitmap, HOSTID_BITMAP_SIZE); /* extend io timeout for this one write; we need to give this write every chance to succeed, and there's no point in letting it time out. 
there's nothing we would do but retry it, and timing out and retrying unnecessarily would probably be counter productive. */ io_timeout_save = task->io_timeout_seconds; task->io_timeout_seconds = task->host_dead_seconds; rv = write_iobuf(disk->fd, disk->offset+id_offset, wbuf, sector_size, task); if (rv != SANLK_AIO_TIMEOUT) free(wbuf); task->io_timeout_seconds = io_timeout_save; if (rv < 0) return rv; /* the paper shows doing a delay and another read here, but it seems unnecessary since we do the same at the beginning of the next renewal */ memcpy(leader_ret, &leader, sizeof(struct leader_record)); return SANLK_OK; } int delta_lease_release(struct task *task, struct space *sp, struct sync_disk *disk, char *space_name GNUC_UNUSED, struct leader_record *leader_last, struct leader_record *leader_ret) { struct leader_record leader; uint64_t host_id; int rv; if (!leader_last) return -EINVAL; host_id = leader_last->owner_id; log_space(sp, "delta_release %llu begin", (unsigned long long)host_id); memcpy(&leader, leader_last, sizeof(struct leader_record)); leader.timestamp = LEASE_FREE; leader.checksum = leader_checksum(&leader); rv = write_sector(disk, host_id - 1, (char *)&leader, sizeof(struct leader_record), task, "delta_leader"); if (rv < 0) return rv; memcpy(leader_ret, &leader, sizeof(struct leader_record)); return SANLK_OK; } /* the host_id lease area begins disk->offset bytes from the start of block device disk->path */ int delta_lease_init(struct task *task, struct sync_disk *disk, char *space_name, int max_hosts) { struct leader_record *leader; char *iobuf, **p_iobuf; int iobuf_len; int align_size; int i, rv; if (!max_hosts) max_hosts = DEFAULT_MAX_HOSTS; align_size = direct_align(disk); if (align_size < 0) return align_size; if (disk->sector_size * max_hosts > align_size) return -E2BIG; iobuf_len = align_size; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return rv; memset(iobuf, 0, iobuf_len); /* host_id N is block 
offset N-1 */ for (i = 0; i < max_hosts; i++) { leader = (struct leader_record *)(iobuf + (i * disk->sector_size)); leader->magic = DELTA_DISK_MAGIC; leader->version = DELTA_DISK_VERSION_MAJOR | DELTA_DISK_VERSION_MINOR; leader->sector_size = disk->sector_size; leader->max_hosts = 1; leader->timestamp = LEASE_FREE; strncpy(leader->space_name, space_name, NAME_ID_SIZE); leader->checksum = leader_checksum(leader); } rv = write_iobuf(disk->fd, disk->offset, iobuf, iobuf_len, task); if (rv != SANLK_AIO_TIMEOUT) free(iobuf); if (rv < 0) return rv; return 0; } sanlock-2.2/src/log.c0000644000175100017510000001442711751766670013547 0ustar weberweber/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "log.h" #define LOG_STR_LEN 512 static char log_str[LOG_STR_LEN]; static pthread_t thread_handle; static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t log_cond = PTHREAD_COND_INITIALIZER; static char log_dump[LOG_DUMP_SIZE]; static unsigned int log_point; static unsigned int log_wrap; struct entry { int level; char str[LOG_STR_LEN]; }; #define LOG_DEFAULT_ENTRIES 4096 static struct entry *log_ents; static unsigned int log_num_ents = LOG_DEFAULT_ENTRIES; static unsigned int log_head_ent; /* add at head */ static unsigned int log_tail_ent; /* remove from tail */ static unsigned int log_dropped; static unsigned int log_pending_ents; static unsigned int log_thread_done; static char logfile_path[PATH_MAX]; static FILE *logfile_fp; extern int log_logfile_priority; extern int log_syslog_priority; extern int log_stderr_priority; static void _log_save_dump(int level 
	      GNUC_UNUSED, int len)
{
	int i;

	/* fast path: the message fits before the end of the dump buffer,
	   so a single memcpy is enough */
	if (len < LOG_DUMP_SIZE - log_point) {
		memcpy(log_dump+log_point, log_str, len);
		log_point += len;

		if (log_point == LOG_DUMP_SIZE) {
			log_point = 0;
			log_wrap = 1;
		}
		return;
	}

	/* slow path: copy byte by byte, wrapping to the start of the
	   circular buffer when the end is reached */
	for (i = 0; i < len; i++) {
		log_dump[log_point++] = log_str[i];

		if (log_point == LOG_DUMP_SIZE) {
			log_point = 0;
			log_wrap = 1;
		}
	}
}

/* Queue the message currently in log_str onto the log_ents ring so the
   log thread can write it to logfile/syslog later; when the ring is full
   the message is dropped and the drop is counted instead of blocking the
   caller.  Caller holds log_mutex (see log_level below). */
static void _log_save_ent(int level, int len)
{
	struct entry *e;

	if (!log_ents)
		return;

	/* ring full: record the drop, don't block */
	if (log_pending_ents == log_num_ents) {
		log_dropped++;
		return;
	}

	e = &log_ents[log_head_ent++];
	log_head_ent = log_head_ent % log_num_ents;
	log_pending_ents++;

	e->level = level;
	memcpy(e->str, log_str, len);
}

/*
 * This log function:
 * 1. formats the log message in the log_str buffer
 * 2. copies log_str into the log_dump circular buffer
 * 3. copies log_str into the log_ents circular array to be written to
 *    logfile and/or syslog (so callers don't block writing messages to files)
 */

void log_level(uint32_t space_id, uint32_t token_id, char *name_in, int level, const char *fmt, ...)
{
	va_list ap;
	char name[NAME_ID_SIZE + 1];
	int ret, pos = 0;
	int len = LOG_STR_LEN - 2; /* leave room for \n\0 */

	memset(name, 0, sizeof(name));

	/* build the "sN"/"rN"/"sN:rN" prefix identifying the lockspace
	   and/or resource the message refers to; fall back to the first
	   8 chars of name_in when no ids were given */
	if (space_id && !token_id)
		snprintf(name, NAME_ID_SIZE, "s%u ", space_id);
	else if (!space_id && token_id)
		snprintf(name, NAME_ID_SIZE, "r%u ", token_id);
	else if (space_id && token_id)
		snprintf(name, NAME_ID_SIZE, "s%u:r%u ", space_id, token_id);
	else if (name_in)
		snprintf(name, NAME_ID_SIZE, "%.8s ", name_in);

	pthread_mutex_lock(&log_mutex);

	ret = snprintf(log_str + pos, len - pos, "%llu %s",
		       (unsigned long long)monotime(), name);

	pos += ret;

	va_start(ap, fmt);
	ret = vsnprintf(log_str + pos, len - pos, fmt, ap);
	va_end(ap);

	/* vsnprintf returns the would-be (untruncated) length; clamp pos
	   so the \n\0 terminator always fits inside log_str */
	if (ret >= len - pos)
		pos = len - 1;
	else
		pos += ret;

	log_str[pos++] = '\n';
	log_str[pos++] = '\0';

	/*
	 * save all messages in circular buffer "log_dump" that can be
	 * sent over unix socket
	 */

	_log_save_dump(level, pos - 1);

	/*
	 * save some messages in circular array "log_ents" that a thread
	 * writes to logfile/syslog
	 */

	if (level <= log_logfile_priority || level <= log_syslog_priority)
		_log_save_ent(level, pos);

	if (level <= log_stderr_priority)
		fprintf(stderr, "%s", log_str);

	pthread_cond_signal(&log_cond);
	pthread_mutex_unlock(&log_mutex);
}

/* Write one formatted message to the logfile and/or syslog, depending on
   the per-sink priority thresholds.  Called from the log thread, not from
   log_level() callers. */
static void write_entry(int level, char *str)
{
	if ((level <= log_logfile_priority) && logfile_fp) {
		fprintf(logfile_fp, "%s", str);
		fflush(logfile_fp);
	}
	if (level <= log_syslog_priority)
		syslog(level, "%s", str);
}

/* Report how many entries were discarded while the log_ents ring was full. */
static void write_dropped(int level, int num)
{
	char str[LOG_STR_LEN];
	sprintf(str, "dropped %d entries", num);
	write_entry(level, str);
}

/* Copy the in-memory log_dump circular buffer into buf in chronological
   order (oldest bytes first); *len is set to the number of bytes copied.
   buf must be at least LOG_DUMP_SIZE bytes. */
void copy_log_dump(char *buf, int *len)
{
	int tail_len;

	pthread_mutex_lock(&log_mutex);

	if (!log_wrap && !log_point) {
		*len = 0;
	} else if (log_wrap) {
		/* wrapped: oldest data starts at log_point */
		tail_len = LOG_DUMP_SIZE - log_point;
		memcpy(buf, log_dump+log_point, tail_len);
		if (log_point)
			memcpy(buf+tail_len, log_dump, log_point);
		*len = LOG_DUMP_SIZE;
	} else {
		memcpy(buf, log_dump, log_point-1);
		*len = log_point-1;
	}

	pthread_mutex_unlock(&log_mutex);
}
static void *log_thread_fn(void *arg GNUC_UNUSED) { char str[LOG_STR_LEN]; struct entry *e; int level, prev_dropped = 0; while (1) { pthread_mutex_lock(&log_mutex); while (log_head_ent == log_tail_ent) { if (log_thread_done) { pthread_mutex_unlock(&log_mutex); goto out; } pthread_cond_wait(&log_cond, &log_mutex); } e = &log_ents[log_tail_ent++]; log_tail_ent = log_tail_ent % log_num_ents; log_pending_ents--; memcpy(str, e->str, LOG_STR_LEN); level = e->level; prev_dropped = log_dropped; log_dropped = 0; pthread_mutex_unlock(&log_mutex); if (prev_dropped) { write_dropped(level, prev_dropped); prev_dropped = 0; } write_entry(level, str); } out: pthread_exit(NULL); } int setup_logging(void) { int fd, rv; snprintf(logfile_path, PATH_MAX, "%s/%s", SANLK_LOG_DIR, SANLK_LOGFILE_NAME); logfile_fp = fopen(logfile_path, "a+"); if (logfile_fp) { fd = fileno(logfile_fp); fcntl(fd, F_SETFD, fcntl(fd, F_GETFD, 0) | FD_CLOEXEC); } log_ents = malloc(log_num_ents * sizeof(struct entry)); if (!log_ents) { fclose(logfile_fp); logfile_fp = NULL; return -1; } memset(log_ents, 0, log_num_ents * sizeof(struct entry)); openlog(DAEMON_NAME, LOG_CONS | LOG_PID, LOG_DAEMON); rv = pthread_create(&thread_handle, NULL, log_thread_fn, NULL); if (rv) return -1; return 0; } void close_logging(void) { pthread_mutex_lock(&log_mutex); log_thread_done = 1; pthread_cond_signal(&log_cond); pthread_mutex_unlock(&log_mutex); pthread_join(thread_handle, NULL); pthread_mutex_lock(&log_mutex); closelog(); if (logfile_fp) { fclose(logfile_fp); logfile_fp = NULL; } pthread_mutex_unlock(&log_mutex); } sanlock-2.2/README.license0000644000175100017510000000060211751766670014322 0ustar weberweberLGPLv2+ src/libsanlock_client.so src/sanlock.h src/sanlock_rv.h src/sanlock_admin.h src/sanlock_resource.h src/sanlock_sock.h src/sanlock_sock.c src/client.c wdmd/libwdmd.so wdmd/wdmd.h wdmd/wdmd_sock.h wdmd/wdmd_sock.c wdmd/client.c GPLv2 src/list.h (copied from linux kernel) GPLv2+ src/crc32c.c (copied from btrfs-progs 
which copied from linux kernel) all other original files sanlock-2.2/python/0000755000175100017510000000000011751766670013344 5ustar weberwebersanlock-2.2/python/sanlock.c0000644000175100017510000004157611751766670015157 0ustar weberweber/* * Copyright 2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #ifndef __unused #define __unused __attribute__ ((unused)) #endif /* Sanlock module */ PyDoc_STRVAR(pydoc_sanlock, "\ Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.\n\ This copyrighted material is made available to anyone wishing to use,\n\ modify, copy, or redistribute it subject to the terms and conditions\n\ of the GNU General Public License v2 or (at your option) any later version."); PyObject *py_module; /* Sanlock exception */ static PyObject *py_exception; static void __set_exception(int en, char *msg) { char *err_name; PyObject *exc_tuple; if (en < 0 && en > -200) { en = -en; err_name = strerror(en); } else { err_name = "Sanlock exception"; } exc_tuple = Py_BuildValue("(iss)", en, msg, err_name); if (exc_tuple == NULL) { PyErr_NoMemory(); } else { PyErr_SetObject(py_exception, exc_tuple); Py_DECREF(exc_tuple); } } static int __parse_resource(PyObject *obj, struct sanlk_resource **res_ret) { int i, num_disks, res_len; struct sanlk_resource *res; num_disks = PyList_Size(obj); res_len = sizeof(struct sanlk_resource) + (sizeof(struct sanlk_disk) * num_disks); res = malloc(res_len); if (res == NULL) { PyErr_NoMemory(); return -1; } memset(res, 0, res_len); res->num_disks = num_disks; for (i = 0; i < num_disks; i++) { char *p = NULL; PyObject *tuple, *path = NULL, *offset = NULL; tuple = PyList_GetItem(obj, i); if (PyTuple_Check(tuple)) { if (PyTuple_Size(tuple) != 2) { __set_exception(EINVAL, "Invalid resource 
tuple"); goto exit_fail; } path = PyTuple_GetItem(tuple, 0); offset = PyTuple_GetItem(tuple, 1); p = PyString_AsString(path); if (!PyInt_Check(offset)) { __set_exception(EINVAL, "Invalid resource offset"); goto exit_fail; } } else if (PyString_Check(tuple)) { p = PyString_AsString(tuple); } if (p == NULL) { __set_exception(EINVAL, "Invalid resource path"); goto exit_fail; } strncpy(res->disks[i].path, p, SANLK_PATH_LEN - 1); if (offset == NULL) { res->disks[i].offset = 0; } else { res->disks[i].offset = PyInt_AsLong(offset); } } *res_ret = res; return 0; exit_fail: free(res); return -1; } /* register */ PyDoc_STRVAR(pydoc_register, "\ register() -> int\n\ Register to sanlock daemon and return the connection fd."); static PyObject * py_register(PyObject *self __unused, PyObject *args) { int sanlockfd; sanlockfd = sanlock_register(); if (sanlockfd < 0) { __set_exception(sanlockfd, "Sanlock registration failed"); return NULL; } return PyInt_FromLong(sanlockfd); } /* get_alignment */ PyDoc_STRVAR(pydoc_get_alignment, "\ get_alignment(path) -> int\n\ Get device alignment."); static PyObject * py_get_alignment(PyObject *self __unused, PyObject *args) { int rv; const char *path; struct sanlk_disk disk; /* parse python tuple */ if (!PyArg_ParseTuple(args, "s", &path)) { return NULL; } memset(&disk, 0, sizeof(struct sanlk_disk)); strncpy(disk.path, path, SANLK_PATH_LEN - 1); /* get device alignment (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_direct_align(&disk); Py_END_ALLOW_THREADS if (rv < 0) { __set_exception(rv, "Unable to get device alignment"); return NULL; } return PyInt_FromLong(rv); } /* init_lockspace */ PyDoc_STRVAR(pydoc_init_lockspace, "\ init_lockspace(lockspace, path, offset=0, max_hosts=0, num_hosts=0, \ use_aio=True)\n\ Initialize a device to be used as sanlock lockspace."); static PyObject * py_init_lockspace(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv, max_hosts = 0, num_hosts = 0, use_aio = 1; const char *lockspace, 
*path; struct sanlk_lockspace ls; static char *kwlist[] = {"lockspace", "path", "offest", "max_hosts", "num_hosts", "use_aio", NULL}; /* initialize lockspace structure */ memset(&ls, 0, sizeof(struct sanlk_lockspace)); /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "ss|kiii", kwlist, &lockspace, &path, &ls.host_id_disk.offset, &max_hosts, &num_hosts, &use_aio)) { return NULL; } /* prepare sanlock names */ strncpy(ls.name, lockspace, SANLK_NAME_LEN); strncpy(ls.host_id_disk.path, path, SANLK_PATH_LEN - 1); /* init sanlock lockspace (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_direct_init(&ls, NULL, max_hosts, num_hosts, use_aio); Py_END_ALLOW_THREADS if (rv != 0) { __set_exception(rv, "Sanlock lockspace init failure"); return NULL; } Py_RETURN_NONE; } /* init_resource */ PyDoc_STRVAR(pydoc_init_resource, "\ init_resource(lockspace, resource, disks, max_hosts=0, num_hosts=0, \ use_aio=True)\n\ Initialize a device to be used as sanlock resource.\n\ The disks must be in the format: [(path, offset), ... 
]"); static PyObject * py_init_resource(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv, max_hosts = 0, num_hosts = 0, use_aio = 1; const char *lockspace, *resource; struct sanlk_resource *res; PyObject *disks; static char *kwlist[] = {"lockspace", "resource", "disks", "max_hosts", "num_hosts", "use_aio", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "ssO!|iii", kwlist, &lockspace, &resource, &PyList_Type, &disks, &max_hosts, &num_hosts, &use_aio)) { return NULL; } /* parse and check sanlock resource */ if (__parse_resource(disks, &res) != 0) { return NULL; } /* prepare sanlock names */ strncpy(res->lockspace_name, lockspace, SANLK_NAME_LEN); strncpy(res->name, resource, SANLK_NAME_LEN); /* init sanlock resource (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_direct_init(NULL, res, max_hosts, num_hosts, use_aio); Py_END_ALLOW_THREADS if (rv != 0) { __set_exception(rv, "Sanlock resource init failure"); goto exit_fail; } free(res); Py_RETURN_NONE; exit_fail: free(res); return NULL; } /* add_lockspace */ PyDoc_STRVAR(pydoc_add_lockspace, "\ add_lockspace(lockspace, host_id, path, offset=0, async=False)\n\ Add a lockspace, acquiring a host_id in it. 
If async is True the function\n\ will return immediatly and the status can be checked using inq_lockspace."); static PyObject * py_add_lockspace(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv, async = 0, flags = 0; const char *lockspace, *path; struct sanlk_lockspace ls; static char *kwlist[] = {"lockspace", "host_id", "path", "offset", "async", NULL}; /* initialize lockspace structure */ memset(&ls, 0, sizeof(struct sanlk_lockspace)); /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "sks|ki", kwlist, &lockspace, &ls.host_id, &path, &ls.host_id_disk.offset, &async)) { return NULL; } /* prepare sanlock_add_lockspace flags */ if (async) { flags |= SANLK_ADD_ASYNC; } /* prepare sanlock names */ strncpy(ls.name, lockspace, SANLK_NAME_LEN); strncpy(ls.host_id_disk.path, path, SANLK_PATH_LEN - 1); /* add sanlock lockspace (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_add_lockspace(&ls, flags); Py_END_ALLOW_THREADS if (rv != 0) { __set_exception(rv, "Sanlock lockspace add failure"); return NULL; } Py_RETURN_NONE; } /* inq_lockspace */ PyDoc_STRVAR(pydoc_inq_lockspace, "\ inq_lockspace(lockspace, host_id, path, offset=0)\n\ Return True if the sanlock daemon currently owns the host_id in lockspace,\n\ False otherwise. 
The special value None is returned when the daemon is\n\ still in the process of acquiring or releasing the host_id."); static PyObject * py_inq_lockspace(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv; const char *lockspace, *path; struct sanlk_lockspace ls; static char *kwlist[] = {"lockspace", "host_id", "path", "offset", NULL}; /* initialize lockspace structure */ memset(&ls, 0, sizeof(struct sanlk_lockspace)); /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "sks|k", kwlist, &lockspace, &ls.host_id, &path, &ls.host_id_disk.offset)) { return NULL; } /* prepare sanlock names */ strncpy(ls.name, lockspace, SANLK_NAME_LEN); strncpy(ls.host_id_disk.path, path, SANLK_PATH_LEN - 1); /* add sanlock lockspace (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_inq_lockspace(&ls, 0); Py_END_ALLOW_THREADS if (rv == 0) { Py_RETURN_TRUE; } else if (rv == -ENOENT) { Py_RETURN_FALSE; } else if (rv == -EINPROGRESS) { Py_RETURN_NONE; } __set_exception(rv, "Sanlock lockspace inquire failure"); return NULL; } /* rem_lockspace */ PyDoc_STRVAR(pydoc_rem_lockspace, "\ rem_lockspace(lockspace, host_id, path, offset=0, async=False, unused=False)\n\ Remove a lockspace, releasing the acquired host_id. If async is True the\n\ function will return immediatly and the status can be checked using\n\ inq_lockspace. 
If unused is True the command will fail (EBUSY) if there is\n\ at least one acquired resource in the lockspace (instead of automatically\n\ release it)."); static PyObject * py_rem_lockspace(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv, async = 0, unused = 0, flags = 0; const char *lockspace, *path; struct sanlk_lockspace ls; static char *kwlist[] = {"lockspace", "host_id", "path", "offset", "async", "unused", NULL}; /* initialize lockspace structure */ memset(&ls, 0, sizeof(struct sanlk_lockspace)); /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "sks|kii", kwlist, &lockspace, &ls.host_id, &path, &ls.host_id_disk.offset, &async, &unused)) { return NULL; } /* prepare sanlock names */ strncpy(ls.name, lockspace, SANLK_NAME_LEN); strncpy(ls.host_id_disk.path, path, SANLK_PATH_LEN - 1); /* prepare sanlock_rem_lockspace flags */ if (async) { flags |= SANLK_REM_ASYNC; } if (unused) { flags |= SANLK_REM_UNUSED; } /* remove sanlock lockspace (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_rem_lockspace(&ls, flags); Py_END_ALLOW_THREADS if (rv != 0) { __set_exception(rv, "Sanlock lockspace remove failure"); return NULL; } Py_RETURN_NONE; } /* acquire */ PyDoc_STRVAR(pydoc_acquire, "\ acquire(lockspace, resource, disks [, slkfd=fd, pid=owner, shared=False])\n\ Acquire a resource lease for the current process (using the slkfd argument\n\ to specify the sanlock file descriptor) or for an other process (using the\n\ pid argument). If shared is True the resource will be acquired in the shared\n\ mode.\n\ The disks must be in the format: [(path, offset), ... 
]\n"); static PyObject * py_acquire(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv, sanlockfd = -1, pid = -1, shared = 0; const char *lockspace, *resource; struct sanlk_resource *res; PyObject *disks; static char *kwlist[] = {"lockspace", "resource", "disks", "slkfd", "pid", "shared", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "ssO!|iii", kwlist, &lockspace, &resource, &PyList_Type, &disks, &sanlockfd, &pid, &shared)) { return NULL; } /* check if any of the slkfd or pid parameters was given */ if (sanlockfd == -1 && pid == -1) { __set_exception(EINVAL, "Invalid slkfd and pid values"); return NULL; } /* parse and check sanlock resource */ if (__parse_resource(disks, &res) != 0) { return NULL; } /* prepare sanlock names */ strncpy(res->lockspace_name, lockspace, SANLK_NAME_LEN); strncpy(res->name, resource, SANLK_NAME_LEN); /* prepare sanlock flags */ if (shared) { res->flags |= SANLK_RES_SHARED; } /* acquire sanlock resource (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_acquire(sanlockfd, pid, 0, 1, &res, 0); Py_END_ALLOW_THREADS if (rv != 0) { __set_exception(rv, "Sanlock resource not acquired"); goto exit_fail; } free(res); Py_RETURN_NONE; exit_fail: free(res); return NULL; } /* release */ PyDoc_STRVAR(pydoc_release, "\ release(lockspace, resource, disks [, slkfd=fd, pid=owner])\n\ Release a resource lease for the current process.\n\ The disks must be in the format: [(path, offset), ... 
]"); static PyObject * py_release(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv, sanlockfd = -1, pid = -1; const char *lockspace, *resource; struct sanlk_resource *res; PyObject *disks; static char *kwlist[] = {"lockspace", "resource", "disks", "slkfd", "pid", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "ssO!|ii", kwlist, &lockspace, &resource, &PyList_Type, &disks, &sanlockfd, &pid)) { return NULL; } /* parse and check sanlock resource */ if (__parse_resource(disks, &res) != 0) { return NULL; } /* prepare sanlock names */ strncpy(res->lockspace_name, lockspace, SANLK_NAME_LEN); strncpy(res->name, resource, SANLK_NAME_LEN); /* release sanlock resource (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_release(sanlockfd, pid, 0, 1, &res); Py_END_ALLOW_THREADS if (rv != 0) { __set_exception(rv, "Sanlock resource not released"); goto exit_fail; } free(res); Py_RETURN_NONE; exit_fail: free(res); return NULL; } /* exception_errno */ PyDoc_STRVAR(pydoc_errno, "exception errno"); static PyObject * py_exception_errno(PyObject *self, PyBaseExceptionObject *exc_obj) { PyObject *exc_errno; exc_errno = PyTuple_GetItem(exc_obj->args, 0); if (exc_errno == NULL) return NULL; Py_INCREF(exc_errno); return exc_errno; } static PyMethodDef sanlock_methods[] = { {"register", py_register, METH_NOARGS, pydoc_register}, {"get_alignment", py_get_alignment, METH_VARARGS, pydoc_get_alignment}, {"init_lockspace", (PyCFunction) py_init_lockspace, METH_VARARGS|METH_KEYWORDS, pydoc_init_lockspace}, {"init_resource", (PyCFunction) py_init_resource, METH_VARARGS|METH_KEYWORDS, pydoc_init_resource}, {"add_lockspace", (PyCFunction) py_add_lockspace, METH_VARARGS|METH_KEYWORDS, pydoc_add_lockspace}, {"inq_lockspace", (PyCFunction) py_inq_lockspace, METH_VARARGS|METH_KEYWORDS, pydoc_inq_lockspace}, {"rem_lockspace", (PyCFunction) py_rem_lockspace, METH_VARARGS|METH_KEYWORDS, pydoc_rem_lockspace}, {"acquire", (PyCFunction) py_acquire, 
METH_VARARGS|METH_KEYWORDS, pydoc_acquire}, {"release", (PyCFunction) py_release, METH_VARARGS|METH_KEYWORDS, pydoc_release}, {NULL, NULL, 0, NULL} }; static PyMethodDef sanlock_exception = { "errno", (PyCFunction) py_exception_errno, METH_O, pydoc_errno }; static void initexception(void) { int rv; PyObject *dict, *func, *meth; dict = PyDict_New(); if (dict == NULL) return; func = PyCFunction_New(&sanlock_exception, NULL); meth = PyObject_CallFunction((PyObject *) &PyProperty_Type, "O", func); Py_DECREF(func); if (meth == NULL) return; rv = PyDict_SetItemString(dict, sanlock_exception.ml_name, meth); Py_DECREF(meth); if (rv < 0) return; py_exception = PyErr_NewException("sanlock.SanlockException", NULL, dict); Py_DECREF(dict); } PyMODINIT_FUNC initsanlock(void) { py_module = Py_InitModule4("sanlock", sanlock_methods, pydoc_sanlock, NULL, PYTHON_API_VERSION); /* Python's module loader doesn't support clean recovery from errors */ if (py_module == NULL) return; /* Initializing sanlock exception */ initexception(); if (py_exception == NULL) return; Py_INCREF(py_exception); PyModule_AddObject(py_module, "SanlockException", py_exception); } sanlock-2.2/python/Makefile0000644000175100017510000000050711751766670015006 0ustar weberweber# Copyright 2010-2011 Red Hat, Inc. # # This copyrighted material is made available to anyone wishing to use, # modify, copy, or redistribute it subject to the terms and conditions # of the GNU General Public License v.2. 
all: python setup.py build install: python setup.py install --root=$(DESTDIR) clean: rm -rf build sanlock-2.2/python/example.py0000644000175100017510000000240111751766670015346 0ustar weberweberimport os import tempfile import sanlock HOST_ID = 1 LOCKSPACE_NAME = "lockspace1" RESOURCE_NAME = "resource1" def main(): print "Creating the sanlock disk" fd, disk = tempfile.mkstemp() os.close(fd) offset = sanlock.get_alignment(disk) SNLK_DISKS = [(disk, offset)] print "Registering to sanlock" fd = sanlock.register() print "Initializing '%s'" % (LOCKSPACE_NAME,) sanlock.init_lockspace(LOCKSPACE_NAME, disk) print "Initializing '%s' on '%s'" % (RESOURCE_NAME, LOCKSPACE_NAME) sanlock.init_resource(LOCKSPACE_NAME, RESOURCE_NAME, SNLK_DISKS) print "Acquiring the id '%i' on '%s'" % (HOST_ID, LOCKSPACE_NAME) sanlock.add_lockspace(LOCKSPACE_NAME, HOST_ID, disk) try: print "Acquiring '%s' on '%s'" % (RESOURCE_NAME, LOCKSPACE_NAME) sanlock.acquire(LOCKSPACE_NAME, RESOURCE_NAME, SNLK_DISKS, slkfd=fd) print "Releasing '%s' on '%s'" % (RESOURCE_NAME, LOCKSPACE_NAME) sanlock.release(LOCKSPACE_NAME, RESOURCE_NAME, SNLK_DISKS, slkfd=fd) finally: print "Releasing the id '%i' on '%s'" % (HOST_ID, LOCKSPACE_NAME) sanlock.rem_lockspace(LOCKSPACE_NAME, HOST_ID, disk) print "Removing the sanlock disk" os.remove(disk) if __name__ == '__main__': main() sanlock-2.2/python/setup.py0000644000175100017510000000121311751766670015053 0ustar weberweber# Copyright 2010-2011 Red Hat, Inc. # # This copyrighted material is made available to anyone wishing to use, # modify, copy, or redistribute it subject to the terms and conditions # of the GNU General Public License v.2. from distutils.core import setup, Extension sanlocklib = ['sanlock'] sanlock = Extension(name = 'sanlock', sources = ['sanlock.c'], include_dirs = ['../src'], library_dirs = ['../src'], libraries = sanlocklib) setup(name = 'Sanlock', version = '1.0', description = 'Sanlock python package', ext_modules = [sanlock])