pax_global_header00006660000000000000000000000064125333076460014523gustar00rootroot0000000000000052 comment=aec1497e2b29f57997ae27351ac01abc25026387 numad-0.5+20150602/000077500000000000000000000000001253330764600133665ustar00rootroot00000000000000numad-0.5+20150602/Makefile000066400000000000000000000032361253330764600150320ustar00rootroot00000000000000# these can (and should) be overridden on the make command line for production CFLAGS := -std=gnu99 # these are used for the benchmarks in addition to the normal CFLAGS. # Normally no need to overwrite unless you find a new magic flag to make # STREAM run faster. BENCH_CFLAGS := -O3 -ffast-math -funroll-loops # for compatibility with old releases CFLAGS += ${OPT_CFLAGS} override CFLAGS += -I. # find out if compiler supports __thread THREAD_SUPPORT := $(shell if $(CC) $(CFLAGS) threadtest.c -o threadtest \ >/dev/null 2>/dev/null ; then echo "yes" ; else echo "no"; fi) ifeq ($(THREAD_SUPPORT),no) override CFLAGS += -D__thread="" endif # find out if compiler supports -ftree-vectorize THREAD_SUPPORT := $(shell touch empty.c ; if $(CC) $(CFLAGS) -c -ftree-vectorize empty.c -o empty.o \ >/dev/null 2>/dev/null ; then echo "yes" ; else echo "no"; fi) ifeq ($(THREAD_SUPPORT),yes) BENCH_CFLAGS += -ftree-vectorize endif CLEANFILES := numad.o numad .depend .depend.X empty.c empty.o SOURCES := numad.c prefix := /usr docdir := ${prefix}/share/doc all: numad LDLIBS := -lpthread -lrt -lm numad: numad.o AR ?= ar RANLIB ?= ranlib .PHONY: install all clean html depend # BB_FIXME MANPAGES := numa.3 numactl.8 numastat.8 migratepages.8 migspeed.8 install: numad mkdir -p ${prefix}/bin mkdir -p ${prefix}/share/man/man8 install -m 0755 numad ${prefix}/bin install -m 0644 numad.8 ${prefix}/share/man/man8 clean: rm -f ${CLEANFILES} @rm -rf html distclean: clean rm -f .[^.]* */.[^.]* rm -f *~ */*~ *.orig */*.orig */*.rej *.rej depend: .depend .depend: ${CC} -MM -DDEPS_RUN -I. 
${SOURCES} > .depend.X && mv .depend.X .depend include .depend Makefile: .depend numad-0.5+20150602/numad.8000066400000000000000000000231011253330764600145600ustar00rootroot00000000000000.TH "numad" "8" "1.0.0" "Bill Gray" "Administration" .SH "NAME" .LP numad \- A user\-level daemon that provides placement advice and process management for efficient use of CPUs and memory on systems with NUMA topology. .SH "SYNOPSIS" .LP numad [\fI\-dhvV\fP] .br .LP numad [\fI\-C 0|1\fP] .br .LP numad [\fI\-H THP_hugepage_scan_sleep_ms\fP] .br .LP numad [\fI\-i [min_interval:]max_interval\fP] .br .LP numad [\fI\-K 0|1\fP] .br .LP numad [\fI\-l log_level\fP] .br .LP numad [\fI\-m target_memory_locality\fP] .br .LP numad [\fI\-p PID\fP] .br .LP numad [\fI\-r PID\fP] .br .LP numad [\fI\-R reserved-CPU-list\fP] .br .LP numad [\fI\-S 0|1\fP] .br .LP numad [\fI\-t logical_CPU_percent\fP] .br .LP numad [\fI\-u target_utilization\fP] .br .LP numad [\fI\-w NCPUS[:MB]\fP] .br .LP numad [\fI\-x PID\fP] .br .SH "DESCRIPTION" .LP Numad is a system daemon that monitors NUMA topology and resource usage. It will attempt to locate processes for efficient NUMA locality and affinity, dynamically adjusting to changing system conditions. Numad also provides guidance to assist management applications with initial manual binding of CPU and memory resources for their processes. Note that numad is primarily intended for server consolidation environments, where there might be multiple applications or multiple virtual guests running on the same server system. Numad is most likely to have a positive effect when processes can be localized in a subset of the system's NUMA nodes. If the entire system is dedicated to a large in-memory database application, for example -- especially if memory accesses will likely remain unpredictable -- numad will probably not improve performance. .SH "OPTIONS" .LP .TP \fB\-C\fR <\fI0|1\fP> This option controls whether or not numad treats inactive file cache as available memory. 
By default, numad assumes it can count inactive file cache as "free" memory when considering resources to match with processes. Specify \fI\-C 0\fP if numad should instead consider inactive file cache as a consumed resource. .TP \fB\-d\fR Debug output in log, sets the log level to LOG_DEBUG. Same effect as \fI\-l 7\fP. .TP \fB\-h\fR Display usage help information and then exit. .TP \fB\-H\fR <\fITHP_scan_sleep_ms\fP> Set the desired transparent hugepage scan interval in ms. The .na /sys/kernel/mm/tranparent_hugepage/khugepaged/scan_sleep_millisecs .ad tunable is usually set to 10000ms by the operating system. The default is changed by numad to be 1000ms since it is helpful for the hugepage daemon to be more aggressive when memory moves between nodes. Specifying (\fI\-H 0\fP) will cause numad to retain the system default value. You can also make the hugepage daemon more or less aggressive by specifying an alternate value with this option. For example, setting this value to 100ms (\fI\-H 100\fP) might improve the performance of workloads which use many transparent hugepages. .TP \fB\-i\fR <\fI[min_interval:]max_interval\fP> Sets the time interval that numad waits between system scans, in seconds to <\fImax_interval\fP>. Default <\fImax_interval\fP> is 15 seconds, default <\fImin_interval\fP> is 5 seconds. Setting a <\fImax_interval\fP> of zero will cause the daemon to exit. (This is the normal mechanism to terminate the daemon.) A bigger <\fImax_interval\fP> will decrease numad overhead but also decrease responsiveness to changing loads. The default numad max_interval can be changed in the numad.conf file. .TP \fB\-K\fR <\fI0|1\fP> This option controls whether numad keeps interleaved memory spread across NUMA nodes, or attempts to merge interleaved memory to local NUMA nodes. The default is to merge interleaved memory. This is the appropriate setting to localize processes in a subset of the system's NUMA nodes. 
If you are running a large, single-instance application that allocates interleaved memory because the workload will have continuous unpredictable memory access patterns (e.g. a large in-memory database), you might get better results by specifying \fI\-K 1\fP to instruct numad to keep interleaved memory distributed. .TP \fB\-l\fR <\fIlog_level\fP> Sets the log level to <\fIlog_level\fP>. Reasonable choices are 5, 6, or 7. The default value is 5. Note that CPU values are scaled by a factor of 100 internally and in the numad log files. Unfortunately, you don't actually have that many CPUs. .TP \fB\-m\fR <\fItarget_memory_locality\fP> Set the desired memory locality threshold to stop moving process memory. Numad might stop retrying to coalesce process memory when more than this percentage of the process's memory is already localized in the target node(s). The default is 90%. Numad will frequently localize more than the localization threshold percent, but it will not necessarily do so. Decrease the threshold to allow numad to leave more process memory distributed on various nodes. Increase the threshold to instruct numad to try to localize more memory. Acceptable values are between 50 and 100 percent. Note that setting the target memory locality to 100% might cause numad to continually retry to move memory that the kernel will never succesfully move. .TP \fB\-p\fR <\fIPID\fP> Add PID to explicit inclusion list of processes to consider for managing, if the process also uses significant resources. Multiple \fI\-p PID\fP options can be specified at daemon start, but after daemon start, only one PID can be added to the inclusion list per subsequent numad invocation. Use with \-S to precisely control the scope of processes numad can manage. Note that the specified process will not necessarily be actively managed unless it also meets numad's significance threshold -- which is currently 300MB and half of a CPU. 
.TP \fB\-r\fR <\fIPID\fP> Remove PID from both the explicit inclusion and the exclusion lists of processes. After daemon start, only one PID can be removed from the explicit process lists per subsequent numad invocation. Use with \-S and \-p and \-x to precisely control the scope of processes numad can manage. .TP \fB\-R\fR <\fICPU_LIST\fP> Specify a list of CPUs that numad should assume are reserved for non-numad use. No processes will be bound to the specified CPUs by numad. This option is effective only when starting numad. You cannot change reserved CPUs dynamically while numad is already running. .TP \fB\-S\fR <\fI0|1\fP> This option controls whether numad scans all system processes or only the processes on the explicit inclusion PID list. The default is to scan all processes. Use \fI\-S 0\fP to scan only the explicit inclusion PID list. Use \fI\-S 1\fP to again scan all system processes (excepting those on the explicit exclusion list). Starting numad as .br \fInumad \-S 0 \-p \-p \-p \fP .br will limit scanning, and thus also automatic NUMA management, to only those three explicitly specified processes. .TP \fB\-t\fR <\fIlogical_CPU_percent\fP> Specify the resource value of logical CPUs. Hardware threads typically share most core resources, and so logical CPUs add only a fraction of CPU power for many workloads. By default numad considers logical CPUs to be only 20 percent of a dedicated hardware core. .TP \fB\-u\fR <\fItarget_utilization\fP> Set the desired maximum consumption percentage of a node. Default is 85%. Decrease the target value to maintain more available resource margin on each node. Increase the target value to more exhaustively consume node resources. If you have sized your workloads to precisely fit inside a NUMA node, specifying (\fI\-u 100\fP) might improve system performance by telling numad to go ahead and consume all the resources in each node. 
It is possible to specify values up to 130 percent to oversubscribe CPUs in the nodes, but memory utilization is always capped at 100%. Use oversubscription values very carefully. .TP \fB\-v\fR Verbose output in log, sets the log level to LOG_INFO. Same effect as \fI\-l 6\fP. .TP \fB\-V\fR Display version information and exit. .TP \fB\-w\fR <\fINCPUS[:MB]\fP> Queries numad for the best NUMA nodes to bind an entity that needs <\fINCPUS\fP>. The amount of memory (in MBs) is optional, but should normally be specified as well <\fI:MB\fP> so numad can recommend NUMA nodes with available CPU capacity and adequate free memory. This query option can be used regardless of whether numad is running as a daemon. (An invocation using this option when numad is not running as a daemon, will not cause the daemon to start.) Output of this option is a string that contains a NUMA node list. For example: 2\-3,6. The recommended node list could be saved in a shell variable (e.g., NODES) and then used as the node list parameter in a .br \fInumactl \-m $NODES \-N $NODES ... \fP .br command. See numactl(8). .TP \fB\-x\fR <\fIPID\fP> Add PID to explicit exclusion list of processes to blacklist from managing. Multiple \fI\-x PID\fP options can be specified at daemon start, but after daemon start, only one PID can be added to the exclusion list per subsequent numad invocation. Use with \-S to precisely control the scope of processes numad can manage. .SH "FILES" .LP \fI/usr/bin/numad\fP .br \fI/etc/numad.conf\fP .br \fI/var/log/numad.log\fP .br \fI/var/run/numad.pid\fP .SH "ENVIRONMENT VARIABLES" .LP .TP None. .SH "EXAMPLES" .LP Numad can be run as a system daemon and can be managed by the standard init mechanisms of the host. .LP If interactive (manual) control is desired, you can start the daemon manually by typing: .LP /usr/bin/numad .LP Subsequent numad invocations while the daemon is running can be used to dynamically change most run-time options. 
.LP You can terminate numad from running by typing: .LP /usr/bin/numad -i0 .SH "AUTHORS" .LP Bill Gray .SH "SEE ALSO" .LP numactl(8) numad-0.5+20150602/numad.c000066400000000000000000003102661253330764600146460ustar00rootroot00000000000000/* numad - NUMA Daemon to automatically bind processes to NUMA nodes Copyright (C) 2012 Bill Gray (bgray@redhat.com), Red Hat Inc numad is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; version 2.1. numad is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should find a copy of v2.1 of the GNU Lesser General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ // Compile with: gcc -std=gnu99 -g -Wall -pthread -o numad numad.c -lrt -lm #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VERSION_STRING "20150602" #define VAR_RUN_FILE "/var/run/numad.pid" #define VAR_LOG_FILE "/var/log/numad.log" #define KILOBYTE (1024) #define MEGABYTE (1024 * 1024) #define FNAME_SIZE 192 #define BUF_SIZE 1024 #define BIG_BUF_SIZE 4096 // The ONE_HUNDRED factor is used to scale time and CPU usage units. // Several CPU quantities are measured in percents of a CPU; and // several time values are counted in hundreths of a second. 
#define ONE_HUNDRED 100 #define MIN_INTERVAL 5 #define MAX_INTERVAL 15 #define CPU_THRESHOLD 50 #define MEMORY_THRESHOLD 300 #define DEFAULT_HTT_PERCENT 20 #define DEFAULT_THP_SCAN_SLEEP_MS 1000 #define DEFAULT_UTILIZATION_PERCENT 85 #define DEFAULT_MEMLOCALITY_PERCENT 90 #define CONVERT_DIGITS_TO_NUM(p, n) \ n = *p++ - '0'; \ while (isdigit(*p)) { \ n *= 10; \ n += (*p++ - '0'); \ } int num_cpus = 0; int num_nodes = 0; int threads_per_core = 0; uint64_t page_size_in_bytes = 0; uint64_t huge_page_size_in_bytes = 0; int min_interval = MIN_INTERVAL; int max_interval = MAX_INTERVAL; int htt_percent = DEFAULT_HTT_PERCENT; int thp_scan_sleep_ms = DEFAULT_THP_SCAN_SLEEP_MS; int target_utilization = DEFAULT_UTILIZATION_PERCENT; int target_memlocality = DEFAULT_MEMLOCALITY_PERCENT; int scan_all_processes = 1; int keep_interleaved_memory = 0; int use_inactive_file_cache = 1; pthread_mutex_t pid_list_mutex; pthread_mutex_t node_info_mutex; long sum_CPUs_total = 0; int requested_mbs = 0; int requested_cpus = 0; int got_sighup = 0; int got_sigterm = 0; int got_sigquit = 0; void sig_handler(int signum) { switch (signum) { case SIGHUP: got_sighup = 1; break; case SIGTERM: got_sigterm = 1; break; case SIGQUIT: got_sigquit = 1; break; } } FILE *log_fs = NULL; int log_level = LOG_NOTICE; void numad_log(int level, const char *fmt, ...) 
{ if (level > log_level) { return; // Logging levels (from sys/syslog.h) // #define LOG_EMERG 0 /* system is unusable */ // #define LOG_ALERT 1 /* action must be taken immediately */ // #define LOG_CRIT 2 /* critical conditions */ // #define LOG_ERR 3 /* error conditions */ // #define LOG_WARNING 4 /* warning conditions */ // #define LOG_NOTICE 5 /* normal but significant condition */ // #define LOG_INFO 6 /* informational */ // #define LOG_DEBUG 7 /* debug-level messages */ } char buf[BUF_SIZE]; time_t ts = time(NULL); strncpy(buf, ctime(&ts), sizeof(buf)); char *p = &buf[strlen(buf) - 1]; *p++ = ':'; *p++ = ' '; va_list ap; va_start(ap, fmt); vsnprintf(p, BUF_SIZE, fmt, ap); va_end(ap); fprintf(log_fs, "%s", buf); fflush(log_fs); } void open_log_file() { log_fs = fopen(VAR_LOG_FILE, "a"); if (log_fs == NULL) { log_fs = stderr; numad_log(LOG_ERR, "Cannot open numad log file (errno: %d) -- using stderr\n", errno); } } void close_log_file() { if (log_fs != NULL) { if (log_fs != stderr) { fclose(log_fs); } log_fs = NULL; } } #define MSG_BODY_TEXT_SIZE 96 typedef struct msg_body { long src_pid; long cmd; long arg1; long arg2; char text[MSG_BODY_TEXT_SIZE]; } msg_body_t, *msg_body_p; typedef struct msg { long dst_pid; // msg mtype is dest PID msg_body_t body; } msg_t, *msg_p; int msg_qid; void flush_msg_queue() { msg_t msg; do { msgrcv(msg_qid, &msg, sizeof(msg_body_t), 0, IPC_NOWAIT); } while (errno != ENOMSG); } void init_msg_queue() { key_t msg_key = 0xdeadbeef; int msg_flg = 0660 | IPC_CREAT; msg_qid = msgget(msg_key, msg_flg); if (msg_qid < 0) { numad_log(LOG_CRIT, "msgget failed\n"); exit(EXIT_FAILURE); } flush_msg_queue(); } void recv_msg(msg_p m) { if (msgrcv(msg_qid, m, sizeof(msg_body_t), getpid(), 0) < 0) { numad_log(LOG_CRIT, "msgrcv failed\n"); exit(EXIT_FAILURE); } // printf("Received: >>%s<< from process %d\n", m->body.text, m->body.src_pid); } void send_msg(long dst_pid, long cmd, long arg1, long arg2, char *s) { msg_t msg; msg.dst_pid = dst_pid; 
msg.body.src_pid = getpid(); msg.body.cmd = cmd; msg.body.arg1 = arg1; msg.body.arg2 = arg2; int s_len = strlen(s); if (s_len >= MSG_BODY_TEXT_SIZE) { numad_log(LOG_CRIT, "msgsnd text too big\n"); exit(EXIT_FAILURE); } strcpy(msg.body.text, s); size_t m_len = sizeof(msg_body_t) - MSG_BODY_TEXT_SIZE + s_len + 1; if (msgsnd(msg_qid, &msg, m_len, IPC_NOWAIT) < 0) { numad_log(LOG_CRIT, "msgsnd failed\n"); exit(EXIT_FAILURE); } // printf("Sent: >>%s<< to process %d\n", msg.body.text, msg.dst_pid); } typedef struct id_list { // Use CPU_SET(3) bitmasks, // but bundle size and pointer together // and genericize for both CPU and Node IDs cpu_set_t *set_p; size_t bytes; } id_list_t, *id_list_p; #define ID_LIST_SET_P(list_p) (list_p->set_p) #define ID_LIST_BYTES(list_p) (list_p->bytes) #define INIT_ID_LIST(list_p, num_elements) \ list_p = malloc(sizeof(id_list_t)); \ if (list_p == NULL) { numad_log(LOG_CRIT, "INIT_ID_LIST malloc failed\n"); exit(EXIT_FAILURE); } \ list_p->set_p = CPU_ALLOC(num_elements); \ if (list_p->set_p == NULL) { numad_log(LOG_CRIT, "CPU_ALLOC failed\n"); exit(EXIT_FAILURE); } \ list_p->bytes = CPU_ALLOC_SIZE(num_elements); #define CLEAR_CPU_LIST(list_p) \ if (list_p == NULL) { \ INIT_ID_LIST(list_p, num_cpus); \ } \ CPU_ZERO_S(list_p->bytes, list_p->set_p) #define CLEAR_NODE_LIST(list_p) \ if (list_p == NULL) { \ INIT_ID_LIST(list_p, num_nodes); \ } \ CPU_ZERO_S(list_p->bytes, list_p->set_p) #define FREE_LIST(list_p) \ if (list_p != NULL) { \ if (list_p->set_p != NULL) { CPU_FREE(list_p->set_p); } \ free(list_p); \ list_p = NULL; \ } #define COPY_LIST(orig_list_p, copy_list_p) \ memcpy(copy_list_p->set_p, orig_list_p->set_p, orig_list_p->bytes) #define NUM_IDS_IN_LIST(list_p) CPU_COUNT_S(list_p->bytes, list_p->set_p) #define ADD_ID_TO_LIST(k, list_p) CPU_SET_S(k, list_p->bytes, list_p->set_p) #define CLR_ID_IN_LIST(k, list_p) CPU_CLR_S(k, list_p->bytes, list_p->set_p) #define ID_IS_IN_LIST(k, list_p) CPU_ISSET_S(k, list_p->bytes, list_p->set_p) #define 
EQUAL_LISTS(list_1_p, list_2_p) CPU_EQUAL_S(list_1_p->bytes, list_1_p->set_p, list_2_p->set_p) #define AND_LISTS(and_list_p, list_1_p, list_2_p) CPU_AND_S(and_list_p->bytes, and_list_p->set_p, list_1_p->set_p, list_2_p->set_p) #define OR_LISTS( or_list_p, list_1_p, list_2_p) CPU_OR_S( or_list_p->bytes, or_list_p->set_p, list_1_p->set_p, list_2_p->set_p) #define XOR_LISTS(xor_list_p, list_1_p, list_2_p) CPU_XOR_S(xor_list_p->bytes, xor_list_p->set_p, list_1_p->set_p, list_2_p->set_p) int negate_cpu_list(id_list_p list_p) { if (list_p == NULL) { numad_log(LOG_CRIT, "Cannot negate a NULL list\n"); exit(EXIT_FAILURE); } if (num_cpus < 1) { numad_log(LOG_CRIT, "No CPUs to negate in list!\n"); exit(EXIT_FAILURE); } for (int ix = 0; (ix < num_cpus); ix++) { if (ID_IS_IN_LIST(ix, list_p)) { CLR_ID_IN_LIST(ix, list_p); } else { ADD_ID_TO_LIST(ix, list_p); } } return NUM_IDS_IN_LIST(list_p); } int add_ids_to_list_from_str(id_list_p list_p, char *s) { if (list_p == NULL) { numad_log(LOG_CRIT, "Cannot add to NULL list\n"); exit(EXIT_FAILURE); } if ((s == NULL) || (strlen(s) == 0)) { goto return_list; } int in_range = 0; int next_id = 0; for (;;) { // skip over non-digits while (!isdigit(*s)) { if ((*s == '\n') || (*s == '\0')) { goto return_list; } if (*s++ == '-') { in_range = 1; } } int id; CONVERT_DIGITS_TO_NUM(s, id); if (!in_range) { next_id = id; } for (; (next_id <= id); next_id++) { ADD_ID_TO_LIST(next_id, list_p); } in_range = 0; } return_list: return NUM_IDS_IN_LIST(list_p); } int str_from_id_list(char *str_p, int str_size, id_list_p list_p) { char *p = str_p; if ((p == NULL) || (str_size < 3)) { numad_log(LOG_CRIT, "Bad string for ID listing\n"); exit(EXIT_FAILURE); } int n; if ((list_p == NULL) || ((n = NUM_IDS_IN_LIST(list_p)) == 0)) { goto terminate_string; } int id_range_start = -1; for (int id = 0; ; id++) { int id_in_list = (ID_IS_IN_LIST(id, list_p) != 0); if ((id_in_list) && (id_range_start < 0)) { id_range_start = id; // beginning an ID range } else if 
((!id_in_list) && (id_range_start >= 0)) { // convert the range that just ended... p += snprintf(p, (str_p + str_size - p - 1), "%d", id_range_start); if (id - id_range_start > 1) { *p++ = '-'; p += snprintf(p, (str_p + str_size - p - 1), "%d", (id - 1)); } *p++ = ','; id_range_start = -1; // no longer in a range if (n <= 0) { break; } // exit only after finishing a range } n -= id_in_list; } p -= 1; // eliminate trailing ',' terminate_string: *p = '\0'; return (p - str_p); } typedef struct node_data { uint64_t node_id; uint64_t MBs_total; uint64_t MBs_free; uint64_t CPUs_total; // scaled * ONE_HUNDRED uint64_t CPUs_free; // scaled * ONE_HUNDRED uint64_t magnitude; // hack: MBs * CPUs uint8_t *distance; id_list_p cpu_list_p; } node_data_t, *node_data_p; node_data_p node = NULL; int min_node_CPUs_free_ix = -1; int min_node_MBs_free_ix = -1; long min_node_CPUs_free = MAXINT; long min_node_MBs_free = MAXINT; long max_node_CPUs_free = 0; long max_node_MBs_free = 0; long avg_node_CPUs_free = 0; long avg_node_MBs_free = 0; double stddev_node_CPUs_free = 0.0; double stddev_node_MBs_free = 0.0; // RING_BUF_SIZE must be a power of two #define RING_BUF_SIZE 8 #define PROCESS_FLAG_INTERLEAVED (1 << 0) typedef struct process_data { int pid; unsigned int flags; uint64_t data_time_stamp; // hundredths of seconds uint64_t bind_time_stamp; uint64_t num_threads; uint64_t MBs_size; uint64_t MBs_used; uint64_t cpu_util; uint64_t CPUs_used; // scaled * ONE_HUNDRED uint64_t CPUs_used_ring_buf[RING_BUF_SIZE]; int ring_buf_ix; char *comm; id_list_p node_list_p; uint64_t *process_MBs; } process_data_t, *process_data_p; // Hash table size must always be a power of two #define MIN_PROCESS_HASH_TABLE_SIZE 16 int process_hash_table_size = 0; int process_hash_collisions = 0; process_data_p process_hash_table = NULL; int process_hash_ix(int pid) { unsigned ix = pid; ix *= 717; ix >>= 8; ix &= (process_hash_table_size - 1); return ix; } int process_hash_lookup(int pid) { int ix = 
process_hash_ix(pid); int starting_ix = ix; while (process_hash_table[ix].pid) { // Assumes table with some blank entries... if (pid == process_hash_table[ix].pid) { return ix; // found it } ix += 1; ix &= (process_hash_table_size - 1); if (ix == starting_ix) { // Table full and pid not found. // This "should never happen"... break; } } return -1; } int process_hash_insert(int pid) { // This reserves the hash table slot, but initializes only the pid field int ix = process_hash_ix(pid); int starting_ix = ix; while (process_hash_table[ix].pid) { if (pid == process_hash_table[ix].pid) { return ix; // found it } process_hash_collisions += 1; ix += 1; ix &= (process_hash_table_size - 1); if (ix == starting_ix) { // This "should never happen"... numad_log(LOG_ERR, "Process hash table is full\n"); return -1; } } process_hash_table[ix].pid = pid; return ix; } int process_hash_update(process_data_p newp) { // This updates hash table stats for processes we are monitoring. Only the // scalar resource consumption stats need to be updated here. 
int new_hash_table_entry = 1; int ix = process_hash_insert(newp->pid); if (ix >= 0) { process_data_p p = &process_hash_table[ix]; if (p->data_time_stamp) { new_hash_table_entry = 0; p->ring_buf_ix += 1; p->ring_buf_ix &= (RING_BUF_SIZE - 1); uint64_t cpu_util_diff = newp->cpu_util - p->cpu_util; uint64_t time_diff = newp->data_time_stamp - p->data_time_stamp; p->CPUs_used_ring_buf[p->ring_buf_ix] = 100 * (cpu_util_diff) / time_diff; // Use largest CPU utilization currently in ring buffer uint64_t max_CPUs_used = p->CPUs_used_ring_buf[0]; for (int ix = 1; (ix < RING_BUF_SIZE); ix++) { if (max_CPUs_used < p->CPUs_used_ring_buf[ix]) { max_CPUs_used = p->CPUs_used_ring_buf[ix]; } } p->CPUs_used = max_CPUs_used; } if ((!p->comm) || (strcmp(p->comm, newp->comm))) { if (p->comm) { free(p->comm); } p->comm = strdup(newp->comm); } p->MBs_size = newp->MBs_size; p->MBs_used = newp->MBs_used; p->cpu_util = newp->cpu_util; p->num_threads = newp->num_threads; p->data_time_stamp = newp->data_time_stamp; } return new_hash_table_entry; } void process_hash_clear_all_bind_time_stamps() { for (int ix = 0; (ix < process_hash_table_size); ix++) { process_hash_table[ix].bind_time_stamp = 0; } } int process_hash_rehash(int old_ix) { // Given the index of a table entry that would otherwise be orphaned by // process_hash_remove(), reinsert into table using PID from existing record. 
process_data_p op = &process_hash_table[old_ix]; int new_ix = process_hash_insert(op->pid); if (new_ix >= 0) { // Copy old slot to new slot, and zero old slot process_data_p np = &process_hash_table[new_ix]; memcpy(np, op, sizeof(process_data_t)); memset(op, 0, sizeof(process_data_t)); } return new_ix; } int process_hash_remove(int pid) { int ix = process_hash_lookup(pid); if (ix >= 0) { // remove the target process_data_p dp = &process_hash_table[ix]; if (dp->comm) { free(dp->comm); } if (dp->process_MBs) { free(dp->process_MBs); } FREE_LIST(dp->node_list_p); memset(dp, 0, sizeof(process_data_t)); // bubble up the collision chain and rehash if neeeded for (;;) { ix += 1; ix &= (process_hash_table_size - 1); if ((pid = process_hash_table[ix].pid) <= 0) { break; } if (process_hash_lookup(pid) < 0) { if (process_hash_rehash(ix) < 0) { numad_log(LOG_ERR, "rehash fail\n"); } } } } return ix; } void process_hash_table_expand() { // Save old table size and address int old_size = process_hash_table_size; process_data_p old_table = process_hash_table; // Double size of table and allocate new space if (old_size > 0) { process_hash_table_size *= 2; } else { process_hash_table_size = MIN_PROCESS_HASH_TABLE_SIZE; } numad_log(LOG_DEBUG, "Expanding hash table size: %d\n", process_hash_table_size); process_hash_table = malloc(process_hash_table_size * sizeof(process_data_t)); if (process_hash_table == NULL) { numad_log(LOG_CRIT, "hash table malloc failed\n"); exit(EXIT_FAILURE); } // Clear the new table, and copy valid entries from old table memset(process_hash_table, 0, process_hash_table_size * sizeof(process_data_t)); for (int ix = 0; (ix < old_size); ix++) { process_data_p p = &old_table[ix]; if (p->pid) { int new_table_ix = process_hash_insert(p->pid); memcpy(&process_hash_table[new_table_ix], p, sizeof(process_data_t)); } } if (old_table != NULL) { free(old_table); } } void process_hash_table_dump() { for (int ix = 0; (ix < process_hash_table_size); ix++) { process_data_p p 
= &process_hash_table[ix]; if (p->pid) { numad_log(LOG_DEBUG, "ix: %d PID: %d %s Thds: %d CPU %ld MBs: %ld/%ld Data TS: %ld Bind TS: %ld\n", ix, p->pid, ((p->comm != NULL) ? p->comm : "(Null)"), p->num_threads, p->CPUs_used, p->MBs_used, p->MBs_size, p->data_time_stamp, p->bind_time_stamp); // FIXME: make this dump every field, but this is not even currently used } } } void process_hash_table_cleanup(uint64_t update_time) { int num_hash_entries_used = 0; for (int ix = 0; (ix < process_hash_table_size); ix++) { process_data_p p = &process_hash_table[ix]; if (p->pid) { num_hash_entries_used += 1; if (p->data_time_stamp < update_time) { // Mark as old, and zero CPU utilization p->data_time_stamp = 0; p->CPUs_used = 0; // Check for dead pids and remove them... if ((kill(p->pid, 0) == -1) && (errno == ESRCH)) { // Seems dead. Forget this pid process_hash_remove(p->pid); num_hash_entries_used -= 1; } } } } // Keep hash table approximately half empty if ((num_hash_entries_used * 7) / 4 > process_hash_table_size) { process_hash_table_expand(); } } typedef struct pid_list { long pid; struct pid_list* next; } pid_list_t, *pid_list_p; pid_list_p include_pid_list = NULL; pid_list_p exclude_pid_list = NULL; pid_list_p insert_pid_into_pid_list(pid_list_p list_ptr, long pid) { if (process_hash_table != NULL) { int hash_ix = process_hash_lookup(pid); if ((hash_ix >= 0) && (list_ptr == include_pid_list)) { // Clear interleaved flag, in case user wants it to be re-evaluated process_hash_table[hash_ix].flags &= ~PROCESS_FLAG_INTERLEAVED; } } // Check for duplicate pid first pid_list_p pid_ptr = list_ptr; while (pid_ptr != NULL) { if (pid_ptr->pid == pid) { // pid already in list return list_ptr; } pid_ptr = pid_ptr->next; } // pid not yet in list -- insert new node pid_ptr = malloc(sizeof(pid_list_t)); if (pid_ptr == NULL) { numad_log(LOG_CRIT, "pid_list malloc failed\n"); exit(EXIT_FAILURE); } pid_ptr->pid = pid; pid_ptr->next = list_ptr; list_ptr = pid_ptr; return list_ptr; } 
pid_list_p remove_pid_from_pid_list(pid_list_p list_ptr, long pid) { pid_list_p last_pid_ptr = NULL; pid_list_p pid_ptr = list_ptr; while (pid_ptr != NULL) { if (pid_ptr->pid == pid) { if (pid_ptr == list_ptr) { list_ptr = list_ptr->next; free(pid_ptr); pid_ptr = list_ptr; continue; } else { last_pid_ptr->next = pid_ptr->next; free(pid_ptr); pid_ptr = last_pid_ptr; } } last_pid_ptr = pid_ptr; pid_ptr = pid_ptr->next; } return list_ptr; } void shut_down_numad() { numad_log(LOG_NOTICE, "Shutting down numad\n"); flush_msg_queue(); unlink(VAR_RUN_FILE); close_log_file(); exit(EXIT_SUCCESS); } void print_version_and_exit(char *prog_name) { fprintf(stdout, "%s version: %s: compiled %s\n", prog_name, VERSION_STRING, __DATE__); exit(EXIT_SUCCESS); } void print_usage_and_exit(char *prog_name) { fprintf(stderr, "Usage: %s ...\n", prog_name); fprintf(stderr, "-C 1 to count inactive file cache as available memory (default 1)\n"); fprintf(stderr, "-C 0 to count inactive file cache memory as unavailable (default 1)\n"); fprintf(stderr, "-d for debug logging (same effect as '-l 7')\n"); fprintf(stderr, "-h to print this usage info\n"); fprintf(stderr, "-H to set THP scan_sleep_ms (default %d)\n", DEFAULT_THP_SCAN_SLEEP_MS); fprintf(stderr, "-i [:] to specify interval seconds\n"); fprintf(stderr, "-K 1 to keep interleaved memory spread across nodes (default 0)\n"); fprintf(stderr, "-K 0 to merge interleaved memory to local NUMA nodes (default 0)\n"); fprintf(stderr, "-l to specify logging level (usually 5, 6, or 7 -- default 5)\n"); fprintf(stderr, "-m to specify memory locality target percent (default %d)\n", DEFAULT_MEMLOCALITY_PERCENT); fprintf(stderr, "-p to add PID to inclusion pid list\n"); fprintf(stderr, "-r to remove PID from explicit pid lists\n"); fprintf(stderr, "-R to reserve some CPUs for non-numad use\n"); fprintf(stderr, "-S 1 to scan all processes (default 1)\n"); fprintf(stderr, "-S 0 to scan only explicit PID list processes (default 1)\n"); fprintf(stderr, "-t 
to specify thread / logical CPU valuation percent (default %d)\n", DEFAULT_HTT_PERCENT); fprintf(stderr, "-u to specify utilization target percent (default %d)\n", DEFAULT_UTILIZATION_PERCENT); fprintf(stderr, "-v for verbose (same effect as '-l 6')\n"); fprintf(stderr, "-V to show version info\n"); fprintf(stderr, "-w [:] for NUMA node suggestions\n"); fprintf(stderr, "-x to add PID to exclusion pid list\n"); exit(EXIT_FAILURE); } void set_thp_scan_sleep_ms(int new_ms) { if (new_ms < 1) { // 0 means do not change the system default return; } char *thp_scan_fname = "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs"; int fd = open(thp_scan_fname, O_RDWR, 0); if (fd >= 0) { char buf[BUF_SIZE]; int bytes = read(fd, buf, BUF_SIZE); if (bytes > 0) { buf[bytes] = '\0'; int cur_ms; char *p = buf; CONVERT_DIGITS_TO_NUM(p, cur_ms); if (cur_ms != new_ms) { lseek(fd, 0, SEEK_SET); numad_log(LOG_NOTICE, "Changing THP scan time in %s from %d to %d ms.\n", thp_scan_fname, cur_ms, new_ms); sprintf(buf, "%d\n", new_ms); write(fd, buf, strlen(buf)); } } close(fd); } } void check_prereqs(char *prog_name) { // Adjust kernel tunable to scan for THP more frequently... set_thp_scan_sleep_ms(thp_scan_sleep_ms); } int get_daemon_pid() { int fd = open(VAR_RUN_FILE, O_RDONLY, 0); if (fd < 0) { return 0; } char buf[BUF_SIZE]; int bytes = read(fd, buf, BUF_SIZE); close(fd); if (bytes <= 0) { return 0; } int pid; char *p = buf; CONVERT_DIGITS_TO_NUM(p, pid); // Check run file pid still active char fname[FNAME_SIZE]; snprintf(fname, FNAME_SIZE, "/proc/%d", pid); if (access(fname, F_OK) < 0) { if (errno == ENOENT) { numad_log(LOG_NOTICE, "Removing out-of-date numad run file because %s doesn't exist\n", fname); unlink(VAR_RUN_FILE); } return 0; } // Daemon must be running already. 
return pid; }

// Record this process's pid in the run file, creating it exclusively.  If the
// file already exists and names a live process, return that pid instead (the
// daemon is already running); if it names a dead process, remove the stale
// file and retry.  Any other failure is fatal.
int register_numad_pid() {
    int pid;
    char buf[BUF_SIZE];
    int fd;
create_run_file:
    // O_EXCL guarantees only one numad instance wins the race to create it.
    fd = open(VAR_RUN_FILE, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
    if (fd >= 0) {
        pid = getpid();
        sprintf(buf, "%d\n", pid);
        write(fd, buf, strlen(buf));
        close(fd);
        numad_log(LOG_NOTICE, "Registering numad version %s PID %d\n", VERSION_STRING, pid);
        return pid;
    }
    if (errno == EEXIST) {
        // Run file already present: read the recorded pid and check liveness.
        fd = open(VAR_RUN_FILE, O_RDWR|O_CREAT, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
        if (fd < 0) {
            goto fail_numad_run_file;
        }
        int bytes = read(fd, buf, BUF_SIZE);
        close(fd);
        if (bytes > 0) {
            char *p = buf;
            CONVERT_DIGITS_TO_NUM(p, pid);
            // Check pid in run file still active
            char fname[FNAME_SIZE];
            snprintf(fname, FNAME_SIZE, "/proc/%d", pid);
            if (access(fname, F_OK) < 0) {
                if (errno == ENOENT) {
                    // Assume run file is out-of-date...
                    numad_log(LOG_NOTICE, "Removing out-of-date numad run file because %s doesn't exist\n", fname);
                    unlink(VAR_RUN_FILE);
                    goto create_run_file;
                }
            }
            // Daemon must be running already.
            return pid;
        }
    }
fail_numad_run_file:
    numad_log(LOG_CRIT, "Cannot open numad.pid file\n");
    exit(EXIT_FAILURE);
}

// Count the set bits in a sysfs hex bitmask file (comma-separated hex words,
// e.g. cpu topology thread_siblings).  Used to derive threads-per-core.
// Returns 0 if the file cannot be opened.
int count_set_bits_in_hex_list_file(char *fname) {
    int sum = 0;
    int fd = open(fname, O_RDONLY, 0);
    if (fd >= 0) {
        char buf[BUF_SIZE];
        int bytes = read(fd, buf, BUF_SIZE);
        close(fd);
        for (int ix = 0; (ix < bytes); ix++) {
            // Per-hex-digit popcount; separators and newlines contribute 0.
            char c = tolower(buf[ix]);
            switch (c) {
            case '0' : sum += 0; break;
            case '1' : sum += 1; break;
            case '2' : sum += 1; break;
            case '3' : sum += 2; break;
            case '4' : sum += 1; break;
            case '5' : sum += 2; break;
            case '6' : sum += 2; break;
            case '7' : sum += 3; break;
            case '8' : sum += 1; break;
            case '9' : sum += 2; break;
            case 'a' : sum += 2; break;
            case 'b' : sum += 3; break;
            case 'c' : sum += 2; break;
            case 'd' : sum += 3; break;
            case 'e' : sum += 3; break;
            case 'f' : sum += 4; break;
            case ' ' : sum += 0; break;
            case ',' : sum += 0; break;
            case '\n' : sum += 0; break;
            default : numad_log(LOG_CRIT, "Unexpected character in list\n"); exit(EXIT_FAILURE);
            }
        }
    }
return sum; }
// NOTE(review): the brace balance around count_set_bits_in_hex_list_file looks
// off in this extraction (the "if (fd >= 0)" block does not appear closed
// before "return sum") -- verify against the upstream numad.c.

// Return the number of processors: the larger of the configured and currently
// online counts.  Fatal if sysconf() cannot report either.
int get_num_cpus() {
    int n1 = sysconf(_SC_NPROCESSORS_CONF);
    int n2 = sysconf(_SC_NPROCESSORS_ONLN);
    if (n1 < n2) { n1 = n2; }
    if (n1 < 0) {
        numad_log(LOG_CRIT, "Cannot count number of processors\n");
        exit(EXIT_FAILURE);
    }
    return n1;
}

int get_num_kvm_vcpu_threads(int pid) {
    // Try to return the number of vCPU threads for this VM guest,
    // excluding the IO threads. All failures return MAXINT.
    // FIXME: someday figure out some better way to do this...
    // Heuristic: scan /proc/<pid>/cmdline for an "smp" token (qemu's -smp
    // option) and parse the digits that follow it.
    char fname[FNAME_SIZE];
    snprintf(fname, FNAME_SIZE, "/proc/%d/cmdline", pid);
    int fd = open(fname, O_RDONLY, 0);
    if (fd >= 0) {
        char buf[BUF_SIZE];
        int bytes = read(fd, buf, BUF_SIZE);
        close(fd);
        if (bytes > 0) {
            char *p = memmem(buf, bytes, "smp", 3);
            if (p != NULL) {
                // Advance to the first digit after "smp", staying in bounds.
                while (!isdigit(*p) && (p - buf < bytes - 2)) { p++; }
                if (isdigit(*p)) {
                    int vcpu_threads;
                    CONVERT_DIGITS_TO_NUM(p, vcpu_threads);
                    // Sanity-check the parsed count before trusting it.
                    if ((vcpu_threads > 0) && (vcpu_threads <= num_cpus)) {
                        return vcpu_threads;
                    }
                }
            }
        }
    }
    return MAXINT;
}

// Parse "Hugepagesize" from /proc/meminfo (reported in KB) and return it in
// bytes.  Returns 0 if the line is not found.
uint64_t get_huge_page_size_in_bytes() {
    uint64_t huge_page_size = 0;;   // NOTE(review): stray second ';' kept from original
    FILE *fs = fopen("/proc/meminfo", "r");
    if (!fs) {
        numad_log(LOG_CRIT, "Can't open /proc/meminfo\n");
        exit(EXIT_FAILURE);
    }
    char buf[BUF_SIZE];
    while (fgets(buf, BUF_SIZE, fs)) {
        if (!strncmp("Hugepagesize", buf, 12)) {
            char *p = &buf[12];
            while ((!isdigit(*p)) && (p < buf + BUF_SIZE)) { p++; }
            huge_page_size = atol(p);
            break;
        }
    }
    fclose(fs);
    return huge_page_size * KILOBYTE;   // /proc value is in KB
}

uint64_t get_time_stamp() {
    // Return time stamp in hundredths of a second
    // (CLOCK_MONOTONIC, so unaffected by wall-clock changes).
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) {
        numad_log(LOG_CRIT, "Cannot get clock_gettime()\n");
        exit(EXIT_FAILURE);
    }
    return (ts.tv_sec * ONE_HUNDRED) + (ts.tv_nsec / (1000000000 / ONE_HUNDRED));
}

// scandir() filter: accept directory entries whose names begin with a digit
// (used to pick pid / tid directories out of /proc).
static int name_starts_with_digit(const struct dirent *dptr) {
    return (isdigit(dptr->d_name[0]));
}

// Bit manipulation over arrays of unsigned long, used for the node masks
// passed to migrate_pages(2).
// NOTE(review): these shift the int-width constant 1u by up to
// BITS_IN_LONG-1; on LP64 (64-bit unsigned long) a shift count >= 32 is
// undefined -- upstream likely intends 1ul here.  Verify.
#define BITS_IN_LONG (CHAR_BIT * sizeof(unsigned long))
#define SET_BIT(i,a) (a)[(i) / BITS_IN_LONG] |= (1u << ((i) % BITS_IN_LONG))
#define TEST_BIT(i,a) (((a)[(i) / BITS_IN_LONG] & (1u << ((i) % BITS_IN_LONG))) != 0)
#define CLEAR_BIT(i,a) (a)[(i) / BITS_IN_LONG] &= ~(1u << ((i) % BITS_IN_LONG))

// Bind every task (thread) of process p to the CPUs of p->node_list_p, then
// migrate the process's memory onto those nodes.  Returns 1 on success, 0 if
// the process appears to have terminated along the way.
int bind_process_and_migrate_memory(process_data_p p) {
    uint64_t t0 = get_time_stamp();
    // Parameter p is a pointer to an element in the hash table
    if ((!p) || (p->pid < 1)) {
        numad_log(LOG_CRIT, "Bad PID to bind\n");
        exit(EXIT_FAILURE);
    }
    if (!p->node_list_p) {
        numad_log(LOG_CRIT, "Cannot bind to unspecified node(s)\n");
        exit(EXIT_FAILURE);
    }
    // Generate CPU list derived from target node list.
    static id_list_p cpu_bind_list_p;
    CLEAR_CPU_LIST(cpu_bind_list_p);
    int nodes = NUM_IDS_IN_LIST(p->node_list_p);
    int node_id = 0;
    while (nodes) {
        // OR in the CPU list of each node present in the target node list.
        if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
            OR_LISTS(cpu_bind_list_p, cpu_bind_list_p, node[node_id].cpu_list_p);
            nodes -= 1;
        }
        node_id += 1;
    }
    char fname[FNAME_SIZE];
    struct dirent **namelist;
    snprintf(fname, FNAME_SIZE, "/proc/%d/task", p->pid);
    int num_tasks = scandir(fname, &namelist, name_starts_with_digit, NULL);
    if (num_tasks <= 0) {
        numad_log(LOG_WARNING, "Could not scandir task list for PID: %d\n", p->pid);
        return 0; // Assume the process terminated
    }
    // Set the affinity of each task in the process...
    for (int namelist_ix = 0; (namelist_ix < num_tasks); namelist_ix++) {
        int tid = atoi(namelist[namelist_ix]->d_name);
        int rc = sched_setaffinity(tid, ID_LIST_BYTES(cpu_bind_list_p), ID_LIST_SET_P(cpu_bind_list_p));
        if (rc < 0) {
            // Check errno
            if (errno == ESRCH) {
                numad_log(LOG_WARNING, "Tried to move PID %d, TID %d, but it apparently went away.\n", p->pid, tid);
            }
            numad_log(LOG_ERR, "Bad sched_setaffinity() on PID %d, TID %d -- errno: %d\n", p->pid, tid, errno);
        }
        free(namelist[namelist_ix]);
    }
    free(namelist);
    // Now move the memory to the target nodes....
    // Static masks are grown on demand and deliberately never freed.
    static unsigned long *dest_mask;
    static unsigned long *from_mask;
    static int allocated_bytes_in_masks;
    // Lie about num_nodes being one bigger because of kernel bug...
int num_bytes_in_masks = (1 + ((num_nodes + 1) / BITS_IN_LONG)) * sizeof(unsigned long);
    if (allocated_bytes_in_masks < num_bytes_in_masks) {
        // Grow the (static) node bit masks to cover all nodes.
        allocated_bytes_in_masks = num_bytes_in_masks;
        dest_mask = realloc(dest_mask, num_bytes_in_masks);
        from_mask = realloc(from_mask, num_bytes_in_masks);
        if ((dest_mask == NULL) || (from_mask == NULL)) {
            numad_log(LOG_CRIT, "bit mask malloc failed\n");
            exit(EXIT_FAILURE);
        }
    }
    // In an effort to put semi-balanced memory in each target node, move the
    // contents from the source node with the max amount of memory to the
    // destination node with the least amount of memory. Repeat until done.
    int prev_from_node_id = -1;
    for (;;) {
        // One pass: find the emptiest target node and the fullest non-target node.
        int min_dest_node_id = -1;
        int max_from_node_id = -1;
        for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
            node_id = node[node_ix].node_id;
            if (ID_IS_IN_LIST(node_id, p->node_list_p)) {
                if ((min_dest_node_id < 0) || (p->process_MBs[min_dest_node_id] >= p->process_MBs[node_id])) {
                    // The ">=" above is intentional, so we tend to move memory to higher numbered nodes
                    min_dest_node_id = node_id;
                }
            } else {
                if ((max_from_node_id < 0) || (p->process_MBs[max_from_node_id] < p->process_MBs[node_id])) {
                    max_from_node_id = node_id;
                }
            }
        }
        // Stop when no off-target memory remains, or we would repeat the same
        // source node (no progress).
        if ((p->process_MBs[max_from_node_id] == 0) || (max_from_node_id == prev_from_node_id)) {
            break;
        }
        memset(dest_mask, 0, num_bytes_in_masks);
        memset(from_mask, 0, num_bytes_in_masks);
        SET_BIT(max_from_node_id, from_mask);
        SET_BIT(min_dest_node_id, dest_mask);
        numad_log(LOG_DEBUG, "Moving memory from node: %d to node %d\n", max_from_node_id, min_dest_node_id);
        // Lie about num_nodes being one bigger because of kernel bug...
        int rc = syscall(__NR_migrate_pages, p->pid, num_nodes + 1, from_mask, dest_mask);
        if (rc > 2) {
            // rc == the number of pages that could not be moved.
            // A couple pages not moving is probably not a problem, hence ignoring rc == 1 or 2.
            numad_log(LOG_WARNING, "Tried to move PID %d, but %d pages would not move.\n", p->pid, rc);
        } else if (rc < 0) {
            // Check errno
            if (errno == ESRCH) {
                numad_log(LOG_WARNING, "Tried to move PID %d, but it apparently went away.\n", p->pid);
                return 0; // Assume the process terminated
            }
        }
        // Assume memory did move for current accounting purposes...
        p->process_MBs[min_dest_node_id] += p->process_MBs[max_from_node_id];
        p->process_MBs[max_from_node_id] = 0;
        prev_from_node_id = max_from_node_id;
    }
    // Check pid still active
    snprintf(fname, FNAME_SIZE, "/proc/%d", p->pid);
    if (access(fname, F_OK) < 0) {
        numad_log(LOG_WARNING, "Could not migrate pid %d. Apparently it went away.\n", p->pid);
        return 0;
    } else {
        // Success: record when the binding happened and log elapsed time
        // (time stamps are in hundredths of a second).
        uint64_t t1 = get_time_stamp();
        p->bind_time_stamp = t1;
        char node_list_str[BUF_SIZE];
        str_from_id_list(node_list_str, BUF_SIZE, p->node_list_p);
        numad_log(LOG_NOTICE, "PID %d moved to node(s) %s in %d.%d seconds\n", p->pid, node_list_str, (t1-t0)/100, (t1-t0)%100);
        return 1;
    }
}

// One snapshot of per-CPU idle tick counters plus the time it was taken.
typedef struct cpu_data {
    uint64_t time_stamp;   // when this sample was taken (hundredths of a second)
    uint64_t *idle;        // per-cpu idle tick counts, indexed by cpu id
} cpu_data_t, *cpu_data_p;

cpu_data_t cpu_data_buf[2]; // Two sets, to calc deltas
int cur_cpu_data_buf = 0;   // index of the most recent sample in cpu_data_buf

// Sample per-CPU idle tick counts from /proc/stat into the non-current
// cpu_data_buf, then flip cur_cpu_data_buf.  The /proc/stat stream and the
// idle arrays are lazily initialized on first call.
void update_cpu_data() {
    // Parse idle percents from CPU stats in /proc/stat cpu lines
    static FILE *fs;
    if (fs != NULL) {
        rewind(fs);
    } else {
        fs = fopen("/proc/stat", "r");
        if (!fs) {
            numad_log(LOG_CRIT, "Cannot get /proc/stat contents\n");
            exit(EXIT_FAILURE);
        }
        cpu_data_buf[0].idle = malloc(num_cpus * sizeof(uint64_t));
        cpu_data_buf[1].idle = malloc(num_cpus * sizeof(uint64_t));
        if ((cpu_data_buf[0].idle == NULL) || (cpu_data_buf[1].idle == NULL)) {
            numad_log(LOG_CRIT, "cpu_data_buf malloc failed\n");
            exit(EXIT_FAILURE);
        }
    }
    // Use the other cpu_data buffer...
int new = 1 - cur_cpu_data_buf;
    // First get the current time stamp
    cpu_data_buf[new].time_stamp = get_time_stamp();
    // Now pull the idle stat from each cpu line
    char buf[BUF_SIZE];
    while (fgets(buf, BUF_SIZE, fs)) {
        /*
         * Lines are of the form:
         *
         * cpu user nice system idle iowait irq softirq steal guest guest_nice
         *
         * # cat /proc/stat
         * cpu 11105906 0 78639 3359578423 24607 151679 322319 0 0 0
         * cpu0 190540 0 1071 52232942 39 7538 234039 0 0 0
         * cpu1 124519 0 50 52545188 0 1443 6267 0 0 0
         * cpu2 143133 0 452 52531440 36 1573 834 0 0 0
         * . . . .
         */
        // Only per-cpu lines ("cpuN ...") are wanted; the aggregate "cpu "
        // line fails the isdigit(buf[3]) test and is skipped.
        if ( (buf[0] == 'c') && (buf[1] == 'p') && (buf[2] == 'u') && (isdigit(buf[3])) ) {
            char *p = &buf[3];
            int cpu_id = *p++ - '0';
            while (isdigit(*p)) { cpu_id *= 10; cpu_id += (*p++ - '0'); }
            // Walk past the first three numeric fields to reach "idle".
            while (!isdigit(*p)) { p++; }
            while (isdigit(*p)) { p++; } // skip user
            while (!isdigit(*p)) { p++; }
            while (isdigit(*p)) { p++; } // skip nice
            while (!isdigit(*p)) { p++; }
            while (isdigit(*p)) { p++; } // skip system
            while (!isdigit(*p)) { p++; }
            uint64_t idle;
            CONVERT_DIGITS_TO_NUM(p, idle);
            cpu_data_buf[new].idle[cpu_id] = idle;
        }
    }
    cur_cpu_data_buf = new;
}

// scandir() filter: accept only names of the exact form "node<digits>"
// (the per-node directories under /sys/devices/system/node).
int node_and_digits(const struct dirent *dptr) {
    char *p = (char *)(dptr->d_name);
    if (*p++ != 'n') return 0;
    if (*p++ != 'o') return 0;
    if (*p++ != 'd') return 0;
    if (*p++ != 'e') return 0;
    do {
        if (!isdigit(*p++)) return 0;
    } while (*p != '\0');
    return 1;
}

// Node-topology state shared by update_nodes() and friends.
uint64_t node_info_time_stamp = 0;          // last time static node info was refreshed
id_list_p all_cpus_list_p = NULL;           // union of all node CPU lists
id_list_p all_nodes_list_p = NULL;          // all node ids with nonzero memory
id_list_p reserved_cpu_mask_list_p = NULL;  // mask of CPUs numad may use (-R option)
char *reserved_cpu_str = NULL;              // raw -R option string, NULL if unset

// Dump the current per-node resource table (totals, free amounts, SLIT
// distances, CPU lists) to the log file.
void show_nodes() {
    fprintf(log_fs, "\n");
    numad_log(LOG_INFO, "Nodes: %d\n", num_nodes);
    fprintf(log_fs, "Min CPUs free: %ld, Max CPUs: %ld, Avg CPUs: %ld, StdDev: %lg\n", min_node_CPUs_free, max_node_CPUs_free, avg_node_CPUs_free, stddev_node_CPUs_free);
    fprintf(log_fs, "Min MBs free: %ld, Max MBs: %ld, Avg MBs: %ld, StdDev: %lg\n", min_node_MBs_free, max_node_MBs_free, avg_node_MBs_free, stddev_node_MBs_free);
    for (int ix = 0; (ix < num_nodes); 
ix++) {
        fprintf(log_fs, "Node %d: MBs_total %ld, MBs_free %6ld, CPUs_total %ld, CPUs_free %4ld, Distance: ", ix, node[ix].MBs_total, node[ix].MBs_free, node[ix].CPUs_total, node[ix].CPUs_free);
        for (int d = 0; (d < num_nodes); d++) {
            fprintf(log_fs, "%d ", node[ix].distance[d]);
        }
        char buf[BUF_SIZE];
        str_from_id_list(buf, BUF_SIZE, node[ix].cpu_list_p);
        fprintf(log_fs, " CPUs: %s\n", buf);
    }
    fflush(log_fs);
}

// Refresh the global node[] table: (re)discover static topology (node count,
// CPU lists, SLIT distances) at most every STATIC_NODE_INFO_DELAY, then
// always refresh the dynamic numbers (free memory, free CPU capacity) and
// the derived min/max/avg/stddev globals.  Returns the node count.
int update_nodes() {
    char fname[FNAME_SIZE];
    char buf[BIG_BUF_SIZE];
    // First, check to see if we should refresh basic node info that probably never changes...
    uint64_t time_stamp = get_time_stamp();
#define STATIC_NODE_INFO_DELAY (600 * ONE_HUNDRED)
    if ((num_nodes == 0) || (node_info_time_stamp + STATIC_NODE_INFO_DELAY < time_stamp)) {
        node_info_time_stamp = time_stamp;
        // Count directory names of the form: /sys/devices/system/node/node
        struct dirent **namelist;
        int num_files = scandir ("/sys/devices/system/node", &namelist, node_and_digits, NULL);
        if (num_files < 1) {
            numad_log(LOG_CRIT, "Could not get NUMA node info\n");
            exit(EXIT_FAILURE);
        }
        int need_to_realloc = (num_files != num_nodes);
        if (need_to_realloc) {
            for (int ix = num_files; (ix < num_nodes); ix++) {
                // If new < old, free old node_data pointers
                free(node[ix].distance);
                FREE_LIST(node[ix].cpu_list_p);
            }
            node = realloc(node, (num_files * sizeof(node_data_t)));
            if (node == NULL) {
                numad_log(LOG_CRIT, "node realloc failed\n");
                exit(EXIT_FAILURE);
            }
            for (int ix = num_nodes; (ix < num_files); ix++) {
                // If new > old, nullify new node_data pointers
                node[ix].distance = NULL;
                node[ix].cpu_list_p = NULL;
            }
            num_nodes = num_files;
        }
        sum_CPUs_total = 0;
        CLEAR_CPU_LIST(all_cpus_list_p);
        CLEAR_NODE_LIST(all_nodes_list_p);
        // Figure out how many threads per core there are (for later discounting of hyper-threads)
        threads_per_core = count_set_bits_in_hex_list_file("/sys/devices/system/cpu/cpu0/topology/thread_siblings");
        if (threads_per_core < 1) {
            numad_log(LOG_CRIT, "Could not count threads per core\n");
            exit(EXIT_FAILURE);
        }
        // For each "node" filename present, save in node[ix].node_id
        // Note that the node id might not necessarily match the node ix.
        // Also populate the cpu lists and distance vectors for this node.
        for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
            int node_id;
            char *p = &namelist[node_ix]->d_name[4];   // skip the "node" prefix
            CONVERT_DIGITS_TO_NUM(p, node_id);
            free(namelist[node_ix]);
            node[node_ix].node_id = node_id;
            ADD_ID_TO_LIST(node_id, all_nodes_list_p);
            // Get all the CPU IDs in this node... Read lines from node/cpulist
            // file, and set the corresponding bits in the node cpu list.
            snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/cpulist", node_id);
            int fd = open(fname, O_RDONLY, 0);
            if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
                buf[BIG_BUF_SIZE - 1] = '\0';
                // get cpulist from the cpulist string
                CLEAR_CPU_LIST(node[node_ix].cpu_list_p);
                int n = add_ids_to_list_from_str(node[node_ix].cpu_list_p, buf);
                if (reserved_cpu_str != NULL) {
                    // Restrict to the CPUs numad is allowed to use (-R option).
                    AND_LISTS(node[node_ix].cpu_list_p, node[node_ix].cpu_list_p, reserved_cpu_mask_list_p);
                    n = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
                }
                OR_LISTS(all_cpus_list_p, all_cpus_list_p, node[node_ix].cpu_list_p);
                // Calculate total CPUs, but possibly discount hyper-threads
                // (CPU capacity is expressed in hundredths of a CPU).
                if ((threads_per_core == 1) || (htt_percent >= 100)) {
                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
                } else {
                    n /= threads_per_core;
                    node[node_ix].CPUs_total = n * ONE_HUNDRED;
                    node[node_ix].CPUs_total += n * (threads_per_core - 1) * htt_percent;
                }
                sum_CPUs_total += node[node_ix].CPUs_total;
                close(fd);
            } else {
                numad_log(LOG_CRIT, "Could not get node cpu list\n");
                exit(EXIT_FAILURE);
            }
            // Get distance vector of ACPI SLIT data from node/distance file
            if (need_to_realloc) {
                node[node_ix].distance = realloc(node[node_ix].distance, (num_nodes * sizeof(uint8_t)));
                if (node[node_ix].distance == NULL) {
                    numad_log(LOG_CRIT, "node distance realloc failed\n");
                    exit(EXIT_FAILURE);
                }
            }
            snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/distance", node_id);
            fd = open(fname, O_RDONLY, 0);
            if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
                // Parse the space-separated latency row for this node.
                int rnode = 0;
                for (char *p = buf; (*p != '\n'); ) {
                    int lat;
                    CONVERT_DIGITS_TO_NUM(p, lat);
                    node[node_ix].distance[rnode++] = lat;
                    while (*p == ' ') { p++; }
                }
                close(fd);
            } else {
                numad_log(LOG_CRIT, "Could not get node distance data\n");
                exit(EXIT_FAILURE);
            }
        }
        free(namelist);
    }
    // Second, update the dynamic free memory and available CPU capacity
    while (cpu_data_buf[cur_cpu_data_buf].time_stamp + 7 >= time_stamp) {
        // Make sure at least 7/100 of a second has passed.
        // Otherwise sleep for 1/10 second.
        struct timespec ts = { 0, 100000000 };
        nanosleep(&ts, &ts);
        time_stamp = get_time_stamp();
    }
    update_cpu_data();
    max_node_MBs_free = 0;
    max_node_CPUs_free = 0;
    min_node_MBs_free = MAXINT;
    min_node_CPUs_free = MAXINT;
    uint64_t sum_of_node_MBs_free = 0;
    uint64_t sum_of_node_CPUs_free = 0;
    for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
        int node_id = node[node_ix].node_id;
        // Get available memory info from node/meminfo file
        snprintf(fname, FNAME_SIZE, "/sys/devices/system/node/node%d/meminfo", node_id);
        int fd = open(fname, O_RDONLY, 0);
        if ((fd >= 0) && (read(fd, buf, BIG_BUF_SIZE) > 0)) {
            close(fd);
            uint64_t KB;
            buf[BIG_BUF_SIZE - 1] = '\0';
            char *p = strstr(buf, "MemTotal:");
            if (p != NULL) {
                p += 9;
            } else {
                numad_log(LOG_CRIT, "Could not get node MemTotal\n");
                exit(EXIT_FAILURE);
            }
            while (!isdigit(*p)) { p++; }
            CONVERT_DIGITS_TO_NUM(p, KB);
            node[node_ix].MBs_total = (KB / KILOBYTE);
            if (node[node_ix].MBs_total < 1) {
                // If a node has zero memory, remove it from the all_nodes_list...
                CLR_ID_IN_LIST(node_id, all_nodes_list_p);
            }
            p = strstr(p, "MemFree:");
            if (p != NULL) {
                p += 8;
            } else {
                numad_log(LOG_CRIT, "Could not get node MemFree\n");
                exit(EXIT_FAILURE);
            }
            while (!isdigit(*p)) { p++; }
            CONVERT_DIGITS_TO_NUM(p, KB);
            node[node_ix].MBs_free = (KB / KILOBYTE);
            if (use_inactive_file_cache) {
                // Add inactive file cache quantity to "free" memory
                p = strstr(p, "Inactive(file):");
                if (p != NULL) {
                    p += 15;
                } else {
                    numad_log(LOG_CRIT, "Could not get node Inactive(file)\n");
                    exit(EXIT_FAILURE);
                }
                while (!isdigit(*p)) { p++; }
                CONVERT_DIGITS_TO_NUM(p, KB);
                node[node_ix].MBs_free += (KB / KILOBYTE);
            }
            sum_of_node_MBs_free += node[node_ix].MBs_free;
            if (min_node_MBs_free > node[node_ix].MBs_free) {
                min_node_MBs_free = node[node_ix].MBs_free;
                min_node_MBs_free_ix = node[node_ix].node_id;
            }
            if (max_node_MBs_free < node[node_ix].MBs_free) {
                max_node_MBs_free = node[node_ix].MBs_free;
            }
        } else {
            numad_log(LOG_CRIT, "Could not get node meminfo\n");
            exit(EXIT_FAILURE);
        }
        // If both buffers have been populated by now, sum CPU idle data
        // for each node in order to calculate available capacity
        int old_cpu_data_buf = 1 - cur_cpu_data_buf;
        if (cpu_data_buf[old_cpu_data_buf].time_stamp > 0) {
            uint64_t idle_ticks = 0;
            int cpu = 0;
            int num_lcpus = NUM_IDS_IN_LIST(node[node_ix].cpu_list_p);
            int num_cpus_to_process = num_lcpus;
            while (num_cpus_to_process) {
                if (ID_IS_IN_LIST(cpu, node[node_ix].cpu_list_p)) {
                    idle_ticks += cpu_data_buf[cur_cpu_data_buf].idle[cpu] - cpu_data_buf[old_cpu_data_buf].idle[cpu];
                    num_cpus_to_process -= 1;
                }
                cpu += 1;
            }
            uint64_t time_diff = cpu_data_buf[cur_cpu_data_buf].time_stamp - cpu_data_buf[old_cpu_data_buf].time_stamp;
            // printf("Node: %d CPUs: %ld time diff %ld Idle ticks %ld\n", node_id, node[node_ix].CPUs_total, time_diff, idle_ticks);
            // assert(time_diff > 0);
            node[node_ix].CPUs_free = (idle_ticks * ONE_HUNDRED) / time_diff;
            // Possibly discount hyper-threads
            if ((threads_per_core > 1) && (htt_percent < 100)) {
                uint64_t htt_discount = (num_lcpus - (num_lcpus / threads_per_core)) * (100 - htt_percent);
                if (node[node_ix].CPUs_free > htt_discount) {
                    node[node_ix].CPUs_free -= htt_discount;
                } else {
                    node[node_ix].CPUs_free = 0;
                }
            }
            if (node[node_ix].CPUs_free > node[node_ix].CPUs_total) {
                node[node_ix].CPUs_free = node[node_ix].CPUs_total;
            }
            sum_of_node_CPUs_free += node[node_ix].CPUs_free;
            if (min_node_CPUs_free > node[node_ix].CPUs_free) {
                min_node_CPUs_free = node[node_ix].CPUs_free;
                min_node_CPUs_free_ix = node[node_ix].node_id;
            }
            if (max_node_CPUs_free < node[node_ix].CPUs_free) {
                max_node_CPUs_free = node[node_ix].CPUs_free;
            }
            // "magnitude" combines both free resources into one ranking value.
            node[node_ix].magnitude = node[node_ix].CPUs_free * node[node_ix].MBs_free;
        } else {
            node[node_ix].CPUs_free = 0;
            node[node_ix].magnitude = 0;
        }
    }
    // Derive the population average and standard deviation for both resources.
    avg_node_MBs_free = sum_of_node_MBs_free / num_nodes;
    avg_node_CPUs_free = sum_of_node_CPUs_free / num_nodes;
    double MBs_variance_sum = 0.0;
    double CPUs_variance_sum = 0.0;
    for (int node_ix = 0; (node_ix < num_nodes); node_ix++) {
        double MBs_diff = (double)node[node_ix].MBs_free - (double)avg_node_MBs_free;
        double CPUs_diff = (double)node[node_ix].CPUs_free - (double)avg_node_CPUs_free;
        MBs_variance_sum += MBs_diff * MBs_diff;
        CPUs_variance_sum += CPUs_diff * CPUs_diff;
    }
    double MBs_variance = MBs_variance_sum / (num_nodes);
    double CPUs_variance = CPUs_variance_sum / (num_nodes);
    stddev_node_MBs_free = sqrt(MBs_variance);
    stddev_node_CPUs_free = sqrt(CPUs_variance);
    if (log_level >= LOG_INFO) {
        show_nodes();
    }
    return num_nodes;
}

typedef struct stat_data {
    // This structure isn't actually used in numad -- it is here just to
    // document the field type and order of the /proc//stat items, some of
int pid; // 0 char *comm; // 1 char state; int ppid; int pgrp; int session; int tty_nr; int tpgid; unsigned flags; uint64_t minflt; uint64_t cminflt; uint64_t majflt; uint64_t cmajflt; uint64_t utime; // 13 uint64_t stime; // 14 int64_t cutime; int64_t cstime; int64_t priority; // 17 int64_t nice; int64_t num_threads; // 19 int64_t itrealvalue; uint64_t starttime; uint64_t vsize; // 22 int64_t rss; // 23 uint64_t rsslim; uint64_t startcode; uint64_t endcode; uint64_t startstack; uint64_t kstkesp; uint64_t kstkeip; uint64_t signal; uint64_t blocked; uint64_t sigignore; uint64_t sigcatch; uint64_t wchan; uint64_t nswap; uint64_t cnswap; int exit_signal; int processor; unsigned rt_priority; unsigned policy; // 40 uint64_t delayacct_blkio_ticks; uint64_t guest_time; // 42 int64_t cguest_time; } stat_data_t, *stat_data_p; process_data_p get_stat_data_for_pid(int pid, char *pid_string) { // Note: This function uses static data buffers and is not thread safe. char fname[FNAME_SIZE]; if (pid >= 0) { snprintf(fname, FNAME_SIZE, "/proc/%d/stat", pid); } else { snprintf(fname, FNAME_SIZE, "/proc/%s/stat", pid_string); } int fd = open(fname, O_RDONLY, 0); if (fd < 0) { numad_log(LOG_WARNING, "Could not open stat file: %s\n", fname); return NULL; } static char buf[BUF_SIZE]; int bytes = read(fd, buf, BUF_SIZE); close(fd); if (bytes < 50) { numad_log(LOG_WARNING, "Could not read stat file: %s\n", fname); return NULL; } uint64_t val; char *p = buf; static process_data_t data; // Get PID from field 0 CONVERT_DIGITS_TO_NUM(p, val); data.pid = val; // Copy comm from field 1 while (*p == ' ') { p++; } data.comm = p; while (*p != ' ') { p++; } *p++ = '\0'; // replacing the presumed single ' ' before next field // Skip fields 2 through 12 for (int ix = 0; (ix < 11); ix++) { while (*p != ' ') { p++; } while (*p == ' ') { p++; } } // Get utime from field 13 for cpu_util CONVERT_DIGITS_TO_NUM(p, val); data.cpu_util = val; // Get stime from field 14 to add on to cpu_util (which already has 
// utime)   (tail of the split comment above; stime is parsed next)
    while (*p == ' ') { p++; }
    CONVERT_DIGITS_TO_NUM(p, val);
    data.cpu_util += val;
    // Skip fields 15 through 18
    while (*p == ' ') { p++; }
    for (int ix = 0; (ix < 4); ix++) {
        while (*p != ' ') { p++; }
        while (*p == ' ') { p++; }
    }
    // Get num_threads from field 19
    CONVERT_DIGITS_TO_NUM(p, val);
    data.num_threads = val;
    // Skip fields 20 through 21
    while (*p == ' ') { p++; }
    for (int ix = 0; (ix < 2); ix++) {
        while (*p != ' ') { p++; }
        while (*p == ' ') { p++; }
    }
    // Get vsize from field 22 to compute MBs_size
    CONVERT_DIGITS_TO_NUM(p, val);
    data.MBs_size = val / MEGABYTE;
    // Get rss from field 23 to compute MBs_used
    // (rss is in pages, hence the page_size_in_bytes scaling)
    while (*p == ' ') { p++; }
    CONVERT_DIGITS_TO_NUM(p, val);
    data.MBs_used = (val * page_size_in_bytes) / MEGABYTE;
    // Return pointer to data
    return &data;
}

// Refresh the set of candidate processes.  Returns the number of processes
// seen this pass.
int update_processes() {
    // Conditionally scan /proc//stat files for processes we should
    // perhaps manage. For all processes, evaluate whether or not they should
    // be added to our hash table of managed processes candidates. If so,
    // update the statistics, time stamp and utilization numbers for the select
    // processes in the hash table.
    uint64_t this_update_time = get_time_stamp();
    int new_candidates = 0; // limit number of new candidates per update
    int files = 0;
    if (scan_all_processes) {
        struct dirent **namelist;
        files = scandir("/proc", &namelist, name_starts_with_digit, NULL);
        if (files < 0) {
            numad_log(LOG_CRIT, "Could not open /proc\n");
            exit(EXIT_FAILURE);
        }
        for (int ix = 0; (ix < files); ix++) {
            process_data_p data_p;
            if ((data_p = get_stat_data_for_pid(-1, namelist[ix]->d_name)) != NULL) {
                // See if this process uses enough memory to be managed.
                // Cap new candidates at a third of the hash table per pass.
                if ((data_p->MBs_used > MEMORY_THRESHOLD) && (new_candidates < process_hash_table_size / 3)) {
                    data_p->data_time_stamp = get_time_stamp();
                    new_candidates += process_hash_update(data_p);
                }
            }
            free(namelist[ix]);
        }
        free(namelist);
    } // scan_all_processes
    // Process explicit inclusion and exclusion pid lists
    pthread_mutex_lock(&pid_list_mutex);
    // Include candidate processes from the explicit include pid list
    pid_list_p pid_ptr = include_pid_list;
    while ((pid_ptr != NULL) && (new_candidates < process_hash_table_size / 3)) {
        int hash_ix = process_hash_lookup(pid_ptr->pid);
        if ( (hash_ix >= 0) && (process_hash_table[hash_ix].data_time_stamp > this_update_time)) {
            // Already in hash table, and recently updated...
            pid_ptr = pid_ptr->next;
            continue;
        }
        process_data_p data_p;
        if ((data_p = get_stat_data_for_pid(pid_ptr->pid, NULL)) != NULL) {
            data_p->data_time_stamp = get_time_stamp();
            new_candidates += process_hash_update(data_p);
            if (!scan_all_processes) {
                files += 1;
            }
            pid_ptr = pid_ptr->next;
        } else {
            // no stat file so assume pid dead -- remove it from pid list
            include_pid_list = remove_pid_from_pid_list(include_pid_list, pid_ptr->pid);
            pid_ptr = include_pid_list; // just restart from list beginning
            continue;
        }
    }
    // Zero CPU utilization for processes on the explicit exclude pid list
    pid_ptr = exclude_pid_list;
    while (pid_ptr != NULL) {
        int hash_ix = process_hash_lookup(pid_ptr->pid);
        if (hash_ix >= 0) {
            process_hash_table[hash_ix].CPUs_used = 0;
        }
        pid_ptr = pid_ptr->next;
    }
    pthread_mutex_unlock(&pid_list_mutex);
    if (log_level >= LOG_INFO) {
        numad_log(LOG_INFO, "Processes: %d\n", files);
    }
    // Now go through all managed processes to cleanup out-of-date and dead ones.
    process_hash_table_cleanup(this_update_time);
    return files;
}

// Initialize p->node_list_p from the Mems_allowed_list line of
// /proc/<pid>/status.  Returns the number of allowed nodes, or 0 if the
// process appears to have terminated.
int initialize_mem_node_list(process_data_p p) {
    // Parameter p is a pointer to an element in the hash table
    if ((!p) || (p->pid < 1)) {
        numad_log(LOG_CRIT, "Cannot initialize mem node lists with bad PID\n");
        exit(EXIT_FAILURE);
    }
    int n = 0;
    char fname[FNAME_SIZE];
    char buf[BIG_BUF_SIZE];
    p->process_MBs = NULL;
    CLEAR_NODE_LIST(p->node_list_p);
    snprintf(fname, FNAME_SIZE, "/proc/%d/status", p->pid);
    int fd = open(fname, O_RDONLY, 0);
    if (fd < 0) {
        numad_log(LOG_WARNING, "Tried to research PID %d, but it apparently went away.\n", p->pid);
        return 0; // Assume the process terminated
    }
    int bytes = read(fd, buf, BIG_BUF_SIZE);
    close(fd);
    if (bytes <= 0) {
        numad_log(LOG_WARNING, "Tried to research PID %d, but cannot read status file.\n", p->pid);
        return 0; // Assume the process terminated
    } else if (bytes >= BIG_BUF_SIZE) {
        buf[BIG_BUF_SIZE - 1] = '\0';
    } else {
        buf[bytes] = '\0';
    }
    char *list_str_p = strstr(buf, "Mems_allowed_list:");
    if (!list_str_p) {
        numad_log(LOG_CRIT, "Could not get node Mems_allowed_list\n");
        exit(EXIT_FAILURE);
    }
    list_str_p += 18;   // skip past "Mems_allowed_list:"
    while (!isdigit(*list_str_p)) { list_str_p++; }
    n = add_ids_to_list_from_str(p->node_list_p, list_str_p);
    if (n < num_nodes) {
        // If process already bound to a subset of nodes when we discover it,
        // set initial bind_time_stamp to 30 minutes ago...
p->bind_time_stamp = get_time_stamp() - (1800 * ONE_HUNDRED);
    }
    return n;
}

// Combine a node's free memory and free CPU capacity into a single ranking
// value for node selection.  Resource amounts that satisfy the request
// ("needed") are weighted more heavily than surplus ("excess"); the result
// is the product of the weighted memory and CPU factors.
uint64_t combined_value_of_weighted_resources(int ix, int mbs, int cpus, uint64_t MBs_free, uint64_t CPUs_free) {
    int64_t needed_mem;
    int64_t needed_cpu;
    int64_t excess_mem;
    int64_t excess_cpu;
    // Split each free resource into the part that covers the request and the
    // surplus beyond it.
    if (MBs_free > mbs) {
        needed_mem = mbs;
        excess_mem = MBs_free - mbs;
    } else {
        needed_mem = MBs_free;
        excess_mem = 0;
    }
    if (CPUs_free > cpus) {
        needed_cpu = cpus;
        excess_cpu = CPUs_free - cpus;
    } else {
        needed_cpu = CPUs_free;
        excess_cpu = 0;
    }
    // Weight the available resources, and then calculate magnitude as
    // product of available CPUs and available MBs.
    int64_t memfactor = (needed_mem * 10 + excess_mem * 4);
    int64_t cpufactor = (needed_cpu * 6 + excess_cpu * 1);
    numad_log(LOG_DEBUG, "    Node[%d]: mem: %ld cpu: %ld\n", ix, memfactor, cpufactor);
    return (memfactor * cpufactor);
}

// Choose the best NUMA node set for a process (or for a pre-placement
// request when pid <= 0) needing roughly `cpus` hundredths of CPU and `mbs`
// MB of memory.  (Definition continues beyond this extraction.)
id_list_p pick_numa_nodes(int pid, int cpus, int mbs, int assume_enough_cpus) {
    if (log_level >= LOG_DEBUG) {
        numad_log(LOG_DEBUG, "PICK NODES FOR:  PID: %d,  CPUs %d,  MBs %d\n", pid, cpus, mbs);
    }
    char buf[BUF_SIZE];
    uint64_t proc_avg_node_CPUs_free = 0;
    // For existing processes, get miscellaneous process specific details
    int pid_ix;
    process_data_p p = NULL;
    if ((pid > 0) && ((pid_ix = process_hash_lookup(pid)) >= 0)) {
        p = &process_hash_table[pid_ix];
        // Add up per-node memory in use by this process.
        // This scanning is expensive and should be minimized.
        char fname[FNAME_SIZE];
        snprintf(fname, FNAME_SIZE, "/proc/%d/numa_maps", pid);
        FILE *fs = fopen(fname, "r");
        if (!fs) {
            numad_log(LOG_WARNING, "Tried to research PID %d numamaps, but it apparently went away.\n", p->pid);
            return NULL; // Assume the process terminated
        }
        // Allocate and zero per node memory array.
// The "+1 node" is for accumulating interleaved memory p->process_MBs = realloc(p->process_MBs, (num_nodes + 1) * sizeof(uint64_t)); if (p->process_MBs == NULL) { numad_log(LOG_CRIT, "p->process_MBs realloc failed\n"); exit(EXIT_FAILURE); } memset(p->process_MBs, 0, (num_nodes + 1) * sizeof(uint64_t)); int process_has_interleaved_memory = 0; while (fgets(buf, BUF_SIZE, fs)) { int interleaved_memory = 0; uint64_t page_size = page_size_in_bytes; const char *delimiters = " \n"; char *str_p = strtok(buf, delimiters); while (str_p) { if (!strncmp(str_p, "interleave", 10)) { interleaved_memory = 1; process_has_interleaved_memory = 1; } else if (!strcmp(str_p, "huge")) { page_size = huge_page_size_in_bytes; } else if (*str_p++ == 'N') { int node; uint64_t pages; CONVERT_DIGITS_TO_NUM(str_p, node); if (*str_p++ != '=') { numad_log(LOG_CRIT, "numa_maps node number parse error\n"); exit(EXIT_FAILURE); } CONVERT_DIGITS_TO_NUM(str_p, pages); p->process_MBs[node] += (pages * page_size); if (interleaved_memory) { // sum interleaved quantity in "extra node" p->process_MBs[num_nodes] += (pages * page_size); } } // Get next token on the line str_p = strtok(NULL, delimiters); } } fclose(fs); proc_avg_node_CPUs_free = p->CPUs_used; for (int ix = 0; (ix <= num_nodes); ix++) { p->process_MBs[ix] /= MEGABYTE; if ((log_level >= LOG_DEBUG) && (p->process_MBs[ix] > 0)) { if (ix == num_nodes) { numad_log(LOG_DEBUG, "Interleaved MBs: %ld\n", ix, p->process_MBs[ix]); } else { numad_log(LOG_DEBUG, "PROCESS_MBs[%d]: %ld\n", ix, p->process_MBs[ix]); } } if (ID_IS_IN_LIST(ix, p->node_list_p)) { proc_avg_node_CPUs_free += node[ix].CPUs_free; } } proc_avg_node_CPUs_free /= NUM_IDS_IN_LIST(p->node_list_p); if ((process_has_interleaved_memory) && (keep_interleaved_memory)) { // Mark this process as having interleaved memory so we do not // merge the interleaved memory. Time stamp it as done and return. 
p->flags |= PROCESS_FLAG_INTERLEAVED; p->bind_time_stamp = get_time_stamp(); if (log_level >= LOG_DEBUG) { numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid); } return NULL; } } // end of existing PID conditional // Make a copy of node available resources array. Add in info specific to // this process to equalize available resource quantities wrt locations of // resources already in use by this process. static node_data_p tmp_node; tmp_node = realloc(tmp_node, num_nodes * sizeof(node_data_t) ); if (tmp_node == NULL) { numad_log(LOG_CRIT, "tmp_node realloc failed\n"); exit(EXIT_FAILURE); } memcpy(tmp_node, node, num_nodes * sizeof(node_data_t) ); uint64_t sum_of_node_CPUs_free = 0; for (int ix = 0; (ix < num_nodes); ix++) { if (pid > 0) { if (NUM_IDS_IN_LIST(p->node_list_p) >= num_nodes) { // Process not yet bound to a subset of nodes. // Add back memory used by this process on this node. tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 17) / 16); // Apply light mem bias // Add back CPU used by this process in proportion to the memory used on this node. tmp_node[ix].CPUs_free += ((p->CPUs_used * p->process_MBs[ix]) / p->MBs_used); } else { // If the process is currently running on less than all the // nodes, first add back (biased) memory already used by this // process on this node, then assign average process CPU / node // for this process iff the process is present on this node. 
tmp_node[ix].MBs_free += ((p->process_MBs[ix] * 5) / 4); // Apply heavy mem bias if (ID_IS_IN_LIST(ix, p->node_list_p)) { tmp_node[ix].CPUs_free = proc_avg_node_CPUs_free; } } sum_of_node_CPUs_free += tmp_node[ix].CPUs_free; if (tmp_node[ix].CPUs_free > tmp_node[ix].CPUs_total) { tmp_node[ix].CPUs_free = tmp_node[ix].CPUs_total; } if (tmp_node[ix].MBs_free > tmp_node[ix].MBs_total) { tmp_node[ix].MBs_free = tmp_node[ix].MBs_total; } } // Enforce 1/100th CPU minimum if (tmp_node[ix].CPUs_free < 1) { tmp_node[ix].CPUs_free = 1; } // numad_log(LOG_DEBUG, "Raw Node[%d]: mem: %ld cpu: %ld\n", ix, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free); tmp_node[ix].magnitude = combined_value_of_weighted_resources(ix, mbs, cpus, tmp_node[ix].MBs_free, tmp_node[ix].CPUs_free); } // Now figure out where to get resources for this request.... static id_list_p target_node_list_p; CLEAR_NODE_LIST(target_node_list_p); if ((pid > 0) && (cpus > sum_of_node_CPUs_free)) { // System CPUs might be oversubscribed, but... assume_enough_cpus = 1; // and rely on available memory for placement. } // Establish a CPU flex fudge factor, on the presumption it is OK if not // quite all the CPU request is met. However, if trying to find resources // for pre-placement advice request, do not underestimate the amount of // CPUs needed. Instead, err on the side of providing too many resources. int cpu_flex = 0; if ((pid > 0) && (target_utilization < 100)) { // FIXME: Is half of the utilization margin a good amount of CPU flexing? 
cpu_flex = ((100 - target_utilization) * node[0].CPUs_total) / 200; } // Figure out minimum number of nodes required int mem_req_nodes = ceil((double)mbs / (double)node[0].MBs_total); int cpu_req_nodes = ceil((double)(cpus - cpu_flex) / (double)node[0].CPUs_total); int min_req_nodes = mem_req_nodes; if (min_req_nodes < cpu_req_nodes) { min_req_nodes = cpu_req_nodes; } if (min_req_nodes > num_nodes) { min_req_nodes = num_nodes; } // Use an index to sort NUMA connected resource chain for each node int index[num_nodes]; uint64_t totmag[num_nodes]; for (int ix = 0; (ix < num_nodes); ix++) { // Reset the index each time for (int n = 0; (n < num_nodes); n++) { index[n] = n; } // Sort by minimum relative NUMA distance from node[ix], // breaking distance ties with magnitude of available resources for (int ij = 0; (ij < num_nodes); ij++) { int best_ix = ij; for (int ik = ij + 1; (ik < num_nodes); ik++) { int ik_dist = tmp_node[index[ik]].distance[ix]; int best_ix_dist = tmp_node[index[best_ix]].distance[ix]; if (best_ix_dist > ik_dist) { best_ix = ik; } else if (best_ix_dist == ik_dist) { if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) { best_ix = ik; } } } if (best_ix != ij) { int tmp = index[ij]; index[ij] = index[best_ix]; index[best_ix] = tmp; } } #if 0 if (log_level >= LOG_DEBUG) { for (int iq = 0; (iq < num_nodes); iq++) { numad_log(LOG_DEBUG, "Node: %d Dist: %d Magnitude: %ld\n", tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[ix], tmp_node[index[iq]].magnitude); } } #endif // Save the totmag[] sum of the magnitudes of expected needed nodes, // "normalized" by NUMA distance (by dividing each magnitude by the // relative distance squared). 
totmag[ix] = 0; for (int ij = 0; (ij < min_req_nodes); ij++) { int dist = tmp_node[index[ij]].distance[ix]; totmag[ix] += (tmp_node[index[ij]].magnitude / (dist * dist)); } numad_log(LOG_DEBUG, "Totmag[%d]: %ld\n", ix, totmag[ix]); } // Now find the best NUMA node based on the normalized sum of node // magnitudes expected to be used. int best_node_ix = 0; for (int ix = 0; (ix < num_nodes); ix++) { if (totmag[best_node_ix] < totmag[ix]) { best_node_ix = ix; } } numad_log(LOG_DEBUG, "best_node_ix: %d\n", best_node_ix); // Reset sorting index again for (int n = 0; (n < num_nodes); n++) { index[n] = n; } // Sort index by distance from node[best_node_ix], // breaking distance ties with magnitude for (int ij = 0; (ij < num_nodes); ij++) { int best_ix = ij; for (int ik = ij + 1; (ik < num_nodes); ik++) { int ik_dist = tmp_node[index[ik]].distance[best_node_ix]; int best_ix_dist = tmp_node[index[best_ix]].distance[best_node_ix]; if (best_ix_dist > ik_dist) { best_ix = ik; } else if (best_ix_dist == ik_dist) { if (tmp_node[index[best_ix]].magnitude < tmp_node[index[ik]].magnitude ) { best_ix = ik; } } } if (best_ix != ij) { int tmp = index[ij]; index[ij] = index[best_ix]; index[best_ix] = tmp; } } if (log_level >= LOG_DEBUG) { for (int iq = 0; (iq < num_nodes); iq++) { numad_log(LOG_DEBUG, "Node: %d Dist: %d Magnitude: %ld\n", tmp_node[index[iq]].node_id, tmp_node[index[iq]].distance[best_node_ix], tmp_node[index[iq]].magnitude); } } // Allocate more resources until request is met. best_node_ix = 0; while ((min_req_nodes > 0) || (mbs > 0) || ((cpus > cpu_flex) && (!assume_enough_cpus))) { if (log_level >= LOG_DEBUG) { numad_log(LOG_DEBUG, "MBs: %d, CPUs: %d\n", mbs, cpus); } numad_log(LOG_DEBUG, "Assigning resources from node %d\n", index[best_node_ix]); ADD_ID_TO_LIST(tmp_node[index[best_node_ix]].node_id, target_node_list_p); min_req_nodes -= 1; if (EQUAL_LISTS(target_node_list_p, all_nodes_list_p)) { // Apparently we must use all resource nodes... 
break; } // "Consume" the resources on this node #define CPUS_MARGIN 0 #define MBS_MARGIN 100 if (tmp_node[index[best_node_ix]].MBs_free >= (mbs + MBS_MARGIN)) { tmp_node[index[best_node_ix]].MBs_free -= mbs; mbs = 0; } else { mbs -= (tmp_node[index[best_node_ix]].MBs_free - MBS_MARGIN); tmp_node[index[best_node_ix]].MBs_free = MBS_MARGIN; } if (tmp_node[index[best_node_ix]].CPUs_free >= (cpus + CPUS_MARGIN)) { tmp_node[index[best_node_ix]].CPUs_free -= cpus; cpus = 0; } else { cpus -= (tmp_node[index[best_node_ix]].CPUs_free - CPUS_MARGIN); tmp_node[index[best_node_ix]].CPUs_free = CPUS_MARGIN; } // Next line optional, since we will not look at that node again tmp_node[index[best_node_ix]].magnitude = combined_value_of_weighted_resources(0, mbs, cpus, tmp_node[index[best_node_ix]].MBs_free, tmp_node[index[best_node_ix]].CPUs_free); best_node_ix += 1; } // For existing processes, calculate the non-local memory percent to see if // process is already in the right place. if ((pid > 0) && (p != NULL)) { uint64_t nonlocal_memory = 0; for (int ix = 0; (ix < num_nodes); ix++) { if (!ID_IS_IN_LIST(ix, target_node_list_p)) { // Accumulate total of nonlocal memory nonlocal_memory += p->process_MBs[ix]; } } int disp_percent = (100 * nonlocal_memory) / p->MBs_used; // If this existing process is already located where we want it, then just // return NULL indicating no need to change binding this time. Check the // ammount of nonlocal memory against the target_memlocality_perecent. 
if ((disp_percent <= (100 - target_memlocality)) && (p->bind_time_stamp) && (EQUAL_LISTS(target_node_list_p, p->node_list_p))) { // Already bound to targets, and enough of the memory is located where we want it, so no need to rebind if (log_level >= LOG_DEBUG) { numad_log(LOG_DEBUG, "Process %d already %d percent localized to target nodes.\n", p->pid, 100 - disp_percent); } p->bind_time_stamp = get_time_stamp(); return NULL; } } // Must always provide at least one node for pre-placement advice // FIXME: verify this can happen only if no resources requested... if ((pid <= 0) && (NUM_IDS_IN_LIST(target_node_list_p) <= 0)) { ADD_ID_TO_LIST(node[0].node_id, target_node_list_p); } // Log advice, and return target node list if ((pid > 0) && (p->bind_time_stamp)) { str_from_id_list(buf, BUF_SIZE, p->node_list_p); } else { str_from_id_list(buf, BUF_SIZE, all_nodes_list_p); } char buf2[BUF_SIZE]; str_from_id_list(buf2, BUF_SIZE, target_node_list_p); char *cmd_name = "(unknown)"; if ((p) && (p->comm)) { cmd_name = p->comm; } numad_log(LOG_NOTICE, "Advising pid %d %s move from nodes (%s) to nodes (%s)\n", pid, cmd_name, buf, buf2); if (pid > 0) { COPY_LIST(target_node_list_p, p->node_list_p); } return target_node_list_p; } int manage_loads() { uint64_t time_stamp = get_time_stamp(); // Use temporary index to access and sort hash table entries static int pindex_size; static process_data_p *pindex; if (pindex_size < process_hash_table_size) { pindex_size = process_hash_table_size; pindex = realloc(pindex, pindex_size * sizeof(process_data_p)); if (pindex == NULL) { numad_log(LOG_CRIT, "pindex realloc failed\n"); exit(EXIT_FAILURE); } // Quick round trip whenever we resize the hash table. // This is mostly to avoid max_interval wait at start up. return min_interval / 2; } memset(pindex, 0, pindex_size * sizeof(process_data_p)); // Copy live candidate pointers to the index for sorting // if they meet the threshold for memory usage and CPU usage. 
int nprocs = 0;
    long sum_CPUs_used = 0;
    // Gather pointers to all live hash-table entries that exceed both the
    // CPU and memory usage thresholds; these become the rebalance candidates.
    for (int ix = 0; (ix < process_hash_table_size); ix++) {
        process_data_p p = &process_hash_table[ix];
        if ((p->pid) && (p->CPUs_used > CPU_THRESHOLD) && (p->MBs_used > MEMORY_THRESHOLD)) {
            pindex[nprocs++] = p;
            sum_CPUs_used += p->CPUs_used;
            // Initialize node list, if not already done for this process.
            if (p->node_list_p == NULL) {
                initialize_mem_node_list(p);
            }
        }
    }
    // Order candidate considerations using timestamps and magnitude: amount of
    // CPU used * amount of memory used. Not expecting a long list here. Use
    // a simplistic sort -- however move all not yet bound to front of list and
    // order by decreasing magnitude. Previously bound processes follow in
    // bins of increasing magnitude treating values within 20% as equivalent.
    // Within bins, order by bind_time_stamp so oldest bound will be higher
    // priority to evaluate. Start by moving all unbound to beginning.
    int num_unbound = 0;
    for (int ij = 0; (ij < nprocs); ij++) {
        if (pindex[ij]->bind_time_stamp == 0) {
            process_data_p tmp = pindex[num_unbound];
            pindex[num_unbound++] = pindex[ij];
            pindex[ij] = tmp;
        }
    }
    // Sort all unbound so biggest magnitude comes first
    // (selection sort; candidate lists are expected to be short).
    for (int ij = 0; (ij < num_unbound); ij++) {
        int best = ij;
        for (int ik = ij + 1; (ik < num_unbound); ik++) {
            uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_used);
            uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used);
            if (ik_mag <= best_mag) continue;
            best = ik;
        }
        if (best != ij) {
            process_data_p tmp = pindex[ij];
            pindex[ij] = pindex[best];
            pindex[best] = tmp;
        }
    }
    // Sort the remaining candidates into bins of increasing magnitude, and by
    // timestamp within bins.
for (int ij = num_unbound; (ij < nprocs); ij++) { int best = ij; for (int ik = ij + 1; (ik < nprocs); ik++) { uint64_t ik_mag = (pindex[ ik]->CPUs_used * pindex[ ik]->MBs_used); uint64_t best_mag = (pindex[best]->CPUs_used * pindex[best]->MBs_used); uint64_t min_mag = ik_mag; uint64_t diff_mag = best_mag - ik_mag; if (diff_mag < 0) { diff_mag = -(diff_mag); min_mag = best_mag; } if ((diff_mag > 0) && (min_mag / diff_mag < 5)) { // difference > 20 percent. Use magnitude ordering if (ik_mag <= best_mag) continue; } else { // difference within 20 percent. Sort these by bind_time_stamp. if (pindex[ik]->bind_time_stamp > pindex[best]->bind_time_stamp) continue; } best = ik; } if (best != ij) { process_data_p tmp = pindex[ij]; pindex[ij] = pindex[best]; pindex[best] = tmp; } } // Show the candidate processes in the log file if ((log_level >= LOG_INFO) && (nprocs > 0)) { numad_log(LOG_INFO, "Candidates: %d\n", nprocs); for (int ix = 0; (ix < nprocs); ix++) { process_data_p p = pindex[ix]; char buf[BUF_SIZE]; str_from_id_list(buf, BUF_SIZE, p->node_list_p); fprintf(log_fs, "%ld: PID %d: %s, Threads %2ld, MBs_size %6ld, MBs_used %6ld, CPUs_used %4ld, Magnitude %6ld, Nodes: %s\n", p->data_time_stamp, p->pid, p->comm, p->num_threads, p->MBs_size, p->MBs_used, p->CPUs_used, p->MBs_used * p->CPUs_used, buf); } fflush(log_fs); } // Estimate desired size (+ margin capacity) and // make resource requests for each candidate process for (int ix = 0; (ix < nprocs); ix++) { process_data_p p = pindex[ix]; // If this process has interleaved memory, recheck it only every 30 minutes... #define MIN_DELAY_FOR_INTERLEAVE (1800 * ONE_HUNDRED) if (((p->flags & PROCESS_FLAG_INTERLEAVED) > 0) && (p->bind_time_stamp + MIN_DELAY_FOR_INTERLEAVE > time_stamp)) { if (log_level >= LOG_DEBUG) { numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because of interleaved memory.\n", p->pid); } continue; } // Expand resources needed estimate using target_utilization factor. 
// Start with the CPUs actually used (capped by number of threads) for // CPUs required, and the RSS MBs actually used for the MBs // requirement, int mem_target_utilization = target_utilization; int cpu_target_utilization = target_utilization; // Cap memory utilization at 100 percent (but allow CPUs to oversubscribe) if (mem_target_utilization > 100) { mem_target_utilization = 100; } // If the process virtual memory size is bigger than one node, and it // is already using more than 80 percent of a node, then request MBs // based on the virtual size rather than on the current amount in use. int mb_request; if ((p->MBs_size > node[0].MBs_total) && ((p->MBs_used * 5 / 4) > node[0].MBs_total)) { mb_request = (p->MBs_size * 100) / mem_target_utilization; } else { mb_request = (p->MBs_used * 100) / mem_target_utilization; } int cpu_request = (p->CPUs_used * 100) / cpu_target_utilization; // But do not give a process more CPUs than it has threads! int thread_limit = p->num_threads; // If process looks like a KVM guest, try to limit thread count to the // number of vCPU threads. FIXME: Will need to do something more // intelligent than this with guest IO threads when eventually // considering devices and IRQs. if ((p->comm) && (p->comm[0] == '(') && (p->comm[1] == 'q') && (strcmp(p->comm, "(qemu-kvm)") == 0)) { int kvm_vcpu_threads = get_num_kvm_vcpu_threads(p->pid); if (thread_limit > kvm_vcpu_threads) { thread_limit = kvm_vcpu_threads; } } thread_limit *= ONE_HUNDRED; if (cpu_request > thread_limit) { cpu_request = thread_limit; } // If this process was recently bound, enforce a five-minute minimum // delay between repeated attempts to potentially move the process. #define MIN_DELAY_FOR_REEVALUATION (300 * ONE_HUNDRED) if (p->bind_time_stamp + MIN_DELAY_FOR_REEVALUATION > time_stamp) { // Skip re-evaluation because we just did it recently, but check // first for node utilization balance to see if we should // re-evaluate this particular process right now. 
If this process // is running on one of the busiest nodes, go ahead and re-evaluate // it if it looks like it should have a better place with // sufficient resources. FIXME: this is currently implemented for // only smallish processes that will fit in a single node. if ( ( ID_IS_IN_LIST(min_node_CPUs_free_ix, p->node_list_p) || ID_IS_IN_LIST(min_node_MBs_free_ix, p->node_list_p)) && (cpu_request < node[0].CPUs_total) && (mb_request < node[0].MBs_total) && (abs(min_node_CPUs_free + p->CPUs_used - avg_node_CPUs_free) + abs((max_node_CPUs_free - p->CPUs_used) - avg_node_CPUs_free) < (max_node_CPUs_free - min_node_CPUs_free) - CPU_THRESHOLD) // CPU slop && (abs(min_node_MBs_free + p->MBs_used - avg_node_MBs_free) + abs((max_node_MBs_free - p->MBs_used) - avg_node_MBs_free) < (max_node_MBs_free - min_node_MBs_free)) ) { if (log_level >= LOG_DEBUG) { numad_log(LOG_DEBUG, "Bypassing delay for %d because it looks like it can do better.\n", p->pid); } } else { if (log_level >= LOG_DEBUG) { numad_log(LOG_DEBUG, "Skipping evaluation of PID %d because done too recently.\n", p->pid); } continue; } } // OK, now pick NUMA nodes for this process and bind it! pthread_mutex_lock(&node_info_mutex); int assume_enough_cpus = (sum_CPUs_used <= sum_CPUs_total); id_list_p node_list_p = pick_numa_nodes(p->pid, cpu_request, mb_request, assume_enough_cpus); if ((node_list_p != NULL) && (bind_process_and_migrate_memory(p))) { pthread_mutex_unlock(&node_info_mutex); // Return minimum interval when actively moving processes return min_interval; } pthread_mutex_unlock(&node_info_mutex); } // Return maximum interval when no process movement return max_interval; } void *set_dynamic_options(void *arg) { // int arg_value = *(int *)arg; char buf[BUF_SIZE]; for (;;) { // Loop here forever waiting for a msg to do something... 
msg_t msg; recv_msg(&msg); switch (msg.body.cmd) { case 'C': use_inactive_file_cache = (msg.body.arg1 != 0); if (use_inactive_file_cache) { numad_log(LOG_NOTICE, "Counting inactive file cache as available\n"); } else { numad_log(LOG_NOTICE, "Counting inactive file cache as unavailable\n"); } break; case 'H': thp_scan_sleep_ms = msg.body.arg1; set_thp_scan_sleep_ms(thp_scan_sleep_ms); break; case 'i': min_interval = msg.body.arg1; max_interval = msg.body.arg2; if (max_interval <= 0) { shut_down_numad(); } numad_log(LOG_NOTICE, "Changing interval to %d:%d\n", msg.body.arg1, msg.body.arg2); break; case 'K': keep_interleaved_memory = (msg.body.arg1 != 0); if (keep_interleaved_memory) { numad_log(LOG_NOTICE, "Keeping interleaved memory spread across nodes\n"); } else { numad_log(LOG_NOTICE, "Merging interleaved memory to localized NUMA nodes\n"); } break; case 'l': numad_log(LOG_NOTICE, "Changing log level to %d\n", msg.body.arg1); log_level = msg.body.arg1; break; case 'm': numad_log(LOG_NOTICE, "Changing target memory locality to %d\n", msg.body.arg1); target_memlocality = msg.body.arg1; break; case 'p': numad_log(LOG_NOTICE, "Adding PID %d to inclusion PID list\n", msg.body.arg1); pthread_mutex_lock(&pid_list_mutex); exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, msg.body.arg1); include_pid_list = insert_pid_into_pid_list(include_pid_list, msg.body.arg1); pthread_mutex_unlock(&pid_list_mutex); break; case 'r': numad_log(LOG_NOTICE, "Removing PID %d from explicit PID lists\n", msg.body.arg1); pthread_mutex_lock(&pid_list_mutex); include_pid_list = remove_pid_from_pid_list(include_pid_list, msg.body.arg1); exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, msg.body.arg1); pthread_mutex_unlock(&pid_list_mutex); break; case 'S': scan_all_processes = (msg.body.arg1 != 0); if (scan_all_processes) { numad_log(LOG_NOTICE, "Scanning all processes\n"); } else { numad_log(LOG_NOTICE, "Scanning only explicit PID list processes\n"); } break; case 't': 
numad_log(LOG_NOTICE, "Changing logical CPU thread percent to %d\n", msg.body.arg1); htt_percent = msg.body.arg1; node_info_time_stamp = 0; // to force rescan of nodes/cpus soon break; case 'u': numad_log(LOG_NOTICE, "Changing target utilization to %d\n", msg.body.arg1); target_utilization = msg.body.arg1; break; case 'w': numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", msg.body.arg1, msg.body.arg2); pthread_mutex_lock(&node_info_mutex); update_nodes(); id_list_p node_list_p = pick_numa_nodes(-1, msg.body.arg1, msg.body.arg2, 0); str_from_id_list(buf, BUF_SIZE, node_list_p); pthread_mutex_unlock(&node_info_mutex); send_msg(msg.body.src_pid, 'w', 0, 0, buf); break; case 'x': numad_log(LOG_NOTICE, "Adding PID %d to exclusion PID list\n", msg.body.arg1); pthread_mutex_lock(&pid_list_mutex); include_pid_list = remove_pid_from_pid_list(include_pid_list, msg.body.arg1); exclude_pid_list = insert_pid_into_pid_list(exclude_pid_list, msg.body.arg1); pthread_mutex_unlock(&pid_list_mutex); break; default: numad_log(LOG_WARNING, "Unexpected msg command: %c %d %d %s from PID %d\n", msg.body.cmd, msg.body.arg1, msg.body.arg1, msg.body.text, msg.body.src_pid); break; } } // for (;;) } void parse_two_arg_values(char *p, int *first_ptr, int *second_ptr, int first_is_optional, int first_scale_digits) { char *orig_p = p; char *q = NULL; int second = -1; errno = 0; int first = (int) strtol(p, &p, 10); if ((errno != 0) || (p == orig_p) || (first < 0)) { fprintf(stderr, "Can't parse arg value(s): %s\n", orig_p); exit(EXIT_FAILURE); } if (*p == '.') { p++; while ((first_scale_digits > 0) && (isdigit(*p))) { first *= 10; first += (*p++ - '0'); first_scale_digits -= 1; } while (isdigit(*p)) { p++; } } while (first_scale_digits > 0) { first *= 10; first_scale_digits -= 1; } if (*p == ':') { q = p + 1; errno = 0; second = (int) strtol(q, &p, 10); if ((errno != 0) || (p == q) || (second < 0)) { fprintf(stderr, "Can't parse arg value(s): %s\n", orig_p); 
exit(EXIT_FAILURE); } } if (q != NULL) { // Two numbers are present if (first_ptr != NULL) *first_ptr = first; if (second_ptr != NULL) *second_ptr = second; } else if (first_is_optional) { if (second_ptr != NULL) *second_ptr = first; } else { if (first_ptr != NULL) *first_ptr = first; } } int main(int argc, char *argv[]) { int opt; int C_flag = 0; int d_flag = 0; int H_flag = 0; int i_flag = 0; int K_flag = 0; int l_flag = 0; int m_flag = 0; int p_flag = 0; int r_flag = 0; int S_flag = 0; int t_flag = 0; int u_flag = 0; int v_flag = 0; int w_flag = 0; int x_flag = 0; int tmp_int = 0; long list_pid = 0; while ((opt = getopt(argc, argv, "C:dD:hH:i:K:l:p:r:R:S:t:u:vVw:x:")) != -1) { switch (opt) { case 'C': C_flag = 1; use_inactive_file_cache = (atoi(optarg) != 0); break; case 'd': d_flag = 1; log_level = LOG_DEBUG; break; case 'D': // obsoleted break; case 'h': print_usage_and_exit(argv[0]); break; case 'H': tmp_int = atoi(optarg); if ((tmp_int == 0) || ((tmp_int > 9) && (tmp_int < 1000001))) { // 0 means do not change the system default value H_flag = 1; thp_scan_sleep_ms = tmp_int; } else { fprintf(stderr, "THP scan_sleep_ms must be > 9 and < 1000001\n"); exit(EXIT_FAILURE); } break; case 'i': i_flag = 1; parse_two_arg_values(optarg, &min_interval, &max_interval, 1, 0); break; case 'K': K_flag = 1; keep_interleaved_memory = (atoi(optarg) != 0); break; case 'l': l_flag = 1; log_level = atoi(optarg); break; case 'm': tmp_int = atoi(optarg); if ((tmp_int >= 50) && (tmp_int <= 100)) { m_flag = 1; target_memlocality = tmp_int; } break; case 'p': p_flag = 1; list_pid = atol(optarg); exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid); include_pid_list = insert_pid_into_pid_list(include_pid_list, list_pid); break; case 'r': r_flag = 1; list_pid = atol(optarg); // Remove this PID from both explicit pid lists. 
include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid); exclude_pid_list = remove_pid_from_pid_list(exclude_pid_list, list_pid); break; case 'R': reserved_cpu_str = strdup(optarg); break; case 'S': S_flag = 1; scan_all_processes = (atoi(optarg) != 0); break; case 't': tmp_int = atoi(optarg); if ((tmp_int >= 0) && (tmp_int <= 100)) { t_flag = 1; htt_percent = tmp_int; } break; case 'u': tmp_int = atoi(optarg); if ((tmp_int >= 10) && (tmp_int <= 130)) { u_flag = 1; target_utilization = tmp_int; } break; case 'v': v_flag = 1; log_level = LOG_INFO; break; case 'V': print_version_and_exit(argv[0]); break; case 'w': w_flag = 1; parse_two_arg_values(optarg, &requested_cpus, &requested_mbs, 0, 2); break; case 'x': x_flag = 1; list_pid = atol(optarg); include_pid_list = remove_pid_from_pid_list(include_pid_list, list_pid); exclude_pid_list = insert_pid_into_pid_list(exclude_pid_list, list_pid); break; default: print_usage_and_exit(argv[0]); break; } } if (argc > optind) { fprintf(stderr, "Unexpected arg = %s\n", argv[optind]); exit(EXIT_FAILURE); } if (i_flag) { if ((max_interval < min_interval) && (max_interval != 0)) { fprintf(stderr, "Max interval (%d) must be greater than min interval (%d)\n", max_interval, min_interval); exit(EXIT_FAILURE); } } open_log_file(); init_msg_queue(); num_cpus = get_num_cpus(); page_size_in_bytes = sysconf(_SC_PAGESIZE); huge_page_size_in_bytes = get_huge_page_size_in_bytes(); // Figure out if this is the daemon, or a subsequent invocation int daemon_pid = get_daemon_pid(); if (daemon_pid > 0) { // Daemon is already running. So send dynamic options to persistant // thread to handle requests, get the response (if any), and finish. 
msg_t msg; if (C_flag) { send_msg(daemon_pid, 'C', use_inactive_file_cache, 0, ""); } if (H_flag) { send_msg(daemon_pid, 'H', thp_scan_sleep_ms, 0, ""); } if (i_flag) { send_msg(daemon_pid, 'i', min_interval, max_interval, ""); } if (K_flag) { send_msg(daemon_pid, 'K', keep_interleaved_memory, 0, ""); } if (d_flag || l_flag || v_flag) { send_msg(daemon_pid, 'l', log_level, 0, ""); } if (m_flag) { send_msg(daemon_pid, 'm', target_memlocality, 0, ""); } if (p_flag) { send_msg(daemon_pid, 'p', list_pid, 0, ""); } if (r_flag) { send_msg(daemon_pid, 'r', list_pid, 0, ""); } if (S_flag) { send_msg(daemon_pid, 'S', scan_all_processes, 0, ""); } if (t_flag) { send_msg(daemon_pid, 't', htt_percent, 0, ""); } if (u_flag) { send_msg(daemon_pid, 'u', target_utilization, 0, ""); } if (w_flag) { send_msg(daemon_pid, 'w', requested_cpus, requested_mbs, ""); recv_msg(&msg); fprintf(stdout, "%s\n", msg.body.text); } if (x_flag) { send_msg(daemon_pid, 'x', list_pid, 0, ""); } close_log_file(); exit(EXIT_SUCCESS); } // No numad daemon running yet. // First, make note of any reserved CPUs.... if (reserved_cpu_str != NULL) { CLEAR_CPU_LIST(reserved_cpu_mask_list_p); int n = add_ids_to_list_from_str(reserved_cpu_mask_list_p, reserved_cpu_str); char buf[BUF_SIZE]; str_from_id_list(buf, BUF_SIZE, reserved_cpu_mask_list_p); numad_log(LOG_NOTICE, "Reserving %d CPUs (%s) for non-numad use\n", n, buf); // turn reserved list into a negated mask for later ANDing use... negate_cpu_list(reserved_cpu_mask_list_p); } // If it is a "-w" pre-placement request, handle that without starting // the daemon. Otherwise start the numad daemon. 
if (w_flag) { // Get pre-placement NUMA advice without starting daemon update_nodes(); sleep(2); update_nodes(); numad_log(LOG_NOTICE, "Getting NUMA pre-placement advice for %d CPUs and %d MBs\n", requested_cpus, requested_mbs); id_list_p node_list_p = pick_numa_nodes(-1, requested_cpus, requested_mbs, 0); char buf[BUF_SIZE]; str_from_id_list(buf, BUF_SIZE, node_list_p); fprintf(stdout, "%s\n", buf); close_log_file(); exit(EXIT_SUCCESS); } else if (max_interval > 0) { // Start the numad daemon... check_prereqs(argv[0]); #if (!NO_DAEMON) // Daemonize self... daemon_pid = fork(); if (daemon_pid < 0) { numad_log(LOG_CRIT, "fork() failed\n"); exit(EXIT_FAILURE); } // Parent process now exits if (daemon_pid > 0) { exit(EXIT_SUCCESS); } // Child process continues... umask(S_IWGRP | S_IWOTH); // Reset the file mode int sid = setsid(); // Start a new session if (sid < 0) { numad_log(LOG_CRIT, "setsid() failed\n"); exit(EXIT_FAILURE); } if ((chdir("/")) < 0) { numad_log(LOG_CRIT, "chdir() failed"); exit(EXIT_FAILURE); } daemon_pid = register_numad_pid(); if (daemon_pid != getpid()) { numad_log(LOG_CRIT, "Could not register daemon PID\n"); exit(EXIT_FAILURE); } fclose(stdin); fclose(stdout); if (log_fs != stderr) { fclose(stderr); } #endif // Set up signal handlers struct sigaction sa; memset(&sa, 0, sizeof(sa)); sa.sa_handler = sig_handler; if (sigaction(SIGHUP, &sa, NULL) || sigaction(SIGTERM, &sa, NULL) || sigaction(SIGQUIT, &sa, NULL)) { numad_log(LOG_CRIT, "sigaction does not work?\n"); exit(EXIT_FAILURE); } // Allocate initial process hash table process_hash_table_expand(); // Spawn a thread to handle messages from subsequent invocation requests pthread_mutex_init(&pid_list_mutex, NULL); pthread_mutex_init(&node_info_mutex, NULL); pthread_attr_t attr; if (pthread_attr_init(&attr) != 0) { numad_log(LOG_CRIT, "pthread_attr_init failure\n"); exit(EXIT_FAILURE); } pthread_t tid; if (pthread_create(&tid, &attr, &set_dynamic_options, &tid) != 0) { numad_log(LOG_CRIT, 
"pthread_create failure: setting thread\n"); exit(EXIT_FAILURE); } // Loop here forwever... for (;;) { int interval = max_interval; pthread_mutex_lock(&node_info_mutex); int nodes = update_nodes(); pthread_mutex_unlock(&node_info_mutex); if (nodes > 1) { update_processes(); interval = manage_loads(); if (interval < max_interval) { // Update node info since we moved something nodes = update_nodes(); } } sleep(interval); if (got_sigterm | got_sigquit) { shut_down_numad(); } if (got_sighup) { got_sighup = 0; close_log_file(); open_log_file(); } } if (pthread_attr_destroy(&attr) != 0) { numad_log(LOG_WARNING, "pthread_attr_destroy failure\n"); } pthread_mutex_destroy(&pid_list_mutex); pthread_mutex_destroy(&node_info_mutex); } exit(EXIT_SUCCESS); } numad-0.5+20150602/numad.conf000066400000000000000000000001331253330764600153360ustar00rootroot00000000000000# Config file for numad # # Default INTERVAL is 15 # modify below to change it INTERVAL=15 numad-0.5+20150602/numad.init000077500000000000000000000035231253330764600153650ustar00rootroot00000000000000#!/bin/bash # chkconfig: - 99 1 # description: Control operation of numad which will monitor and # rebalance assignment of NUMA resources ### BEGIN INIT INFO # Provides: numad # Required-Start: cgconfig # Required-Stop: # Should-Start: # Should-Stop: # Default-Start: # Default-Stop: # Short-Description: numad control # Description: ### END INIT INFO if [ $(id -u) -ne 0 ]; then echo "This script can be run by root only. Exiting." exit 4 fi # Source function library. . /etc/rc.d/init.d/functions exec="/usr/bin/numad" prog="numad" config="/etc/numad.conf" [ -e /etc/sysconfig/$prog ] && . /etc/sysconfig/$prog lockfile=/var/lock/subsys/$prog base=${0##*/} start() { [ -x $exec ] || exit 5 [ -f $config ] || exit 6 echo -n $"Starting $prog: " . $config daemon "$exec -i $INTERVAL" retval=$? echo [ $retval -eq 0 ] && touch $lockfile return $retval } stop() { echo -n $"Stopping $prog: " killproc $prog retval=$? 
echo [ $retval -eq 0 ] && rm -f $lockfile return $retval } restart() { stop start } reload() { restart } force_reload() { restart } rh_status() { # run checks to determine if the service is running or use generic status status $prog } rh_status_q() { rh_status >/dev/null 2>&1 } case "$1" in start) rh_status_q && exit 0 $1 ;; stop) rh_status_q || exit 0 $1 ;; restart) $1 ;; reload) rh_status_q || exit 7 $1 ;; force-reload) force_reload ;; status) rh_status ;; condrestart|try-restart) rh_status_q || exit 0 restart ;; *) echo $"Usage: $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}" exit 2 esac exit $? numad-0.5+20150602/numad.logrotate000066400000000000000000000001521253330764600164120ustar00rootroot00000000000000/var/log/numad.log { compress copytruncate maxage 60 missingok rotate 5 size 1M } numad-0.5+20150602/numad.service000066400000000000000000000003051253330764600160520ustar00rootroot00000000000000[Unit] Description=numad - The NUMA daemon that manages application locality. Documentation=man:numad [Service] Type=forking ExecStart=/usr/bin/numad -i 15 [Install] WantedBy=multi-user.target