das_watchdog-0.9.0/0000775000076400007640000000000011145611524013507 5ustar kjetilkjetildas_watchdog-0.9.0/test_rt.c0000644000076400007640000000177510374455424015357 0ustar kjetilkjetil#include #include #include #include #include #include int main() { struct sched_param params; int done = 0; struct timeval tv; time_t startsec; unsigned long int loops; int outer_loops; params.sched_priority = 1; pthread_setschedparam(pthread_self(), SCHED_FIFO, &params); { struct sched_param par; par.sched_priority = sched_get_priority_min(SCHED_FIFO); if(sched_setscheduler(0,SCHED_FIFO,&par)==-1){ fprintf(stderr,"das_watchdog: Unable to set SCHED_FIFO realtime priority for the watchdog thread. Exiting.\n"); return 0; } } gettimeofday(&tv,0); startsec = tv.tv_sec; printf("My pid is %d\n",getpid()); sleep(1); #if 0 while(!done) { gettimeofday(&tv, 0); if(tv.tv_sec - startsec > 10) done = 1; } #else for (outer_loops = 0; outer_loops < 10; ++outer_loops) { for (loops = 0; loops < 1000000000; loops++) { done += 1; //sched_yield(); } } #endif exit (EXIT_SUCCESS); } das_watchdog-0.9.0/das_watchdog.rc0000755000076400007640000000222011145560612016462 0ustar kjetilkjetil#! /bin/sh # #Written by Miquel van Smoorenburg . #Modified for Debian GNU/Linux #by Ian Murdock . # Modified for das_watchdog by # Stefan kerstens comments: # i've attached a simple /etc/init.d script; on debian you can # do: # cp das_watchdog.rc /etc/init.d/das_watchdog # update-rc.d das_watchdog defaults # invoke-rc.d das_watchdog start PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin DAEMON=/usr/local/sbin/das_watchdog NAME=das_watchdog DESC="Watchdog" test -x $DAEMON || exit 0 set -e case "$1" in start) echo -n "Starting $DESC: $NAME" start-stop-daemon --start --quiet --background --exec $DAEMON echo "." ;; stop) echo -n "Stopping $DESC: $NAME" start-stop-daemon --stop --quiet --oknodo --exec $DAEMON echo "." ;; restart|reload|force-reload) echo -n "Restarting $DESC: $NAME... 
" start-stop-daemon --stop --quiet --oknodo --exec $DAEMON start-stop-daemon --start --quiet --background --exec $DAEMON echo "done." ;; *) N=/etc/init.d/$NAME echo "Usage: $N {start|stop|restart|reload|force-reload}" >&2 exit 1 ;; esac exit 0 das_watchdog-0.9.0/Makefile0000644000076400007640000000100411145611473015143 0ustar kjetilkjetil VERSION=0.9.0 FLAGS=-O2 `pkg-config --libs --cflags libgtop-2.0` -Wall -lpthread -DWHICH_XMESSAGE=\"`which xmessage`\" -DVERSION=\"$(VERSION)\" TAR=das_watchdog DIST=$(TAR)-$(VERSION) all: which xmessage gcc das_watchdog.c -o das_watchdog $(FLAGS) gcc test_rt.c -o test_rt dist: make clean rm -fr $(DIST).tar.gz /tmp/$(DIST) mkdir /tmp/$(DIST) cp -a * /tmp/$(DIST) mv /tmp/$(DIST) . tar cvf $(DIST).tar $(DIST) gzip $(DIST).tar marcel_upload $(DIST).tar.gz clean: rm -f das_watchdog test_rt *~ das_watchdog-0.9.0/README0000644000076400007640000001300211145563702014365 0ustar kjetilkjetil Das_Watchdog V0.9.0 Released 14.2.2009 Kjetil S. Matheussen, k.s.matheussen@notam02.no ABOUT ----- Das_Watchdog is a general watchdog for the linux operating system that should run in the background at all times to ensure a realtime process won't hang the machine. Das_Watchdog is inspired by the rt_watchdog program made by Florian Schmidt: http://tapas.affenbande.org/?page_id=38 But das_watchdog has some improvements over rt_watchdog: 1. It works with 2.4 kernels as well as 2.6. 2. Instead of permanently setting all realtime processes to run non-realtime, das_watchdog only sets them temporary. 3. When the watchdog kicks in, an X window should pop up that tells you whats happening. (just close it after reading the message). REQUIREMENTS ------------ xmessage (should be a part of X11) libgtop2 (should be included with gnome. No, das_watchdog is not a gnome-program, it only uses libgtop2.) 
COMPILATION ---------- make FEDORA INSTALLATION ------------------- cp das_watchdog /usr/local/sbin/ echo '/usr/local/sbin/das_watchdog >/dev/null &' >>/etc/rc.sysinit reboot GENTOO INSTALLATION ------------------- # from the proaudio repository: emerge -va das_watchdog reboot DEBIAN INSTALLATION ------------------- cp das_watchdog /usr/local/sbin/ cp das_watchdog.rc /etc/init.d/das_watchdog update-rc.d das_watchdog defaults invoke-rc.d das_watchdog start reboot USAGE ----- Whenever a program locks up the machine, the watchdog temporarily sets all realtime processes to non-realtime for 8 seconds. You will get an xmessage window up on the screen whenever that happens. To test it, run the attached program "test_rt", which immediately freezes your machine. However, a window should pop up after about 5-6 seconds telling you that the watchdog set the process to non-realtime. (If you have two processors, you must run test_rt two times, and so on.) CHANGES ------- 0.3.2->0.9.0 * Removed timer process testing. This was only a problem with older 2.6 kernels. (think it was fixed early 2006 or thereabout). No scary messages printed to the screen anymore. * Tested on Fedora core 10. * Cleaned up documentation a bit and added instructions for installing on Fedora, Gentoo and Debian. 0.3.1->0.3.2 *Fixed manual. Newer linux kernels all seem to have properly working timing, so using the --force option should never be necessary anymore. 0.2.5->0.3.1 *Changed scheme for finding correct XAUTHORITY environment variable. (Now works with Fedora Core 6) Hopefully, these changes should increase the chance of seeing the xmessage and avoid seeing multiple ones. (There's no correct way to do this, so please send me the output of "uname -a" in case you don't see any window) *Added syslogging. *Added the --version argument. 0.2.4->0.2.5 *Let the test thread run with SCHED_FIFO priority as well, using the lowest priority. 
0.2.3->0.2.4 *Test if the xmessage program found during the make process is a valid executable. If not, search the $PATH instead. This should fix it for Gentoo when the pro-audio overlay is updated to at least this version. *Various modifications for the High Res Timer, which should be used instead of setting the timer interrupt process to SCHED_FIFO/99. 0.2.2->0.2.3 *Fixed commandline arguments for increasetime, checktime and waittime. *Nicified source a bit 0.2.1->0.2.2 *Locked down memory. Don't know if it's necessary. 0.2.0->0.2.1 *Cleaned up source a bit. *Properly find number of timer processes. *Added shortcuts for optargs and beautified the source a bit. 0.1.2->0.2.0 *Don't do anything if no process priorities are changed, when watchdogging. *Added the --force option, that sets the priority of all timer processes to FIFO/99. *Added the das_watchdog /etc/init.d script provided by Stefan Kersten. (das_watchdog.rc) *Added the --verbose option. *Check that it's the same process when setting back old priority. *Don't set back to old priority if the priority has been changed in the mean time. *Added options for setting increasetime, checktime and waittime. (--increasetime, --checktime and --waittime) *Don't change the priority of any timer process when watchdogging. *Smaller code cleanups. 0.1.1->0.1.2 * Added check for the ksoftirqd/0 process as well as the softirq-timer/0 process. * Added check for SCHED_OTHER of the timing process as well as priority. * Removed debug-printing. 0.1.0->0.1.1 * Added extensive checks both when compiling and when running about the priority of the "softirq-timer/0" process: - ***If "softirq-timer/0" is not set to a very high priority (99), the watchdog most probably will not work.*** - The default priority for softirq-timer/0 seems to be 1. However, for real time work, it must be set higher to get reliable timing. Set it to 99. 
- If softirq-timer/0 is set to less than 99, das_watchdog will refuse to compile unless you force it to by editing the makefile. When running das_watchdog, it will only give a warning if the priority is set too low. * Changed the DISPLAY environment variable to ":0.0" instead of "localhost:0.0". Seems to work for everyone now. * Switched from libgtop to libgtop2. 0.0.2->0.1.0 * Properly setting the DISPLAY and XAUTHORITY environment variables in various ways to make sure the message is really shown. (It really works now!) 0.0.1->0.0.2 *Use xmessage instead of wish. (much nicer) *Run system("xhost +") and setenv("DISPLAY",":0.0",1) before running xmessage. ACKNOWLEDGEMENT --------------- The program is mentally based on Florian Schmidt's program rt_watchdog. Florian Schmidt also wrote the test_rt program. das_watchdog-0.9.0/das_watchdog.c0000644000076400007640000003706311145557115016316 0ustar kjetilkjetil/* Kjetil Matheussen 2006. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ // Only necessary with old 2.6 kernel (before jan 2006 or thereabout). // 2.4 and newer 2.6 works fine. 
#define TIMERCHECKS 0 #include #include #include #include #include #include #include #include #include #include #include #include #include #if LIBGTOP_MAJOR_VERSION<2 # include #endif #include #include #if LIBGTOP_MAJOR_VERSION<2 typedef u_int64_t ui64; #else typedef guint64 ui64; #endif #define OPTARGS_BEGIN(das_usage) {int lokke;const char *usage=das_usage;for(lokke=1;lokkeproclist=get_proclist(&pll->length); return pll; } static void pll_delete(struct proclistlist *pll){ free(pll->proclist); free(pll); } static pid_t name2pid(char *name){ pid_t pid=-1; int lokke; int num_procs=0; struct das_proclist *proclist=get_proclist(&num_procs); for(lokke=0;lokkelength); int lokke; *num_users=0; for(lokke=0;lokkelength;lokke++){ glibtop_proc_uid uid; glibtop_get_proc_uid(&uid,pll->proclist[lokke].pid); if( ! is_a_member(uid.uid,ret,*num_users)){ // ??? ret[*num_users]=uid.uid; (*num_users)++; } } return ret; } static int gettimerpid(char *name,int cpu){ pid_t pid; char temp[500]; if(name==NULL) name=&temp[0]; sprintf(name,"softirq-timer/%d",cpu); pid=name2pid(name); if(pid==-1){ sprintf(name,"ksoftirqd/%d",cpu); pid=name2pid(name); } return pid; } #if TIMERCHECKS static int checksoftirq2(int force,int cpu){ char name[500]; pid_t pid=gettimerpid(&name[0],cpu); if(pid==-1) return 0; { int policy=sched_getscheduler(pid); int priority=get_pid_priority(pid); if(prioritylength) return 0; { char *xa_filename=get_pid_environ_val(pll->proclist[lokke].pid,"XAUTHORITY"); if(xa_filename!=NULL){ if(send_xmessage(xa_filename,message)==1){ free(xa_filename); return 1; } } free(xa_filename); } return send_xmessage_using_XAUTHORITY(pll,lokke+1,message); } int send_xmessage_using_uids(struct proclistlist *pll, char *message){ int num_users; int lokke; int *uids=get_userlist(pll,&num_users); for(lokke=0;lokkepw_dir); if(send_xmessage(xauthpath,message)==1){ free(uids); return 1; } } free(uids); return 0; } static void xmessage_fork(struct proclistlist *pll){ char message[5000]; 
set_pid_priority(0,SCHED_FIFO,sched_get_priority_min(SCHED_FIFO),"Unable to set SCHED_FIFO for %d (\"%s\"). (%s)", "the xmessage fork"); setenv("DISPLAY",":0.0",1); if( ! xmessage_found) sprintf(message,"xmessage \"WARNING! das_watchdog pauses realtime operations for %d seconds.\"",waittime); else sprintf(message,"%s \"WARNING! das_watchdog pauses realtime operations for %d seconds.\"",WHICH_XMESSAGE,waittime); if(send_xmessage_using_uids(pll,message)==0){ set_pid_priority(0,SCHED_OTHER,0,"Unable to set SCHED_OTHER for %d (\"%s\"). (%s)", "the xmessage fork"); // send_xmessage_using_XAUTHRITY is too heavy to run in realtime. send_xmessage_using_XAUTHORITY(pll,0,message); } pll_delete(pll); } // The SCHED_OTHER thread. static void *counter_func(void *arg){ { set_pid_priority(0,SCHED_FIFO,sched_get_priority_min(SCHED_FIFO),"Unable to set SCHED_FIFO for %d (\"%s\"). (%s)", "the counter_func"); } for(;;){ counter++; if(verbose) print_error(stderr,"counter set to %d",counter); sleep(increasetime); } return NULL; } int main(int argc,char **argv){ pid_t mypid=getpid(); pthread_t counter_thread={0}; int num_cpus=0; int *timerpids; #if TIMERCHECKS int force=0; #endif int testing=0; // Get timer pids { // Find number of timer processes. 
while(gettimerpid(NULL,num_cpus)!=-1) num_cpus++; timerpids=malloc(sizeof(int)*num_cpus); { int cpu=0; for(cpu=0;cpu Prints out version.\n" "[--test] or [-te] -> Run a test to see if xmessage is working.\n") { OPTARG("--verbose","-v") verbose=1; #if TIMERCHECKS OPTARG("--force","-f") force=1; OPTARG("--checkirq","-c") checkirq=1; return(checksoftirq(0)); #endif OPTARG("--increasetime","-it") increasetime=OPTARG_GETINT(); OPTARG("--checktime","-ct") checktime=OPTARG_GETINT(); OPTARG("--waittime","-wt") waittime=OPTARG_GETINT(); OPTARG("--test","-te") testing=1; verbose=1; OPTARG("--version","-ve") printf("Das Version die Uhr Hund %s nach sein bist.\n",VERSION);exit(0); }OPTARGS_END; // Logging to /var/log/messages { openlog("das_watchdog", 0, LOG_DAEMON); syslog(LOG_INFO, "started"); } // Check various. { #if TIMERCHECKS if(force && checksoftirq(force)<0) return -2; checksoftirq(force); #endif if(getuid()!=0){ print_error(stdout,"Warning, you are not running as root. das_watchdog should be run as an init-script at startup, and not as a normal user.\n"); } if(access(WHICH_XMESSAGE,X_OK)!=0){ print_error(stderr,"Warning, \"xmessage\" is not found or is not an executable. I will try to use the $PATH instead. Hopefully that'll work,"); print_error(stderr,"but you might not receive messages to the screen in case das_watchdog has to take action."); xmessage_found=0; } } // Set priority if(1) { if( ! set_pid_priority(0,SCHED_FIFO,sched_get_priority_max(SCHED_FIFO), "Unable to set SCHED_FIFO realtime priority for %d (\"%s\"). (%s). Exiting.", "Der Gewinde nach die Uhr Hund")) return 0; if(mlockall(MCL_CURRENT|MCL_FUTURE)==-1) print_error(stderr,"Could not call mlockalll(MCL_CURRENT|MCL_FUTURE) (%s)",strerror(errno)); } // Start child thread. { pthread_create(&counter_thread,NULL,counter_func,NULL); } // Main loop. (We are never supposed to exit from this one.) 
for(;;){ int lastcounter=counter; sleep(checktime); if(verbose) print_error(stderr," counter read to be %d (lastcounter=%d)",counter,lastcounter); if(lastcounter==counter || testing==1){ int changedsched=0; struct proclistlist *pll=pll_create(); int lokke; if(verbose) print_error(stdout,"Die Uhr Hund stossen sein!"); for(lokke=0;lokkelength;lokke++){ if(pll->proclist[lokke].policy!=SCHED_OTHER && pll->proclist[lokke].pid!=mypid && (!is_a_member(pll->proclist[lokke].pid,timerpids,num_cpus)) ) { struct sched_param par={0}; par.sched_priority=0; if(verbose) print_error(stdout,"Setting pid %d temporarily to SCHED_OTHER.",pll->proclist[lokke].pid); if(set_pid_priority(pll->proclist[lokke].pid,SCHED_OTHER,0,"Could not set pid %d (\"%s\") to SCHED_OTHER (%s).\n","no name")) changedsched++; } } if(changedsched>0 || testing==1){ { char message[5000]; sprintf(message,"realtime operations paused for %d seconds.",waittime); syslog(LOG_INFO,message); } if(fork()==0){ xmessage_fork(pll); return 0; } sleep(waittime); for(lokke=0;lokkelength;lokke++){ if(pll->proclist[lokke].policy != SCHED_OTHER && pll->proclist[lokke].pid != mypid && (!is_a_member(pll->proclist[lokke].pid,timerpids,num_cpus)) && pll->proclist[lokke].start_time == get_pid_start_time(pll->proclist[lokke].pid) ) { if(get_pid_priority(pll->proclist[lokke].pid) != 0 || sched_getscheduler(pll->proclist[lokke].pid) != SCHED_OTHER){ print_error(stderr, "Seems like someone else has changed priority and/or scheduling policy for %d in the mean time. 
I'm not going to do anything.", pll->proclist[lokke].pid); }else{ struct sched_param par={0}; par.sched_priority=pll->proclist[lokke].priority; if(verbose) print_error(stdout,"Setting pid %d back to realtime priority.",pll->proclist[lokke].pid); set_pid_priority(pll->proclist[lokke].pid,pll->proclist[lokke].policy,pll->proclist[lokke].priority,"Could not set pid %d (\"%s\") to SCHED_FIFO/SCHED_RR (%s).\n\n", "no name"); } } } } pll_delete(pll); } if(testing==1) break; } return 0; }