corosync 3.1.7
wd.c
Go to the documentation of this file.
1/*
2 * Copyright (c) 2010-2012 Red Hat, Inc.
3 *
4 * All rights reserved.
5 *
6 * Author: Angus Salkeld <asalkeld@redhat.com>
7 *
8 * This software licensed under BSD license, the text of which follows:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions are met:
12 *
13 * - Redistributions of source code must retain the above copyright notice,
14 * this list of conditions and the following disclaimer.
15 * - Redistributions in binary form must reproduce the above copyright notice,
16 * this list of conditions and the following disclaimer in the documentation
17 * and/or other materials provided with the distribution.
18 * - Neither the name of the MontaVista Software, Inc. nor the names of its
19 * contributors may be used to endorse or promote products derived from this
20 * software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
32 * THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#include <config.h>
36
37#include <unistd.h>
38#include <fcntl.h>
39#include <sys/ioctl.h>
40#include <linux/types.h>
41#include <linux/watchdog.h>
42#include <sys/reboot.h>
43
44#include <corosync/corotypes.h>
45#include <corosync/corodefs.h>
46#include <corosync/coroapi.h>
47#include <qb/qblist.h>
48#include <corosync/logsys.h>
49#include <corosync/icmap.h>
50#include "fsm.h"
51
52#include "service.h"
53
54typedef enum {
60
/*
 * Per-resource monitoring record, passed as the data/user_data pointer to
 * the timer, FSM and cmap callbacks in this file.
 * NOTE(review): only some members are visible in this listing; the
 * comments below cover the visible ones.
 */
61struct resource {
 /* Recovery action name read from the resource's "recovery" key; compared
  * against "watchdog", "quit", "reboot" and "shutdown" when the resource fails. */
63 char *recovery;
 /* State machine instance for this resource. */
66 struct cs_fsm fsm;
67
 /* Poll period in milliseconds; multiplied by MILLI_2_NANO_SECONDS when it
  * is compared against timestamps or used to arm a timer. */
69 uint64_t check_timeout;
71};
72
74
75/*
76 * Service Interfaces required by service_message_handler struct
77 */
78static char *wd_exec_init_fn (struct corosync_api_v1 *corosync_api);
79static int wd_exec_exit_fn (void);
80static void wd_resource_check_fn (void* resource_ref);
81
82static struct corosync_api_v1 *api;
83#define WD_DEFAULT_TIMEOUT_SEC 6
84#define WD_DEFAULT_TIMEOUT_MS (WD_DEFAULT_TIMEOUT_SEC * CS_TIME_MS_IN_SEC)
85#define WD_MIN_TIMEOUT_MS 500
86#define WD_MAX_TIMEOUT_MS (120 * CS_TIME_MS_IN_SEC)
87static uint32_t watchdog_timeout = WD_DEFAULT_TIMEOUT_SEC;
88static uint64_t tickle_timeout = (WD_DEFAULT_TIMEOUT_MS / 2);
89static int dog = -1;
90static corosync_timer_handle_t wd_timer;
91static int watchdog_ok = 1;
92static char *watchdog_device = NULL;
93
95 .name = "corosync watchdog service",
96 .id = WD_SERVICE,
97 .priority = 1,
98 .private_data_size = 0,
100 .lib_init_fn = NULL,
101 .lib_exit_fn = NULL,
102 .lib_engine = NULL,
103 .lib_engine_count = 0,
104 .exec_engine = NULL,
105 .exec_engine_count = 0,
106 .confchg_fn = NULL,
107 .exec_init_fn = wd_exec_init_fn,
108 .exec_exit_fn = wd_exec_exit_fn,
109 .exec_dump_fn = NULL
110};
111
static QB_LIST_DECLARE (confchg_notify);

/*
 * F S M
 *
 * Action callbacks referenced from the state machine table.
 */
static void wd_config_changed (struct cs_fsm *fsm, int32_t event, void *data);
static void wd_resource_failed (struct cs_fsm *fsm, int32_t event, void *data);
119
125
130
/*
 * Printable names of the FSM states and events. wd_stopped_str and
 * wd_failed_str are also compared against a resource's "state" value.
 */
const char *wd_running_str = "running";
const char *wd_failed_str = "failed";
const char *wd_failure_str = "failure";
const char *wd_stopped_str = "stopped";
const char *wd_config_changed_str = "config_changed";
136
138 { WD_S_STOPPED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_STOPPED, WD_S_RUNNING, -1} },
139 { WD_S_STOPPED, WD_E_FAILURE, NULL, {-1} },
140 { WD_S_RUNNING, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_RUNNING, WD_S_STOPPED, -1} },
141 { WD_S_RUNNING, WD_E_FAILURE, wd_resource_failed, {WD_S_FAILED, -1} },
142 { WD_S_FAILED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_RUNNING, WD_S_STOPPED, -1} },
143 { WD_S_FAILED, WD_E_FAILURE, NULL, {-1} },
144};
145
147{
148 return (&wd_service_engine);
149}
150
151static const char * wd_res_state_to_str(struct cs_fsm* fsm,
152 int32_t state)
153{
154 switch (state) {
155 case WD_S_STOPPED:
156 return wd_stopped_str;
157 break;
158 case WD_S_RUNNING:
159 return wd_running_str;
160 break;
161 case WD_S_FAILED:
162 return wd_failed_str;
163 break;
164 }
165 return NULL;
166}
167
/*
 * Translate a resource FSM event into its printable name.
 *
 * fsm   - state machine the event belongs to (not used)
 * event - event value to translate
 *
 * Returns the name string for a known event, NULL otherwise.
 * NOTE(review): the first case of the switch is not visible in this
 * listing; only the WD_E_FAILURE case can be confirmed from here.
 */
168static const char * wd_res_event_to_str(struct cs_fsm* fsm,
169 int32_t event)
170{
171 switch (event) {
174 break;
175 case WD_E_FAILURE:
176 return wd_failure_str;
177 break;
178 }
 /* Unknown event value. */
179 return NULL;
180}
181
/*
 * Callback handed to cs_fsm_process() / cs_fsm_state_set(); it logs what
 * the state machine reported.
 *
 * fsm        - state machine that raised the callback
 * cb_event   - kind of callback, selects the message below
 * curr_state - state the machine was in
 * next_state - state that was requested
 * fsm_event  - event that was being processed
 * data       - caller data (not used)
 *
 * NOTE(review): the case labels of the first three branches, and any
 * statement between each log call and its break, are not visible in this
 * listing.
 * NOTE(review): event_to_str/state_to_str results go straight to "%s";
 * wd_res_state_to_str() can return NULL for an unknown value.
 */
182static void wd_fsm_cb (struct cs_fsm *fsm, int cb_event, int32_t curr_state,
183 int32_t next_state, int32_t fsm_event, void *data)
184{
185 switch (cb_event) {
 /* No table entry handles this event in the current state. */
187 log_printf (LOGSYS_LEVEL_ERROR, "Fsm:%s could not find event \"%s\" in state \"%s\"",
188 fsm->name, fsm->event_to_str(fsm, fsm_event), fsm->state_to_str(fsm, curr_state));
190 break;
 /* A state change was carried out. */
192 log_printf (LOGSYS_LEVEL_INFO, "Fsm:%s event \"%s\", state \"%s\" --> \"%s\"",
193 fsm->name,
194 fsm->event_to_str(fsm, fsm_event),
195 fsm->state_to_str(fsm, fsm->table[fsm->curr_entry].curr_state),
196 fsm->state_to_str(fsm, next_state));
197 break;
 /* The requested state change was rejected. */
199 log_printf (LOGSYS_LEVEL_CRIT, "Fsm:%s Can't change state from \"%s\" to \"%s\" (event was \"%s\")",
200 fsm->name,
201 fsm->state_to_str(fsm, fsm->table[fsm->curr_entry].curr_state),
202 fsm->state_to_str(fsm, next_state),
203 fsm->event_to_str(fsm, fsm_event));
205 break;
206 default:
207 log_printf (LOGSYS_LEVEL_CRIT, "Fsm: Unknown callback event!");
209 break;
210 }
211}
212
213/*
214 * returns (CS_TRUE == OK, CS_FALSE == failed)
215 */
/*
 * Decide whether a monitored resource is currently healthy, based on its
 * "last_updated" and "state" keys under ref->res_path.
 *
 * Returns CS_FALSE when either key cannot be read, when state is
 * "disabled" or equals wd_failed_str, or when last_updated is older than
 * 1.5 * check_timeout. Returns CS_TRUE otherwise, including when
 * last_updated is still 0.
 */
216static int32_t wd_resource_state_is_ok (struct resource *ref)
217{
218 char* state = NULL;
219 uint64_t last_updated;
220 uint64_t my_time;
221 uint64_t allowed_period;
222 char key_name[ICMAP_KEYNAME_MAXLEN];
223
 /* A key name that does not fit the buffer is treated like a missing key. */
224 if ((snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "last_updated") >= ICMAP_KEYNAME_MAXLEN) ||
225 (icmap_get_uint64(key_name, &last_updated) != CS_OK)) {
226 /* key does not exist.
227 */
228 return CS_FALSE;
229 }
230
 /* This branch is also taken when the state key exists and is "disabled",
  * in which case state holds a string that must be released. */
231 if ((snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "state") >= ICMAP_KEYNAME_MAXLEN) ||
232 (icmap_get_string(key_name, &state) != CS_OK || strcmp(state, "disabled") == 0)) {
233 /* key does not exist.
234 */
235 if (state != NULL)
236 free(state);
237
238 return CS_FALSE;
239 }
240
241 if (last_updated == 0) {
242 /* initial value */
243 free(state);
244 return CS_TRUE;
245 }
246
247 my_time = cs_timestamp_get();
248
249 /*
250 * Here we check that the monitor has written a timestamp within the poll_period
251 * plus a grace factor of (0.5 * poll_period).
252 */
 /* check_timeout is scaled by MILLI_2_NANO_SECONDS so it can be compared
  * with the timestamps. */
253 allowed_period = (ref->check_timeout * MILLI_2_NANO_SECONDS * 3) / 2;
254 if ((last_updated + allowed_period) < my_time) {
 /* NOTE(review): the line that opens this log call is not visible in this
  * listing. The first argument is how far past the deadline we are. */
256 "last_updated %"PRIu64" ms too late, period:%"PRIu64".",
257 (uint64_t)(my_time/MILLI_2_NANO_SECONDS - ((last_updated + allowed_period) / MILLI_2_NANO_SECONDS)),
258 ref->check_timeout);
259 free(state);
260 return CS_FALSE;
261 }
262
 /* Timestamp is fresh; the resource may still report itself as failed. */
263 if (strcmp (state, wd_failed_str) == 0) {
264 free(state);
265 return CS_FALSE;
266 }
267
268 free(state);
269 return CS_TRUE;
270}
271
/*
 * FSM action for WD_E_CONFIG_CHANGED: re-read the resource's "poll_period",
 * "recovery" and "state" keys, then stop the resource or re-arm its check
 * timer.
 *
 * fsm   - state machine of the resource (ref->fsm is used instead)
 * event - triggering event (not used)
 * data  - the struct resource being reconfigured
 *
 * The resource ends in WD_S_STOPPED when the recovery or state key is
 * missing or when state equals wd_stopped_str, and in WD_S_RUNNING
 * otherwise.
 */
272static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data)
273{
274 char *state;
275 uint64_t tmp_value;
276 uint64_t next_timeout;
277 struct resource *ref = (struct resource*)data;
278 char key_name[ICMAP_KEYNAME_MAXLEN];
279
280 next_timeout = ref->check_timeout;
281
 /* NOTE(review): this condition looks wrong in two ways. The lookup passes
  * ref->res_path (the prefix) rather than the key_name built just before it,
  * and the "||" enters the block when snprintf truncated, in which case
  * tmp_value is read uninitialised. Presumably it should be
  * "snprintf(...) < ICMAP_KEYNAME_MAXLEN && icmap_get_uint64(key_name, ...) == CS_OK"
  * - confirm and fix. */
282 if ((snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "poll_period") >= ICMAP_KEYNAME_MAXLEN) ||
283 (icmap_get_uint64(ref->res_path, &tmp_value) == CS_OK)) {
284 if (tmp_value >= WD_MIN_TIMEOUT_MS && tmp_value <= WD_MAX_TIMEOUT_MS) {
 /* NOTE(review): the line that opens this log call is not visible here. */
286 "poll_period changing from:%"PRIu64" to %"PRIu64".",
287 ref->check_timeout, tmp_value);
288 /*
289 * To ease the transition between poll_period's we are going
290 * to make the first timeout the bigger of the new and old value.
291 * This is to give the monitoring system time to adjust.
292 */
293 next_timeout = CS_MAX(tmp_value, ref->check_timeout);
294 ref->check_timeout = tmp_value;
295 } else {
 /* Out-of-range value: keep the previous check_timeout.
  * NOTE(review): the line that opens this log call is not visible here. */
297 "Could NOT use poll_period:%"PRIu64" ms for resource %s",
298 tmp_value, ref->name);
299 }
300 }
301
 /* NOTE(review): ref->recovery is overwritten without releasing the string
  * stored earlier; this looks like a leak on every change, given that
  * strings returned by icmap_get_string() are free()d elsewhere in this file. */
302 if ((snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "recovery") >= ICMAP_KEYNAME_MAXLEN) ||
303 (icmap_get_string(key_name, &ref->recovery) != CS_OK)) {
304 /* key does not exist.
305 */
307 "resource %s missing a recovery key.", ref->name);
308 cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref, wd_fsm_cb);
309 return;
310 }
311 if ((snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "state") >= ICMAP_KEYNAME_MAXLEN) ||
312 (icmap_get_string(key_name, &state) != CS_OK)) {
313 /* key does not exist.
314 */
316 "resource %s missing a state key.", ref->name);
317 cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref, wd_fsm_cb);
318 return;
319 }
 /* Drop any pending check before deciding whether to schedule a new one. */
320 if (ref->check_timer) {
321 api->timer_delete(ref->check_timer);
322 ref->check_timer = 0;
323 }
324
325 if (strcmp(wd_stopped_str, state) == 0) {
326 cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref, wd_fsm_cb);
327 } else {
 /* First check uses next_timeout (the larger of old and new period). */
328 api->timer_add_duration(next_timeout * MILLI_2_NANO_SECONDS,
329 ref, wd_resource_check_fn, &ref->check_timer);
330 cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref, wd_fsm_cb);
331 }
332 free(state);
333}
334
335static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data)
336{
337 struct resource* ref = (struct resource*)data;
338
339 if (ref->check_timer) {
340 api->timer_delete(ref->check_timer);
341 ref->check_timer = 0;
342 }
343
344 log_printf (LOGSYS_LEVEL_CRIT, "%s resource \"%s\" failed!",
345 ref->recovery, (char*)ref->name);
346 if (strcmp (ref->recovery, "watchdog") == 0 ||
347 strcmp (ref->recovery, "quit") == 0) {
348 watchdog_ok = 0;
349 }
350 else if (strcmp (ref->recovery, "reboot") == 0) {
351 reboot(RB_AUTOBOOT);
352 }
353 else if (strcmp (ref->recovery, "shutdown") == 0) {
354 reboot(RB_POWER_OFF);
355 }
356 cs_fsm_state_set(fsm, WD_S_FAILED, data, wd_fsm_cb);
357}
358
/*
 * cmap notification callback for the keys of one resource.
 *
 * event     - ICMAP_TRACK_ADD, ICMAP_TRACK_MODIFY or ICMAP_TRACK_DELETE
 * key_name  - full name of the key that changed
 * new_val   - new value (not used)
 * old_val   - old value (not used)
 * user_data - the struct resource the tracking was registered for
 *
 * Added or modified keys other than "last_updated" and "current" raise
 * WD_E_CONFIG_CHANGED on the resource. Deleting the "state" key stops the
 * check timer and frees the resource.
 */
359static void wd_key_changed(
360 int32_t event,
361 const char *key_name,
362 struct icmap_notify_value new_val,
363 struct icmap_notify_value old_val,
364 void *user_data)
365{
366 struct resource* ref = (struct resource*)user_data;
367 char *last_key_part;
368
369 if (ref == NULL) {
370 return ;
371 }
372
 /* Only the part after the last '.' is inspected. */
373 last_key_part = strrchr(key_name, '.');
374 if (last_key_part == NULL) {
375 return ;
376 }
377 last_key_part++;
378
379 if (event == ICMAP_TRACK_ADD || event == ICMAP_TRACK_MODIFY) {
 /* These two keys are skipped so they do not trigger a reconfiguration. */
380 if (strcmp(last_key_part, "last_updated") == 0 ||
381 strcmp(last_key_part, "current") == 0) {
382 return;
383 }
384
385 cs_fsm_process(&ref->fsm, WD_E_CONFIG_CHANGED, ref, wd_fsm_cb);
386 }
387
 /* The "ref != NULL" test is redundant: NULL was rejected at the top. */
388 if (event == ICMAP_TRACK_DELETE && ref != NULL) {
389 if (strcmp(last_key_part, "state") != 0) {
390 return ;
391 }
392
 /* NOTE(review): the line that opens this log call is not visible here. */
394 "resource \"%s\" deleted from cmap!",
395 ref->name);
396
397 api->timer_delete(ref->check_timer);
398 ref->check_timer = 0;
 /* NOTE(review): one statement between here and free() is not visible in
  * this listing. ref->recovery is not released in the visible lines -
  * confirm whether that string is leaked. */
400
401 free(ref);
402 }
403}
404
/*
 * Timer callback that checks one resource.
 *
 * resource_ref - the struct resource to check
 *
 * When wd_resource_state_is_ok() reports CS_FALSE, WD_E_FAILURE is fed to
 * the resource's state machine and the function returns early.
 */
405static void wd_resource_check_fn (void* resource_ref)
406{
407 struct resource* ref = (struct resource*)resource_ref;
408
409 if (wd_resource_state_is_ok (ref) == CS_FALSE) {
410 cs_fsm_process(&ref->fsm, WD_E_FAILURE, ref, wd_fsm_cb);
411 return;
412 }
 /* NOTE(review): the line that opens this call is not visible in this
  * listing; from its arguments it presumably re-arms ref->check_timer with
  * this same function as the handler. */
414 ref, wd_resource_check_fn, &ref->check_timer);
415}
416
417/*
418 * return 0 - fully configured
419 * return -1 - partially configured
420 */
/*
 * Allocate and set up the monitoring record for one resource.
 *
 * res_path - cmap prefix of the resource, ending in '.'
 * res_name - resource name, copied into ref->name and used as FSM name
 *
 * Returns 0 when the resource was put into WD_S_RUNNING with its first
 * check scheduled, -1 when the "recovery" or "state" key is missing.
 *
 * NOTE(review): several lines of this function are not visible in this
 * listing (initial check_timeout / FSM state, the heads of the log calls,
 * the head of the call that registers wd_key_changed and of the call that
 * schedules the first check).
 */
421static int32_t wd_resource_create (char *res_path, char *res_name)
422{
423 char *state;
424 uint64_t tmp_value;
 /* NOTE(review): the calloc() result is used below without a NULL check. */
425 struct resource *ref = calloc (1, sizeof (struct resource));
426 char key_name[ICMAP_KEYNAME_MAXLEN];
427
 /* NOTE(review): unbounded copies here and for ref->name below; they rely
  * on the callers passing strings that fit the destination arrays. */
428 strcpy(ref->res_path, res_path);
430 ref->check_timer = 0;
431
432 strcpy(ref->name, res_name);
433 ref->fsm.name = ref->name;
434 ref->fsm.table = wd_fsm_table;
435 ref->fsm.entries = sizeof(wd_fsm_table) / sizeof(struct cs_fsm_entry);
436 ref->fsm.curr_entry = 0;
438 ref->fsm.state_to_str = wd_res_state_to_str;
439 ref->fsm.event_to_str = wd_res_event_to_str;
440
 /* Publish the current poll period when the key is absent; otherwise
  * adopt the configured value if it is within the allowed range. */
441 snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "poll_period");
442 if (icmap_get_uint64(key_name, &tmp_value) != CS_OK) {
443 icmap_set_uint64(key_name, ref->check_timeout);
444 } else {
445 if (tmp_value >= WD_MIN_TIMEOUT_MS && tmp_value <= WD_MAX_TIMEOUT_MS) {
446 ref->check_timeout = tmp_value;
447 } else {
449 "Could NOT use poll_period:%"PRIu64" ms for resource %s",
450 tmp_value, ref->name);
451 }
452 }
453
 /* Tail of the call that registers wd_key_changed with ref as user data
  * and stores the handle in ref->icmap_track. */
456 wd_key_changed,
457 ref, &ref->icmap_track);
458
459 snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "recovery");
460 if (icmap_get_string(key_name, &ref->recovery) != CS_OK) {
461 /* key does not exist.
462 */
464 "resource %s missing a recovery key.", ref->name);
 /* ref is not freed here; it was already handed to the registration above. */
465 return -1;
466 }
 /* NOTE(review): state is fetched only to test that the key exists; it is
  * not freed on any visible path, which looks like a leak. */
467 snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "state");
468 if (icmap_get_string(key_name, &state) != CS_OK) {
469 /* key does not exist.
470 */
472 "resource %s missing a state key.", ref->name);
473 return -1;
474 }
475
476 snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "last_updated");
477 if (icmap_get_uint64(key_name, &tmp_value) != CS_OK) {
478 /* key does not exist.
479 */
480 ref->last_updated = 0;
481 } else {
482 ref->last_updated = tmp_value;
483 }
484
485 /*
486 * delay the first check to give the monitor time to start working.
487 */
488 tmp_value = CS_MAX(ref->check_timeout * 2, WD_DEFAULT_TIMEOUT_MS);
490 ref,
491 wd_resource_check_fn, &ref->check_timer);
492
493 cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref, wd_fsm_cb);
494 return 0;
495}
496
497
498static void wd_tickle_fn (void* arg)
499{
500 ENTER();
501
502 if (watchdog_ok) {
503 if (dog > 0) {
504 ioctl(dog, WDIOC_KEEPALIVE, &watchdog_ok);
505 }
506 api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
507 wd_tickle_fn, &wd_timer);
508 }
509 else {
510 log_printf (LOGSYS_LEVEL_ALERT, "NOT tickling the watchdog!");
511 }
512
513}
514
515static void wd_resource_created_cb(
516 int32_t event,
517 const char *key_name,
518 struct icmap_notify_value new_val,
519 struct icmap_notify_value old_val,
520 void *user_data)
521{
522 char res_name[ICMAP_KEYNAME_MAXLEN];
523 char res_type[ICMAP_KEYNAME_MAXLEN];
524 char tmp_key[ICMAP_KEYNAME_MAXLEN];
525 int res;
526
527 if (event != ICMAP_TRACK_ADD) {
528 return ;
529 }
530
531 res = sscanf(key_name, "resources.%[^.].%[^.].%[^.]", res_type, res_name, tmp_key);
532 if (res != 3) {
533 return ;
534 }
535
536 if (strcmp(tmp_key, "state") != 0) {
537 return ;
538 }
539
540 snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "resources.%s.%s.", res_type, res_name);
541 wd_resource_create (tmp_key, res_name);
542}
543
/*
 * Walk every key below "resources." and create a resource for each
 * "resources.<type>.<name>.state" key found, then register
 * wd_resource_created_cb for keys added later.
 *
 * NOTE(review): the declaration of icmap_track, the statement that
 * follows the loop and the head of the second registration call are not
 * visible in this listing. Whether the iterator is finalised cannot be
 * confirmed from here.
 */
544static void wd_scan_resources (void)
545{
 /* Counts resources for which wd_resource_create() returned 0. */
546 int res_count = 0;
548 icmap_iter_t iter;
549 const char *key_name;
550 int res;
551 char res_name[ICMAP_KEYNAME_MAXLEN];
552 char res_type[ICMAP_KEYNAME_MAXLEN];
553 char tmp_key[ICMAP_KEYNAME_MAXLEN];
554
555 ENTER();
556
557 iter = icmap_iter_init("resources.");
558 while ((key_name = icmap_iter_next(iter, NULL, NULL)) != NULL) {
 /* Split the key into type, name and last component; skip other shapes. */
559 res = sscanf(key_name, "resources.%[^.].%[^.].%[^.]", res_type, res_name, tmp_key);
560 if (res != 3) {
561 continue ;
562 }
563
564 if (strcmp(tmp_key, "state") != 0) {
565 continue ;
566 }
567
 /* tmp_key is reused to hold the resource prefix. */
568 snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "resources.%s.%s.", res_type, res_name);
569 if (wd_resource_create (tmp_key, res_name) == 0) {
570 res_count++;
571 }
572 }
574
575 icmap_track_add("resources.process.", ICMAP_TRACK_ADD | ICMAP_TRACK_PREFIX,
576 wd_resource_created_cb, NULL, &icmap_track);
 /* Tail of a second registration of the same callback; the prefix it
  * tracks is on the line that is not visible. */
578 wd_resource_created_cb, NULL, &icmap_track);
579
580 if (res_count == 0) {
581 log_printf (LOGSYS_LEVEL_INFO, "no resources configured.");
582 }
583}
584
585
/*
 * Apply a new watchdog timeout.
 *
 * new - requested timeout in seconds
 *
 * Nothing happens when the device already reports this timeout. Otherwise
 * the value is stored in watchdog_timeout, pushed to the device when it
 * advertises WDIOF_SETTIMEOUT, and read back. If the read-back value
 * matches, tickle_timeout becomes half the timeout (in ms) and the tickle
 * timer is restarted.
 */
586static void watchdog_timeout_apply (uint32_t new)
587{
588 struct watchdog_info ident;
589 uint32_t original_timeout = 0;
590
 /* NOTE(review): "dog > 0" treats descriptor 0 as "no device", although
  * the sentinel used elsewhere in the file is -1. */
591 if (dog > 0) {
592 ioctl(dog, WDIOC_GETTIMEOUT, &original_timeout);
593 }
594
595 if (new == original_timeout) {
596 return;
597 }
598
599 watchdog_timeout = new;
600
601 if (dog > 0) {
 /* NOTE(review): the ioctl result is not checked; if it fails, ident is
  * read uninitialised on the next line. */
602 ioctl(dog, WDIOC_GETSUPPORT, &ident);
603 if (ident.options & WDIOF_SETTIMEOUT) {
604 /* yay! the dog is trained.
605 */
606 ioctl(dog, WDIOC_SETTIMEOUT, &watchdog_timeout);
607 }
 /* Read back what the device actually uses. */
608 ioctl(dog, WDIOC_GETTIMEOUT, &watchdog_timeout);
609 }
610
611 if (watchdog_timeout == new) {
612 tickle_timeout = (watchdog_timeout * CS_TIME_MS_IN_SEC)/ 2;
613
614 /* reset the tickle timer in case it was reduced.
615 */
616 api->timer_delete (wd_timer);
617 api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
618 wd_tickle_fn, &wd_timer);
619
 /* NOTE(review): "%d" is used with uint32_t values here and below. */
620 log_printf (LOGSYS_LEVEL_DEBUG, "The Watchdog timeout is %d seconds", watchdog_timeout);
621 log_printf (LOGSYS_LEVEL_DEBUG, "The tickle timeout is %"PRIu64" ms", tickle_timeout);
622 } else {
 /* NOTE(review): the line that opens this log call is not visible here. */
624 "Could not change the Watchdog timeout from %d to %d seconds",
625 original_timeout, new);
626 }
627
628}
629
630static int setup_watchdog(void)
631{
632 struct watchdog_info ident;
633 char *str;
634
635 ENTER();
636
637 if (icmap_get_string("resources.watchdog_device", &str) == CS_OK) {
638 if (str[0] == 0 || strcmp (str, "off") == 0) {
639 log_printf (LOGSYS_LEVEL_WARNING, "Watchdog disabled by configuration");
640 free(str);
641 dog = -1;
642 return -1;
643 } else {
644 watchdog_device = str;
645 }
646 } else {
647 log_printf (LOGSYS_LEVEL_WARNING, "Watchdog not enabled by configuration");
648 dog = -1;
649 return -1;
650 }
651
652 if (access (watchdog_device, W_OK) != 0) {
653 log_printf (LOGSYS_LEVEL_WARNING, "No watchdog %s, try modprobe <a watchdog>", watchdog_device);
654 dog = -1;
655 return -1;
656 }
657
658 /* here goes, lets hope they have "Magic Close"
659 */
660 dog = open(watchdog_device, O_WRONLY);
661
662 if (dog == -1) {
663 log_printf (LOGSYS_LEVEL_WARNING, "Watchdog %s exists but couldn't be opened.", watchdog_device);
664 dog = -1;
665 return -1;
666 }
667
668 /* Right we have the dog.
669 * Lets see what breed it is.
670 */
671
672 ioctl(dog, WDIOC_GETSUPPORT, &ident);
673 log_printf (LOGSYS_LEVEL_INFO, "Watchdog %s is now being tickled by corosync.", watchdog_device);
674 log_printf (LOGSYS_LEVEL_DEBUG, "%s", ident.identity);
675
676 watchdog_timeout_apply (watchdog_timeout);
677
678 ioctl(dog, WDIOC_SETOPTIONS, WDIOS_ENABLECARD);
679
680 return 0;
681}
682
/*
 * cmap notification callback for "resources.watchdog_timeout".
 *
 * The callback arguments are not used; the key is read again instead.
 * A value in the range 2..120 is handed to watchdog_timeout_apply().
 * Otherwise the current watchdog_timeout is written back to the key.
 */
683static void wd_top_level_key_changed(
684 int32_t event,
685 const char *key_name,
686 struct icmap_notify_value new_val,
687 struct icmap_notify_value old_val,
688 void *user_data)
689{
690 uint32_t tmp_value_32;
691
692 ENTER();
693
694 if (icmap_get_uint32("resources.watchdog_timeout", &tmp_value_32) == CS_OK) {
695 if (tmp_value_32 >= 2 && tmp_value_32 <= 120) {
696 watchdog_timeout_apply (tmp_value_32);
697 return;
698 }
699 }
700
 /* Reached when the key could not be read or is out of range.
  * NOTE(review): the line that opens this log call is not visible here. */
702 "Set watchdog_timeout is out of range (2..120).");
703 icmap_set_uint32("resources.watchdog_timeout", watchdog_timeout);
704}
705
/*
 * Establish the watchdog timeout at start-up and begin tracking changes
 * to "resources.watchdog_timeout".
 *
 * A configured value in the range 2..120 is applied as is. When the key
 * is missing or out of range, WD_DEFAULT_TIMEOUT_SEC is applied and the
 * resulting watchdog_timeout is written to the key.
 *
 * NOTE(review): the declaration of icmap_track and the heads of the two
 * log calls are not visible in this listing.
 */
706static void watchdog_timeout_get_initial (void)
707{
708 uint32_t tmp_value_32;
710
711 ENTER();
712
713 if (icmap_get_uint32("resources.watchdog_timeout", &tmp_value_32) != CS_OK) {
714 watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
715
716 icmap_set_uint32("resources.watchdog_timeout", watchdog_timeout);
717 }
718 else {
719 if (tmp_value_32 >= 2 && tmp_value_32 <= 120) {
720 watchdog_timeout_apply (tmp_value_32);
721 }
722 else {
724 "Set watchdog_timeout is out of range (2..120).");
726 "use default value %d seconds.", WD_DEFAULT_TIMEOUT_SEC);
727 watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
728 icmap_set_uint32("resources.watchdog_timeout", watchdog_timeout);
729 }
730 }
731
 /* Later modifications of the key are handled by wd_top_level_key_changed. */
732 icmap_track_add("resources.watchdog_timeout", ICMAP_TRACK_MODIFY,
733 wd_top_level_key_changed, NULL, &icmap_track);
734
735}
736
737static char *wd_exec_init_fn (struct corosync_api_v1 *corosync_api)
738{
739
740 ENTER();
741
742 api = corosync_api;
743
744 watchdog_timeout_get_initial();
745
746 setup_watchdog();
747
748 wd_scan_resources();
749
750 return NULL;
751}
752
753static int wd_exec_exit_fn (void)
754{
755 char magic = 'V';
756 ENTER();
757
758 if (dog > 0) {
759 log_printf (LOGSYS_LEVEL_INFO, "magically closing the watchdog.");
760 if (write (dog, &magic, 1) == -1) {
761 log_printf (LOGSYS_LEVEL_ERROR, "failed to write %c to dog(%d).", magic, dog);
762 }
763 }
764 return 0;
765}
766
767
qb_loop_timer_handle corosync_timer_handle_t
corosync_timer_handle_t
Definition: coroapi.h:74
@ CS_LIB_FLOW_CONTROL_NOT_REQUIRED
Definition: coroapi.h:153
#define MILLI_2_NANO_SECONDS
Definition: coroapi.h:105
@ WD_SERVICE
Definition: corodefs.h:51
#define CS_TIME_MS_IN_SEC
Definition: corotypes.h:133
#define CS_MAX(x, y)
Definition: corotypes.h:57
#define CS_FALSE
Definition: corotypes.h:53
#define CS_TRUE
Definition: corotypes.h:54
#define CS_MAX_NAME_LENGTH
Definition: corotypes.h:55
@ CS_OK
Definition: corotypes.h:99
QB_LIST_DECLARE(cpg_pd_list_head)
#define corosync_exit_error(err)
Definition: exec/util.h:72
@ COROSYNC_DONE_FATAL_ERR
Definition: exec/util.h:55
#define CS_FSM_CB_EVENT_PROCESS_NF
Definition: fsm.h:54
#define CS_FSM_CB_EVENT_STATE_SET
Definition: fsm.h:55
#define CS_FSM_CB_EVENT_STATE_SET_NF
Definition: fsm.h:56
#define ICMAP_TRACK_MODIFY
Definition: icmap.h:78
cs_error_t icmap_get_uint32(const char *key_name, uint32_t *u32)
Definition: icmap.c:892
#define ICMAP_TRACK_DELETE
Definition: icmap.h:77
cs_error_t icmap_track_add(const char *key_name, int32_t track_type, icmap_notify_fn_t notify_fn, void *user_data, icmap_track_t *icmap_track)
Add tracking function for given key_name.
Definition: icmap.c:1159
#define ICMAP_TRACK_PREFIX
Whole prefix is tracked, instead of key only (so "totem." tracking means that "totem....
Definition: icmap.h:85
icmap_iter_t icmap_iter_init(const char *prefix)
Initialize iterator with given prefix.
Definition: icmap.c:1089
const char * icmap_iter_next(icmap_iter_t iter, size_t *value_len, icmap_value_types_t *type)
Return next item in iterator iter.
Definition: icmap.c:1095
qb_map_iter_t * icmap_iter_t
Itterator type.
Definition: icmap.h:123
void icmap_iter_finalize(icmap_iter_t iter)
Finalize iterator.
Definition: icmap.c:1116
cs_error_t icmap_track_delete(icmap_track_t icmap_track)
Remove previously added track.
Definition: icmap.c:1204
cs_error_t icmap_set_uint64(const char *key_name, uint64_t value)
Definition: icmap.c:609
#define ICMAP_KEYNAME_MAXLEN
Maximum length of key in icmap.
Definition: icmap.h:48
cs_error_t icmap_set_uint32(const char *key_name, uint32_t value)
Definition: icmap.c:597
#define ICMAP_TRACK_ADD
Definition: icmap.h:76
cs_error_t icmap_get_uint64(const char *key_name, uint64_t *u64)
Definition: icmap.c:904
cs_error_t icmap_get_string(const char *key_name, char **str)
Shortcut for icmap_get for string type.
Definition: icmap.c:856
#define LOGSYS_LEVEL_ERROR
Definition: logsys.h:72
#define log_printf(level, format, args...)
Definition: logsys.h:332
#define LOGSYS_LEVEL_INFO
Definition: logsys.h:75
#define LOGSYS_LEVEL_CRIT
Definition: logsys.h:71
#define LOGSYS_LEVEL_WARNING
Definition: logsys.h:73
#define LOGSYS_LEVEL_DEBUG
Definition: logsys.h:76
#define ENTER
Definition: logsys.h:333
#define LOGSYS_LEVEL_ALERT
Definition: logsys.h:70
void * user_data
Definition: sam.c:127
The corosync_api_v1 struct.
Definition: coroapi.h:225
int(* timer_add_duration)(unsigned long long nanoseconds_in_future, void *data, void(*timer_nf)(void *data), corosync_timer_handle_t *handle)
Definition: coroapi.h:229
void(* timer_delete)(corosync_timer_handle_t timer_handle)
Definition: coroapi.h:241
The corosync_service_engine struct.
Definition: coroapi.h:490
const char * name
Definition: coroapi.h:491
Definition: fsm.h:58
int32_t curr_state
Definition: fsm.h:59
Definition: fsm.h:65
int32_t curr_entry
Definition: fsm.h:68
int32_t curr_state
Definition: fsm.h:67
cs_fsm_state_to_str_fn state_to_str
Definition: fsm.h:71
size_t entries
Definition: fsm.h:69
const char * name
Definition: fsm.h:66
struct cs_fsm_entry * table
Definition: fsm.h:70
cs_fsm_event_to_str_fn event_to_str
Definition: fsm.h:72
Structure passed as new_value and old_value in change callback.
Definition: icmap.h:91
Definition: wd.c:61
icmap_track_t icmap_track
Definition: wd.c:70
corosync_timer_handle_t check_timer
Definition: wd.c:68
char name[CS_MAX_NAME_LENGTH]
Definition: wd.c:64
char res_path[ICMAP_KEYNAME_MAXLEN]
Definition: wd.c:62
char * recovery
Definition: wd.c:63
time_t last_updated
Definition: wd.c:65
struct cs_fsm fsm
Definition: wd.c:66
uint64_t check_timeout
Definition: wd.c:69
unsigned short magic
Definition: totem.h:0
#define WD_MIN_TIMEOUT_MS
Definition: wd.c:85
LOGSYS_DECLARE_SUBSYS("WD")
const char * wd_running_str
Definition: wd.c:131
#define WD_DEFAULT_TIMEOUT_SEC
Definition: wd.c:83
struct corosync_service_engine wd_service_engine
Definition: wd.c:94
const char * wd_failure_str
Definition: wd.c:133
wd_resource_state
Definition: wd.c:120
@ WD_S_STOPPED
Definition: wd.c:123
@ WD_S_FAILED
Definition: wd.c:122
@ WD_S_RUNNING
Definition: wd.c:121
struct cs_fsm_entry wd_fsm_table[]
Definition: wd.c:137
const char * wd_config_changed_str
Definition: wd.c:135
wd_resource_state_t
Definition: wd.c:54
@ WD_RESOURCE_GOOD
Definition: wd.c:55
@ WD_RESOURCE_STATE_UNKNOWN
Definition: wd.c:57
@ WD_RESOURCE_FAILED
Definition: wd.c:56
@ WD_RESOURCE_NOT_MONITORED
Definition: wd.c:58
const char * wd_stopped_str
Definition: wd.c:134
#define WD_MAX_TIMEOUT_MS
Definition: wd.c:86
struct corosync_service_engine * wd_get_service_engine_ver0(void)
Definition: wd.c:146
wd_resource_event
Definition: wd.c:126
@ WD_E_FAILURE
Definition: wd.c:127
@ WD_E_CONFIG_CHANGED
Definition: wd.c:128
#define WD_DEFAULT_TIMEOUT_MS
Definition: wd.c:84
const char * wd_failed_str
Definition: wd.c:132