Skip to content

Commit

Permalink
ipmi: add new IPMI nmi watchdog handling
Browse files Browse the repository at this point in the history
Convert over to the new NMI handling for getting IPMI watchdog timeouts via an
NMI.  This add config options to know if there is the ability to receive NMIs
and if it has an NMI post processing call.  Then it modifies the IPMI watchdog
to take advantage of this so that it can know if an NMI comes in.

It also adds testing that the IPMI NMI watchdog works.

Signed-off-by: Corey Minyard <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
cminyard authored and Linus Torvalds committed May 8, 2007
1 parent ee6cd5f commit f64da95
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 42 deletions.
5 changes: 5 additions & 0 deletions arch/i386/kernel/traps.c
Original file line number Diff line number Diff line change
Expand Up @@ -747,6 +747,11 @@ static __kprobes void default_do_nmi(struct pt_regs * regs)
*/
if (nmi_watchdog_tick(regs, reason))
return;
#endif
if (notify_die(DIE_NMI_POST, "nmi_post", regs, reason, 2, 0)
== NOTIFY_STOP)
return;
#ifdef CONFIG_X86_LOCAL_APIC
if (!do_nmi_callback(regs, smp_processor_id()))
#endif
unknown_nmi_error(reason, regs);
Expand Down
2 changes: 2 additions & 0 deletions arch/x86_64/kernel/traps.c
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,8 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
*/
if (nmi_watchdog_tick(regs,reason))
return;
if (notify_die(DIE_NMI_POST, "nmi_post", regs, reason, 2, 0)
== NOTIFY_STOP)
if (!do_nmi_callback(regs,cpu))
unknown_nmi_error(reason, regs);

Expand Down
136 changes: 94 additions & 42 deletions drivers/char/ipmi/ipmi_watchdog.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,19 @@
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <asm/atomic.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/apic.h>

#ifdef CONFIG_X86
/* This is ugly, but I've determined that x86 is the only architecture
that can reasonably support the IPMI NMI watchdog timeout at this
time. If another architecture adds this capability somehow, it
will have to be a somewhat different mechanism and I have no idea
how it will work. So in the unlikely event that another
architecture supports this, we can figure out a good generic
mechanism for it at that time. */
#include <asm/kdebug.h>
#define HAVE_DIE_NMI_POST
#endif

#define PFX "IPMI Watchdog: "
Expand Down Expand Up @@ -317,6 +327,11 @@ static unsigned char ipmi_version_minor;
/* If a pretimeout occurs, this is used to allow only one panic to happen. */
static atomic_t preop_panic_excl = ATOMIC_INIT(-1);

#ifdef HAVE_DIE_NMI_POST
static int testing_nmi;
static int nmi_handler_registered;
#endif

static int ipmi_heartbeat(void);
static void panic_halt_ipmi_heartbeat(void);

Expand Down Expand Up @@ -358,6 +373,10 @@ static int i_ipmi_set_timeout(struct ipmi_smi_msg *smi_msg,
int hbnow = 0;


/* These can be cleared as we are setting the timeout. */
ipmi_start_timer_on_heartbeat = 0;
pretimeout_since_last_heartbeat = 0;

data[0] = 0;
WDOG_SET_TIMER_USE(data[0], WDOG_TIMER_USE_SMS_OS);

Expand Down Expand Up @@ -432,13 +451,12 @@ static int ipmi_set_timeout(int do_heartbeat)

wait_for_completion(&set_timeout_wait);

mutex_unlock(&set_timeout_lock);

if ((do_heartbeat == IPMI_SET_TIMEOUT_FORCE_HB)
|| ((send_heartbeat_now)
&& (do_heartbeat == IPMI_SET_TIMEOUT_HB_IF_NECESSARY)))
{
rv = ipmi_heartbeat();
}
mutex_unlock(&set_timeout_lock);

out:
return rv;
Expand Down Expand Up @@ -518,20 +536,17 @@ static int ipmi_heartbeat(void)
int rv;
struct ipmi_system_interface_addr addr;

if (ipmi_ignore_heartbeat) {
if (ipmi_ignore_heartbeat)
return 0;
}

if (ipmi_start_timer_on_heartbeat) {
ipmi_start_timer_on_heartbeat = 0;
ipmi_watchdog_state = action_val;
return ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB);
} else if (pretimeout_since_last_heartbeat) {
/* A pretimeout occurred, make sure we set the timeout.
We don't want to set the action, though, we want to
leave that alone (thus it can't be combined with the
above operation. */
pretimeout_since_last_heartbeat = 0;
return ipmi_set_timeout(IPMI_SET_TIMEOUT_HB_IF_NECESSARY);
}

Expand Down Expand Up @@ -919,6 +934,45 @@ static void ipmi_register_watchdog(int ipmi_intf)
printk(KERN_CRIT PFX "Unable to register misc device\n");
}

#ifdef HAVE_DIE_NMI_POST
if (nmi_handler_registered) {
int old_pretimeout = pretimeout;
int old_timeout = timeout;
int old_preop_val = preop_val;

/* Set the pretimeout to go off in a second and give
ourselves plenty of time to stop the timer. */
ipmi_watchdog_state = WDOG_TIMEOUT_RESET;
preop_val = WDOG_PREOP_NONE; /* Make sure nothing happens */
pretimeout = 99;
timeout = 100;

testing_nmi = 1;

rv = ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB);
if (rv) {
printk(KERN_WARNING PFX "Error starting timer to"
" test NMI: 0x%x. The NMI pretimeout will"
" likely not work\n", rv);
rv = 0;
goto out_restore;
}

msleep(1500);

if (testing_nmi != 2) {
printk(KERN_WARNING PFX "IPMI NMI didn't seem to"
" occur. The NMI pretimeout will"
" likely not work\n");
}
out_restore:
testing_nmi = 0;
preop_val = old_preop_val;
pretimeout = old_pretimeout;
timeout = old_timeout;
}
#endif

out:
up_write(&register_sem);

Expand All @@ -928,6 +982,10 @@ static void ipmi_register_watchdog(int ipmi_intf)
ipmi_watchdog_state = action_val;
ipmi_set_timeout(IPMI_SET_TIMEOUT_FORCE_HB);
printk(KERN_INFO PFX "Starting now!\n");
} else {
/* Stop the timer now. */
ipmi_watchdog_state = WDOG_TIMEOUT_NONE;
ipmi_set_timeout(IPMI_SET_TIMEOUT_NO_HB);
}
}

Expand Down Expand Up @@ -964,17 +1022,28 @@ static void ipmi_unregister_watchdog(int ipmi_intf)
up_write(&register_sem);
}

#ifdef HAVE_NMI_HANDLER
#ifdef HAVE_DIE_NMI_POST
static int
ipmi_nmi(void *dev_id, int cpu, int handled)
ipmi_nmi(struct notifier_block *self, unsigned long val, void *data)
{
if (val != DIE_NMI_POST)
return NOTIFY_OK;

if (testing_nmi) {
testing_nmi = 2;
return NOTIFY_STOP;
}

/* If we are not expecting a timeout, ignore it. */
if (ipmi_watchdog_state == WDOG_TIMEOUT_NONE)
return NOTIFY_DONE;
return NOTIFY_OK;

if (preaction_val != WDOG_PRETIMEOUT_NMI)
return NOTIFY_OK;

/* If no one else handled the NMI, we assume it was the IPMI
watchdog. */
if ((!handled) && (preop_val == WDOG_PREOP_PANIC)) {
if (preop_val == WDOG_PREOP_PANIC) {
/* On some machines, the heartbeat will give
an error and not work unless we re-enable
the timer. So do so. */
Expand All @@ -983,18 +1052,12 @@ ipmi_nmi(void *dev_id, int cpu, int handled)
panic(PFX "pre-timeout");
}

return NOTIFY_DONE;
return NOTIFY_STOP;
}

static struct nmi_handler ipmi_nmi_handler =
{
.link = LIST_HEAD_INIT(ipmi_nmi_handler.link),
.dev_name = "ipmi_watchdog",
.dev_id = NULL,
.handler = ipmi_nmi,
.priority = 0, /* Call us last. */
static struct notifier_block ipmi_nmi_handler = {
.notifier_call = ipmi_nmi
};
int nmi_handler_registered;
#endif

static int wdog_reboot_handler(struct notifier_block *this,
Expand Down Expand Up @@ -1111,7 +1174,7 @@ static int preaction_op(const char *inval, char *outval)
preaction_val = WDOG_PRETIMEOUT_NONE;
else if (strcmp(inval, "pre_smi") == 0)
preaction_val = WDOG_PRETIMEOUT_SMI;
#ifdef HAVE_NMI_HANDLER
#ifdef HAVE_DIE_NMI_POST
else if (strcmp(inval, "pre_nmi") == 0)
preaction_val = WDOG_PRETIMEOUT_NMI;
#endif
Expand Down Expand Up @@ -1145,7 +1208,7 @@ static int preop_op(const char *inval, char *outval)

static void check_parms(void)
{
#ifdef HAVE_NMI_HANDLER
#ifdef HAVE_DIE_NMI_POST
int do_nmi = 0;
int rv;

Expand All @@ -1158,28 +1221,17 @@ static void check_parms(void)
preop_op("preop_none", NULL);
do_nmi = 0;
}
#ifdef CONFIG_X86_LOCAL_APIC
if (nmi_watchdog == NMI_IO_APIC) {
printk(KERN_WARNING PFX "nmi_watchdog is set to IO APIC"
" mode (value is %d), that is incompatible"
" with using NMI in the IPMI watchdog."
" Disabling IPMI nmi pretimeout.\n",
nmi_watchdog);
preaction_val = WDOG_PRETIMEOUT_NONE;
do_nmi = 0;
}
#endif
}
if (do_nmi && !nmi_handler_registered) {
rv = request_nmi(&ipmi_nmi_handler);
rv = register_die_notifier(&ipmi_nmi_handler);
if (rv) {
printk(KERN_WARNING PFX
"Can't register nmi handler\n");
return;
} else
nmi_handler_registered = 1;
} else if (!do_nmi && nmi_handler_registered) {
release_nmi(&ipmi_nmi_handler);
unregister_die_notifier(&ipmi_nmi_handler);
nmi_handler_registered = 0;
}
#endif
Expand Down Expand Up @@ -1215,9 +1267,9 @@ static int __init ipmi_wdog_init(void)

rv = ipmi_smi_watcher_register(&smi_watcher);
if (rv) {
#ifdef HAVE_NMI_HANDLER
if (preaction_val == WDOG_PRETIMEOUT_NMI)
release_nmi(&ipmi_nmi_handler);
#ifdef HAVE_DIE_NMI_POST
if (nmi_handler_registered)
unregister_die_notifier(&ipmi_nmi_handler);
#endif
atomic_notifier_chain_unregister(&panic_notifier_list,
&wdog_panic_notifier);
Expand All @@ -1236,9 +1288,9 @@ static void __exit ipmi_wdog_exit(void)
ipmi_smi_watcher_unregister(&smi_watcher);
ipmi_unregister_watchdog(watchdog_ifnum);

#ifdef HAVE_NMI_HANDLER
#ifdef HAVE_DIE_NMI_POST
if (nmi_handler_registered)
release_nmi(&ipmi_nmi_handler);
unregister_die_notifier(&ipmi_nmi_handler);
#endif

atomic_notifier_chain_unregister(&panic_notifier_list,
Expand Down
1 change: 1 addition & 0 deletions include/asm-i386/kdebug.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ enum die_val {
DIE_GPF,
DIE_CALL,
DIE_NMI_IPI,
DIE_NMI_POST,
DIE_PAGE_FAULT,
};

Expand Down
1 change: 1 addition & 0 deletions include/asm-x86_64/kdebug.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ enum die_val {
DIE_GPF,
DIE_CALL,
DIE_NMI_IPI,
DIE_NMI_POST,
DIE_PAGE_FAULT,
};

Expand Down

0 comments on commit f64da95

Please sign in to comment.