Skip to content

Commit

Permalink
powerpc/hmi: Fix kernel hang when TB is in error state.
Browse files Browse the repository at this point in the history
On TOD/TB errors the timebase register stops (freezes) until HMI error
recovery gets TOD/TB back into a running state. On successful recovery, TB
starts running again and udelay(), which relies on the TB value, continues to
function properly. But when HMI fails to recover from TOD/TB errors, the
TB register stays frozen. With TB not running, the __delay() function
keeps looping and never returns. If __delay() is called while in the panic
path, the system hangs and never reboots after the panic.

Signed-off-by: Mahesh Salgaonkar <[email protected]>
Signed-off-by: Michael Ellerman <[email protected]>
  • Loading branch information
maheshsal authored and mpe committed May 2, 2019
1 parent 0acb5f6 commit de26912
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 1 deletion.
10 changes: 10 additions & 0 deletions arch/powerpc/include/asm/opal-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@
#define OPAL_SENSOR_GROUP_ENABLE 163
#define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164
#define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165
#define OPAL_HANDLE_HMI2 166
#define OPAL_NX_COPROC_INIT 167
#define OPAL_XIVE_GET_VP_STATE 170
#define OPAL_LAST 170
Expand Down Expand Up @@ -635,6 +636,15 @@ struct OpalHMIEvent {
} u;
};

/*
 * Bit flags reported through the out_flags argument of OPAL_HANDLE_HMI2.
 * Each enumerator is a single-bit mask within the 64-bit flag word.
 */
enum {
	/* Timebase has been resynced */
	OPAL_HMI_FLAGS_TB_RESYNC	= 1ULL << 0,
	/* DEC lost, needs to be reprogrammed */
	OPAL_HMI_FLAGS_DEC_LOST		= 1ULL << 1,
	/* HDEC lost, needs to be reprogrammed */
	OPAL_HMI_FLAGS_HDEC_LOST	= 1ULL << 2,
	/* TOD/TB recovery failed */
	OPAL_HMI_FLAGS_TOD_TB_FAIL	= 1ULL << 3,
	/* An event has been created */
	OPAL_HMI_FLAGS_NEW_EVENT	= 1ULL << 63,
};

enum {
OPAL_P7IOC_DIAG_TYPE_NONE = 0,
OPAL_P7IOC_DIAG_TYPE_RGC = 1,
Expand Down
2 changes: 2 additions & 0 deletions arch/powerpc/include/asm/opal.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ int64_t opal_set_param(uint64_t token, uint32_t param_id, uint64_t buffer,
int64_t opal_sensor_read(uint32_t sensor_hndl, int token, __be32 *sensor_data);
int64_t opal_sensor_read_u64(u32 sensor_hndl, int token, __be64 *sensor_data);
int64_t opal_handle_hmi(void);
int64_t opal_handle_hmi2(__be64 *out_flags);
int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end);
int64_t opal_unregister_dump_region(uint32_t id);
int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val);
Expand Down Expand Up @@ -359,6 +360,7 @@ int opal_power_control_init(void);
extern int opal_machine_check(struct pt_regs *regs);
extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
extern int opal_hmi_exception_early(struct pt_regs *regs);
extern int opal_hmi_exception_early2(struct pt_regs *regs);
extern int opal_handle_hmi_exception(struct pt_regs *regs);

extern void opal_shutdown(void);
Expand Down
2 changes: 2 additions & 0 deletions arch/powerpc/include/asm/time.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ extern unsigned long ppc_proc_freq;
extern unsigned long ppc_tb_freq;
#define DEFAULT_TB_FREQ 125000000UL

extern bool tb_invalid;

struct div_result {
u64 result_high;
u64 result_low;
Expand Down
9 changes: 9 additions & 0 deletions arch/powerpc/kernel/time.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ EXPORT_SYMBOL_GPL(ppc_proc_freq);
unsigned long ppc_tb_freq;
EXPORT_SYMBOL_GPL(ppc_tb_freq);

bool tb_invalid;

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Factor for converting from cputime_t (timebase ticks) to
Expand Down Expand Up @@ -459,6 +461,13 @@ void __delay(unsigned long loops)
diff += 1000000000;
spin_cpu_relax();
} while (diff < loops);
} else if (tb_invalid) {
/*
* TB is in error state and isn't ticking anymore.
* HMI handler was unable to recover from TB error.
* Return immediately, so that kernel won't get stuck here.
*/
spin_cpu_relax();
} else {
start = get_tbl();
while (get_tbl() - start < loops)
Expand Down
1 change: 1 addition & 0 deletions arch/powerpc/platforms/powernv/opal-call.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ);
OPAL_CALL(opal_get_param, OPAL_GET_PARAM);
OPAL_CALL(opal_set_param, OPAL_SET_PARAM);
OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI);
OPAL_CALL(opal_handle_hmi2, OPAL_HANDLE_HMI2);
OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE);
OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG);
OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION);
Expand Down
21 changes: 21 additions & 0 deletions arch/powerpc/platforms/powernv/opal.c
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,27 @@ int opal_hmi_exception_early(struct pt_regs *regs)
return 0;
}

/*
 * Early HMI exception handler built on the OPAL_HANDLE_HMI2 call.
 *
 * Asks OPAL to handle the HMI and then inspects the 64-bit flag mask it
 * hands back (delivered big-endian, hence the byte swap).
 *
 * @regs: exception register frame; not used by this handler.
 *
 * Returns 0 when the OPAL call does not report OPAL_SUCCESS (the flag
 * word is not examined in that case), 1 otherwise.
 */
int opal_hmi_exception_early2(struct pt_regs *regs)
{
	__be64 raw_flags;
	uint64_t flags;

	if (opal_handle_hmi2(&raw_flags) != OPAL_SUCCESS)
		return 0;

	/* Convert once; both checks below look at the same flag word. */
	flags = be64_to_cpu(raw_flags);

	/* OPAL created an event: record in local_paca that one is available. */
	if (flags & OPAL_HMI_FLAGS_NEW_EVENT)
		local_paca->hmi_event_available = 1;

	/* TOD/TB recovery failed: mark the timebase as invalid. */
	if (flags & OPAL_HMI_FLAGS_TOD_TB_FAIL)
		tb_invalid = true;

	return 1;
}

/* HMI exception handler called in virtual mode during check_irq_replay. */
int opal_handle_hmi_exception(struct pt_regs *regs)
{
Expand Down
5 changes: 4 additions & 1 deletion arch/powerpc/platforms/powernv/setup.c
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,10 @@ static void __init pnv_setup_machdep_opal(void)
/* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */
ppc_md.machine_check_exception = opal_machine_check;
ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
ppc_md.hmi_exception_early = opal_hmi_exception_early;
if (opal_check_token(OPAL_HANDLE_HMI2))
ppc_md.hmi_exception_early = opal_hmi_exception_early2;
else
ppc_md.hmi_exception_early = opal_hmi_exception_early;
ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
}

Expand Down

0 comments on commit de26912

Please sign in to comment.