aboutsummaryrefslogtreecommitdiff
path: root/core/mce.c
blob: 47674abcb2eb005b61daf3c0c4e7f70636746feb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
// SPDX-License-Identifier: Apache-2.0
/*
 * Machine Check Exceptions
 *
 * Copyright 2020 IBM Corp.
 */

#define pr_fmt(fmt)	"MCE: " fmt

#include <ras.h>
#include <opal.h>
#include <cpu.h>

#define SRR1_MC_LOADSTORE(srr1)	((srr1) & PPC_BIT(42))

struct mce_ierror_table {
	unsigned long srr1_mask;
	unsigned long srr1_value;
	uint64_t type;
	const char *error_str;
};

static const struct mce_ierror_table mce_p9_ierror_table[] = {
{ 0x00000000081c0000, 0x0000000000040000,
  MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_INVOLVED_EA,
  "instruction fetch memory uncorrectable error", },
{ 0x00000000081c0000, 0x0000000000080000,
  MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA,
  "instruction fetch SLB parity error", },
{ 0x00000000081c0000, 0x00000000000c0000,
  MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA,
  "instruction fetch SLB multi-hit error", },
{ 0x00000000081c0000, 0x0000000000100000,
  MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_ERAT_ERROR,
  "instruction fetch ERAT multi-hit error", },
{ 0x00000000081c0000, 0x0000000000140000,
  MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_TLB_ERROR,
  "instruction fetch TLB multi-hit error", },
{ 0x00000000081c0000, 0x0000000000180000,
  MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA,
  "instruction fetch page table access memory uncorrectable error", },
{ 0x00000000081c0000, 0x00000000001c0000,
  MCE_INSNFETCH | MCE_INVOLVED_EA,
  "instruction fetch to foreign address", },
{ 0x00000000081c0000, 0x0000000008000000,
  MCE_INSNFETCH | MCE_INVOLVED_EA,
  "instruction fetch foreign link time-out", },
{ 0x00000000081c0000, 0x0000000008040000,
  MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
  "instruction fetch page table access foreign link time-out", },
{ 0x00000000081c0000, 0x00000000080c0000,
  MCE_INSNFETCH | MCE_INVOLVED_EA,
  "instruction fetch real address error", },
{ 0x00000000081c0000, 0x0000000008100000,
  MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
  "instruction fetch page table access real address error", },
{ 0x00000000081c0000, 0x0000000008140000,
  MCE_LOADSTORE | MCE_IMPRECISE,
  "store real address asynchronous error", },
{ 0x00000000081c0000, 0x0000000008180000,
  MCE_LOADSTORE | MCE_IMPRECISE,
  "store foreign link time-out asynchronous error", },
{ 0x00000000081c0000, 0x00000000081c0000,
  MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
  "instruction fetch page table access to foreign address", },
{ 0 } };

static const struct mce_ierror_table mce_p10_ierror_table[] = {
{ 0x00000000081c0000, 0x0000000000040000,
  MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_INVOLVED_EA,
  "instruction fetch memory uncorrectable error", },
{ 0x00000000081c0000, 0x0000000000080000,
  MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA,
  "instruction fetch SLB parity error", },
{ 0x00000000081c0000, 0x00000000000c0000,
  MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA,
  "instruction fetch SLB multi-hit error", },
{ 0x00000000081c0000, 0x0000000000100000,
  MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_ERAT_ERROR,
  "instruction fetch ERAT multi-hit error", },
{ 0x00000000081c0000, 0x0000000000140000,
  MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_TLB_ERROR,
  "instruction fetch TLB multi-hit error", },
{ 0x00000000081c0000, 0x0000000000180000,
  MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA,
  "instruction fetch page table access memory uncorrectable error", },
{ 0x00000000081c0000, 0x00000000001c0000,
  MCE_INSNFETCH | MCE_INVOLVED_EA,
  "instruction fetch to control real address", },
{ 0x00000000081c0000, 0x00000000080c0000,
  MCE_INSNFETCH | MCE_INVOLVED_EA,
  "instruction fetch real address error", },
{ 0x00000000081c0000, 0x0000000008100000,
  MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
  "instruction fetch page table access real address error", },
{ 0x00000000081c0000, 0x0000000008140000,
  MCE_LOADSTORE | MCE_IMPRECISE,
  "store real address asynchronous error", },
{ 0x00000000081c0000, 0x00000000081c0000,
  MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
  "instruction fetch page table access to control real address", },
{ 0 } };

struct mce_derror_table {
	unsigned long dsisr_value;
	uint64_t type;
	const char *error_str;
};

static const struct mce_derror_table mce_p9_derror_table[] = {
{ 0x00008000,
  MCE_LOADSTORE | MCE_MEMORY_ERROR,
  "load/store memory uncorrectable error", },
{ 0x00004000,
  MCE_LOADSTORE | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA,
  "load/store page table access memory uncorrectable error", },
{ 0x00002000,
  MCE_LOADSTORE | MCE_INVOLVED_EA,
  "load/store foreign link time-out", },
{ 0x00001000,
  MCE_LOADSTORE | MCE_TABLE_WALK | MCE_INVOLVED_EA,
  "load/store page table access foreign link time-out", },
{ 0x00000800,
  MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_ERAT_ERROR,
  "load/store ERAT multi-hit error", },
{ 0x00000400,
  MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_TLB_ERROR,
  "load/store TLB multi-hit error", },
{ 0x00000200,
  MCE_LOADSTORE | MCE_TLBIE_ERROR,
  "TLBIE or TLBIEL instruction programming error", },
{ 0x00000100,
  MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR,
  "load/store SLB parity error", },
{ 0x00000080,
  MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR,
  "load/store SLB multi-hit error", },
{ 0x00000040,
  MCE_LOADSTORE | MCE_INVOLVED_EA,
  "load real address error", },
{ 0x00000020,
  MCE_LOADSTORE | MCE_TABLE_WALK,
  "load/store page table access real address error", },
{ 0x00000010,
  MCE_LOADSTORE | MCE_TABLE_WALK,
  "load/store page table access to foreign address", },
{ 0x00000008,
  MCE_LOADSTORE,
  "load/store to foreign address", },
{ 0 } };

static const struct mce_derror_table mce_p10_derror_table[] = {
{ 0x00008000,
  MCE_LOADSTORE | MCE_MEMORY_ERROR,
  "load/store memory uncorrectable error", },
{ 0x00004000,
  MCE_LOADSTORE | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA,
  "load/store page table access memory uncorrectable error", },
{ 0x00000800,
  MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_ERAT_ERROR,
  "load/store ERAT multi-hit error", },
{ 0x00000400,
  MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_TLB_ERROR,
  "load/store TLB multi-hit error", },
{ 0x00000200,
  MCE_TLBIE_ERROR,
  "TLBIE or TLBIEL instruction programming error", },
{ 0x00000100,
  MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR,
  "load/store SLB parity error", },
{ 0x00000080,
  MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR,
  "load/store SLB multi-hit error", },
{ 0x00000040,
  MCE_LOADSTORE | MCE_INVOLVED_EA,
  "load real address error", },
{ 0x00000020,
  MCE_LOADSTORE | MCE_TABLE_WALK,
  "load/store page table access real address error", },
{ 0x00000010,
  MCE_LOADSTORE | MCE_TABLE_WALK,
  "load/store page table access to control real address", },
{ 0x00000008,
  MCE_LOADSTORE,
  "load/store to control real address", },
{ 0 } };

static void decode_ierror(const struct mce_ierror_table table[],
				uint64_t srr1,
				uint64_t *type,
				const char **error_str)
{
	int i;

	for (i = 0; table[i].srr1_mask; i++) {
		if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
			continue;

		*type = table[i].type;
		*error_str = table[i].error_str;
	}
}

static void decode_derror(const struct mce_derror_table table[],
		uint32_t dsisr,
		uint64_t *type,
		const char **error_str)
{
	int i;

	for (i = 0; table[i].dsisr_value; i++) {
		if (!(dsisr & table[i].dsisr_value))
			continue;

		*type = table[i].type;
		*error_str = table[i].error_str;
	}
}

static void decode_mce_p9(uint64_t srr0, uint64_t srr1,
		uint32_t dsisr, uint64_t dar,
		uint64_t *type, const char **error_str,
		uint64_t *address)
{
	/*
	 * On POWER9 DD2.1 and below, it's possible to get a machine check
	 * caused by a paste instruction where only DSISR bit 25 is set. This
	 * will result in the MCE handler seeing an unknown event and the
	 * kernel crashing. An MCE that occurs like this is spurious, so we
	 * don't need to do anything in terms of servicing it. If there is
	 * something that needs to be serviced, the CPU will raise the MCE
	 * again with the correct DSISR so that it can be serviced properly.
	 * So detect this case and mark it as handled.
	 */
	if (SRR1_MC_LOADSTORE(srr1) && dsisr == 0x02000000) {
		*type = MCE_NO_ERROR;
		*error_str = "no error (superfluous machine check)";
		return;
	}

	/*
	 * Async machine check due to bad real address from store or foreign
	 * link time out comes with the load/store bit (PPC bit 42) set in
	 * SRR1, but the cause comes in SRR1 not DSISR. Clear bit 42 so we're
	 * directed to the ierror table so it will find the cause (which
	 * describes it correctly as a store error).
	 */
	if (SRR1_MC_LOADSTORE(srr1) &&
			((srr1 & 0x081c0000) == 0x08140000 ||
			 (srr1 & 0x081c0000) == 0x08180000)) {
		srr1 &= ~PPC_BIT(42);
	}

	if (SRR1_MC_LOADSTORE(srr1)) {
		decode_derror(mce_p9_derror_table, dsisr, type, error_str);
		if (*type & MCE_INVOLVED_EA)
			*address = dar;
	} else {
		decode_ierror(mce_p9_ierror_table, srr1, type, error_str);
		if (*type & MCE_INVOLVED_EA)
			*address = srr0;
	}
}

static void decode_mce_p10(uint64_t srr0, uint64_t srr1,
		uint32_t dsisr, uint64_t dar,
		uint64_t *type, const char **error_str,
		uint64_t *address)
{
	/*
	 * Async machine check due to bad real address from store or foreign
	 * link time out comes with the load/store bit (PPC bit 42) set in
	 * SRR1, but the cause comes in SRR1 not DSISR. Clear bit 42 so we're
	 * directed to the ierror table so it will find the cause (which
	 * describes it correctly as a store error).
	 */
	if (SRR1_MC_LOADSTORE(srr1) &&
			(srr1 & 0x081c0000) == 0x08140000) {
		srr1 &= ~PPC_BIT(42);
	}

	if (SRR1_MC_LOADSTORE(srr1)) {
		decode_derror(mce_p10_derror_table, dsisr, type, error_str);
		if (*type & MCE_INVOLVED_EA)
			*address = dar;
	} else {
		decode_ierror(mce_p10_ierror_table, srr1, type, error_str);
		if (*type & MCE_INVOLVED_EA)
			*address = srr0;
	}
}

void decode_mce(uint64_t srr0, uint64_t srr1,
		uint32_t dsisr, uint64_t dar,
		uint64_t *type, const char **error_str,
		uint64_t *address)
{
	*type = MCE_UNKNOWN;
	*error_str = "unknown error";
	*address = 0;

	if (proc_gen == proc_gen_p9) {
		decode_mce_p9(srr0, srr1, dsisr, dar, type, error_str, address);
	} else if (proc_gen == proc_gen_p10) {
		decode_mce_p10(srr0, srr1, dsisr, dar, type, error_str, address);
	} else {
		*error_str = "unknown error (processor not supported)";
	}
}