1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Per-family DC/IC decode callbacks; allocated and filled in by mce_amd_init(). */
6 static struct amd_decoder_ops *fam_ops;
/* Mask handed to XEC() by most per-bank decoders to extract the extended error code. */
8 static u8 xec_mask = 0xf;
/* NOTE(review): written here and in mce_amd_init() (0x3); no reader is visible in this view. */
9 static u8 nb_err_cpumask = 0xf;
/* Consulted by amd_filter_mce(): gates bank 4 / xec 0x5 (GART) errors. */
11 static bool report_gart_errors;
/* Optional callback, called by amd_decode_nb_mce() with the node id and the MCE record. */
12 static void (*nb_bus_decoder)(int node_id, struct mce *m);
/*
 * amd_report_gart_errors - set the GART error reporting flag.
 * @v: new value for report_gart_errors
 *
 * The flag is read by amd_filter_mce() when it sees a bank 4 error with
 * extended error code 0x5.
 */
14 void amd_report_gart_errors(bool v)
16 report_gart_errors = v;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * amd_register_ecc_decoder - register an ECC decoder callback.
 * @f: callback taking (int, struct mce *), the same shape as nb_bus_decoder
 *
 * NOTE(review): presumably stores @f in nb_bus_decoder; confirm against the
 * function body.
 */
20 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * amd_unregister_ecc_decoder - drop the registered ECC decoder callback.
 * @f: the callback the caller believes is currently registered
 *
 * Warns (WARN_ON) when @f differs from nb_bus_decoder and resets
 * nb_bus_decoder to NULL.
 * NOTE(review): any guard around these two statements should be confirmed
 * against the full body.
 */
26 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
29 WARN_ON(nb_bus_decoder != f);
31 nb_bus_decoder = NULL;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
37 * string representation for the different MCA reported error types, see F3x48
/*
 * Exported string tables for the individual error code fields.
 * NOTE(review): presumably indexed through the TT_MSG()/LL_MSG()/R4_MSG()/
 * PP_MSG()/TO_MSG()/II_MSG() macros used by the decoders below; the macros
 * are defined elsewhere, confirm the mapping there.
 */
41 /* transaction type */
42 const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs);
/* cache level */
46 const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs);
49 /* memory transaction type */
50 const char *rrrr_msgs[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs);
55 /* participating processor */
56 const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs);
/* request timeout */
60 const char *to_msgs[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs);
/* memory or i/o */
64 const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Instruction cache error descriptions used by f15h_ic_mce().
 *
 * The table is not indexed by xec directly: f15h_ic_mce() uses xec, xec-2 or
 * xec-4 as the index, so the trailing "xec = ..." comments mark where the
 * offset changes.
 */
67 static const char * const f15h_ic_mce_desc[] = {
68 "UC during a demand linefill from L2",
69 "Parity error during data load from IC",
70 "Parity error for IC valid bit",
71 "Main tag parity error",
72 "Parity error in prediction queue",
73 "PFB data/address parity error",
74 "Parity error in the branch status reg",
75 "PFB promotion address error",
76 "Tag error during probe/victimization",
77 "Parity error for IC probe tag valid bit",
78 "PFB non-cacheable bit parity error",
79 "PFB valid bit parity error", /* xec = 0xd */
80 "Microcode Patch Buffer", /* xec = 0x10 */
/*
 * Combined unit error descriptions used by amd_decode_cu_mce(), which indexes
 * this table with xec - 0x4 or xec - 0x7. The trailing "xec = ..." comments
 * mark the xec value of the entry they sit on.
 */
87 static const char * const f15h_cu_mce_desc[] = {
88 "Fill ECC error on data fills", /* xec = 0x4 */
89 "Fill parity error on insn fills",
90 "Prefetcher request FIFO parity error",
91 "PRQ address parity error",
92 "PRQ data parity error",
95 "WCB Data parity error",
96 "VB Data ECC or parity error",
97 "L2 Tag ECC error", /* xec = 0x10 */
98 "Hard L2 Tag ECC error",
99 "Multiple hits on L2 tag",
101 "PRB address parity error"
/*
 * Northbridge error descriptions used by amd_decode_nb_mce(), which indexes
 * the table either with xec directly or with xec minus an offset.
 */
104 static const char *nb_mce_desc[] = {
105 "DRAM ECC error detected on the NB",
106 "CRC error detected on HT link",
107 "Link-defined sync error packets detected on HT link",
110 "Invalid GART PTE entry during GART table walk",
111 "Unsupported atomic RMW received from an IO link",
112 "Watchdog timeout due to lack of progress",
113 "DRAM ECC error detected on the NB",
114 "SVM DMA Exclusion Vector error",
115 "HT data error detected on link",
116 "Protocol error (link, L3, probe filter)",
117 "NB internal arrays parity error",
118 "DRAM addr/ctl signals parity error",
119 "IO link transmission error",
120 "L3 data cache ECC error", /* xec = 0x1c */
121 "L3 cache tag error",
122 "L3 LRU parity bits error",
123 "ECC Error in the Probe Filter directory"
/*
 * Descriptions used by amd_decode_fr_mce(), indexed by xec. Entries for
 * xec 0x0 and 0xc are printed as-is; the other entries are printed with a
 * " parity error" suffix.
 */
126 static const char * const fr_ex_mce_desc[] = {
127 "CPU Watchdog timer expire",
128 "Wakeup array dest tag",
132 "Retire dispatch queue",
133 "Mapper checkpoint array",
134 "Physical register file EX0 port",
135 "Physical register file EX1 port",
136 "Physical register file AG0 port",
137 "Physical register file AG1 port",
138 "Flag register file",
139 "DE correctable error could not be corrected"
/*
 * f12h_dc_mce - data cache error decoding, also the fallback of f10h_dc_mce().
 * @ec:  error code taken from the MCE status
 * @xec: extended error code
 *
 * Continues the line started by the caller with either an "L1 linefill from
 * L2" message or, for cache level LL_L1, a "Data/Tag <memory-tx> error"
 * message.
 * NOTE(review): the boolean result presumably tells the caller whether the
 * signature was recognized; confirm against the return statements.
 */
142 static bool f12h_dc_mce(u16 ec, u8 xec)
151 pr_cont("during L1 linefill from L2.\n");
152 else if (ll == LL_L1)
153 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
/*
 * f10h_dc_mce - data cache error decoding that adds the data scrub case.
 * @ec:  error code taken from the MCE status
 * @xec: extended error code
 *
 * A generic memory transaction (R4_GEN) at cache level L1 is reported as a
 * data scrub error; the visible fallback hands the error to f12h_dc_mce().
 */
160 static bool f10h_dc_mce(u16 ec, u8 xec)
162 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
163 pr_cont("during data scrub.\n");
166 return f12h_dc_mce(ec, xec);
/*
 * k8_dc_mce - data cache error decoding that adds the system linefill case.
 * @ec:  error code taken from the MCE status
 * @xec: extended error code
 *
 * Reports a system linefill error (NOTE(review): the condition for this is
 * not visible here; confirm) and otherwise defers to f10h_dc_mce().
 */
169 static bool k8_dc_mce(u16 ec, u8 xec)
172 pr_cont("during system linefill.\n");
176 return f10h_dc_mce(ec, xec);
/*
 * f14h_dc_mce - data cache error decoding assigned to fam_ops->dc_mce by
 * mce_amd_init() together with f14h_ic_mce.
 * @ec:  error code taken from the MCE status
 * @xec: extended error code
 *
 * Two visible paths: one that requires a data transaction at cache level L1,
 * and a bus error path that requires II to be MEM or IO at level LL_LG.
 * NOTE(review): what is done when these checks fail is not visible here.
 */
179 static bool f14h_dc_mce(u16 ec, u8 xec)
/* This path only accepts data transactions at cache level L1. */
186 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
/* R4_DRD is reported as "load/hw prf", anything else reaching here as "store". */
192 pr_cont("Data/Tag parity error due to %s.\n",
193 (r4 == R4_DRD ? "load/hw prf" : "store"));
196 pr_cont("Copyback parity error on a tag miss.\n");
199 pr_cont("Tag parity error during snoop.\n");
204 } else if (BUS_ERROR(ec)) {
/* Bus errors must target memory or IO at level LL_LG. */
206 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
/* No newline: this line is completed by a later pr_cont(). */
209 pr_cont("System read data error on a ");
213 pr_cont("TLB reload.\n");
/*
 * f15h_dc_mce - data cache error decoding assigned to fam_ops->dc_mce by
 * mce_amd_init() together with f15h_ic_mce.
 * @ec:  error code taken from the MCE status
 * @xec: extended error code
 *
 * Prints one of several fixed messages, a "System Read Data Error" message on
 * the bus error path, or the raw xec as an internal error condition type.
 * NOTE(review): the selection logic between the messages is not visible here.
 */
231 static bool f15h_dc_mce(u16 ec, u8 xec)
239 pr_cont("Data Array access error.\n");
243 pr_cont("UC error during a linefill from L2/NB.\n");
248 pr_cont("STQ access error.\n");
252 pr_cont("SCB access error.\n");
256 pr_cont("Tag error.\n");
260 pr_cont("LDQ access error.\n");
266 } else if (BUS_ERROR(ec)) {
269 pr_cont("System Read Data Error.\n");
/* Bus-error branch: report the raw xec value. */
271 pr_cont(" Internal error condition type %d.\n", xec);
/*
 * amd_decode_dc_mce - decode a data cache error.
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * Prints the "Data Cache Error: " prefix, handles the family-independent TLB
 * signatures itself and passes everything else to fam_ops->dc_mce(). The
 * final message reports an error signature that could not be decoded.
 */
278 static void amd_decode_dc_mce(struct mce *m)
280 u16 ec = EC(m->status);
281 u8 xec = XEC(m->status, xec_mask);
/* No newline: the decoders below finish the line with pr_cont(). */
283 pr_emerg(HW_ERR "Data Cache Error: ");
285 /* TLB error signatures are the same across families */
287 if (TT(ec) == TT_DATA) {
/* xec 2 -> "locked miss", other non-zero xec -> "multimatch", 0 -> "parity". */
288 pr_cont("%s TLB %s.\n", LL_MSG(ec),
289 ((xec == 2) ? "locked miss"
290 : (xec ? "multimatch" : "parity")));
293 } else if (fam_ops->dc_mce(ec, xec))
296 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
/*
 * k8_ic_mce - instruction cache error decoding; mce_amd_init() assigns it to
 * fam_ops->ic_mce in four of its six callback setups.
 * @ec:  error code taken from the MCE status
 * @xec: extended error code
 *
 * Reports a linefill from L2 or, for cache level 0x1, one of three messages
 * (data load parity, copyback parity/victim, tag snoop).
 * NOTE(review): the conditions selecting between the three are not visible.
 */
299 static bool k8_ic_mce(u16 ec, u8 xec)
308 pr_cont("during a linefill from L2.\n");
/* NOTE(review): raw 0x1 here; f12h_dc_mce() spells the same kind of check LL_L1. */
309 else if (ll == 0x1) {
312 pr_cont("Parity error during data load.\n");
316 pr_cont("Copyback Parity/Victim error.\n");
320 pr_cont("Tag Snoop error.\n");
/*
 * f14h_ic_mce - instruction cache error decoding assigned to fam_ops->ic_mce
 * by mce_amd_init() together with f14h_dc_mce.
 * @ec:  error code taken from the MCE status
 * @xec: extended error code
 *
 * Only transaction type 0 at cache level 1 passes the first check; snoop
 * transactions are reported as tag errors during snoop/victimization.
 */
333 static bool f14h_ic_mce(u16 ec, u8 xec)
/* NOTE(review): raw 0/1 here; presumably the instruction TT value and LL_L1. */
339 if (TT(ec) != 0 || LL(ec) != 1)
343 pr_cont("Data/tag array parity error for a tag hit.\n");
344 else if (r4 == R4_SNOOP)
345 pr_cont("Tag error during snoop/victimization.\n");
/*
 * f15h_ic_mce - instruction cache error decoding assigned to fam_ops->ic_mce
 * by mce_amd_init() together with f15h_dc_mce.
 * @ec:  error code taken from the MCE status
 * @xec: extended error code
 *
 * Looks the message up in f15h_ic_mce_desc[]. Different xec ranges use
 * different index offsets (0, 2 or 4).
 * NOTE(review): the range checks guarding each lookup are not visible here;
 * they are what keeps the index within the table.
 */
352 static bool f15h_ic_mce(u16 ec, u8 xec)
361 pr_cont("%s.\n", f15h_ic_mce_desc[xec]);
365 pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]);
369 pr_cont("%s.\n", f15h_ic_mce_desc[xec-4]);
/* Same xec-4 offset, but the entry is wrapped into "Decoder ... parity error". */
373 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]);
/*
 * amd_decode_ic_mce - decode an instruction cache error.
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * Prints the "Instruction Cache Error: " prefix, decodes TLB and bus errors
 * itself and passes the rest to fam_ops->ic_mce(). The final message reports
 * an error signature that could not be decoded.
 */
382 static void amd_decode_ic_mce(struct mce *m)
384 u16 ec = EC(m->status);
385 u8 xec = XEC(m->status, xec_mask);
/* No newline: the branches below finish the line with pr_cont(). */
387 pr_emerg(HW_ERR "Instruction Cache Error: ");
390 pr_cont("%s TLB %s.\n", LL_MSG(ec),
391 (xec ? "multimatch" : "parity error"));
392 else if (BUS_ERROR(ec)) {
/* Family 0xf with status bit 58 set is reported as a system linefill. */
393 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
395 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
396 } else if (fam_ops->ic_mce(ec, xec))
399 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
/*
 * amd_decode_bu_mce - decode a bus unit error.
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * Prints "Bus Unit Error" and completes the line according to xec and the
 * error code class (memory or bus error). The final message reports an error
 * signature that could not be decoded.
 * NOTE(review): several branch conditions are not visible here.
 */
402 static void amd_decode_bu_mce(struct mce *m)
404 u16 ec = EC(m->status);
405 u8 xec = XEC(m->status, xec_mask);
/* No newline and no trailing separator: each branch supplies " in ..." or ": ...". */
407 pr_emerg(HW_ERR "Bus Unit Error");
410 pr_cont(" in the write data buffers.\n");
412 pr_cont(" in the victim data buffers.\n");
413 else if (xec == 0x2 && MEM_ERROR(ec))
414 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
/* xec 0x0 is decoded further by the class of the error code. */
415 else if (xec == 0x0) {
417 pr_cont(": %s error in a Page Descriptor Cache or "
418 "Guest TLB.\n", TT_MSG(ec));
419 else if (BUS_ERROR(ec))
420 pr_cont(": %s/ECC error in data read from NB: %s.\n",
421 R4_MSG(ec), PP_MSG(ec));
422 else if (MEM_ERROR(ec)) {
426 pr_cont(": %s error during data copyback.\n",
429 pr_cont(": %s parity/ECC error during data "
430 "access from L2.\n", R4_MSG(ec));
441 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
/*
 * amd_decode_cu_mce - decode a combined unit error.
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * Prints the "Combined Unit Error: " prefix and then a TLB, bus or memory
 * error message. Memory errors are looked up in f15h_cu_mce_desc[]. The final
 * message reports an error signature that could not be decoded.
 * NOTE(review): the xec range checks guarding the table lookups are not
 * visible here; they are what keeps the index within the table.
 */
444 static void amd_decode_cu_mce(struct mce *m)
446 u16 ec = EC(m->status);
447 u8 xec = XEC(m->status, xec_mask);
/* No newline: the branches below finish the line with pr_cont(). */
449 pr_emerg(HW_ERR "Combined Unit Error: ");
453 pr_cont("Data parity TLB read error.\n");
455 pr_cont("Poison data provided for TLB fill.\n");
458 } else if (BUS_ERROR(ec)) {
462 pr_cont("Error during attempted NB data read.\n");
463 } else if (MEM_ERROR(ec)) {
/* The table starts at xec 0x4; a second xec range uses offset 0x7 instead. */
466 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x4]);
470 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x7]);
481 pr_emerg(HW_ERR "Corrupted CU MCE info?\n");
/*
 * amd_decode_ls_mce - decode a load/store unit error.
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * On family 0x14 and later this prints a request to report the event instead
 * of decoding it. Otherwise it prints "Load Store Error" followed by the
 * memory transaction type; one visible check requires a bus error with a
 * DRD or DWR transaction. The final message reports an error signature that
 * could not be decoded.
 */
484 static void amd_decode_ls_mce(struct mce *m)
486 u16 ec = EC(m->status);
487 u8 xec = XEC(m->status, xec_mask);
/* LS errors are not expected on these families. */
489 if (boot_cpu_data.x86 >= 0x14) {
490 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
491 " please report on LKML.\n");
/* No newline: completed by the pr_cont() below. */
495 pr_emerg(HW_ERR "Load Store Error");
/* Only bus errors with a data read (DRD) or data write (DWR) pass this check. */
500 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
503 pr_cont(" during %s.\n", R4_MSG(ec));
510 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
/*
 * amd_decode_nb_mce - decode a northbridge error (exported).
 * @m: the MCE record; m->extcpu and m->status are read in the visible lines
 *
 * Prints "Northbridge Error (node N): " and completes the line from
 * nb_mce_desc[] or one of the special-case messages. DRAM ECC errors
 * (xec 0x0 and 0x8) get special handling; the registered nb_bus_decoder
 * callback is also called from here. The final message reports an error
 * signature that could not be decoded.
 * NOTE(review): the guard around the nb_bus_decoder call (it may be NULL
 * after unregistration) and the computation of "offset" are not visible.
 */
513 void amd_decode_nb_mce(struct mce *m)
515 struct cpuinfo_x86 *c = &boot_cpu_data;
/* Node id of the reporting CPU, used in the message and passed to the bus decoder. */
516 int node_id = amd_get_nb_id(m->extcpu);
517 u16 ec = EC(m->status);
/* Fixed 0x1f mask here instead of the xec_mask used by the other decoders. */
518 u8 xec = XEC(m->status, 0x1f);
521 pr_emerg(HW_ERR "Northbridge Error (node %d): ", node_id);
526 /* special handling for DRAM ECCs */
527 if (xec == 0x0 || xec == 0x8) {
528 /* no ECCs on F11h */
532 pr_cont("%s.\n", nb_mce_desc[xec]);
535 nb_bus_decoder(node_id, m);
542 pr_cont("GART Table Walk data error.\n");
543 else if (BUS_ERROR(ec))
544 pr_cont("DMA Exclusion Vector Table Walk error.\n");
550 if (boot_cpu_data.x86 == 0x15)
551 pr_cont("Compute Unit Data Error.\n");
/* Table entries reached through an xec-dependent offset. */
564 pr_cont("%s.\n", nb_mce_desc[xec - offset]);
568 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
570 EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
/*
 * amd_decode_fr_mce - decode a FIROB / Execution Unit error.
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * Families 0xf and 0x11 and, on families other than 0x15, any non-zero xec
 * are singled out by the first two checks (NOTE(review): presumably routed to
 * the "Corrupted FR MCE info?" message; confirm). Otherwise the description
 * is taken from fr_ex_mce_desc[xec]; family 0x15 labels the unit
 * "Execution Unit", other families "FIROB".
 */
572 static void amd_decode_fr_mce(struct mce *m)
574 struct cpuinfo_x86 *c = &boot_cpu_data;
575 u8 xec = XEC(m->status, xec_mask);
577 if (c->x86 == 0xf || c->x86 == 0x11)
/* Only family 0x15 has more than the xec 0x0 signature. */
580 if (c->x86 != 0x15 && xec != 0x0)
583 pr_emerg(HW_ERR "%s Error: ",
584 (c->x86 == 0x15 ? "Execution Unit" : "FIROB"));
/* xec 0x0 and 0xc are complete messages; the rest name a unit and get a suffix. */
586 if (xec == 0x0 || xec == 0xc)
587 pr_cont("%s.\n", fr_ex_mce_desc[xec]);
589 pr_cont("%s parity error.\n", fr_ex_mce_desc[xec]);
596 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
/*
 * amd_decode_fp_mce - decode a floating point unit error.
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * Prints "Floating Point Unit Error: ", the name of the affected structure
 * and a common " parity error." suffix. The final message reports an error
 * signature that could not be decoded.
 * NOTE(review): the xec values selecting each structure name are not visible.
 */
599 static void amd_decode_fp_mce(struct mce *m)
601 u8 xec = XEC(m->status, xec_mask);
/* No newline: the structure name and the suffix are appended with pr_cont(). */
603 pr_emerg(HW_ERR "Floating Point Unit Error: ");
607 pr_cont("Free List");
611 pr_cont("Physical Register File");
615 pr_cont("Retire Queue");
619 pr_cont("Scheduler table");
623 pr_cont("Status Register File");
/* Shared tail for all structure names above. */
631 pr_cont(" parity error.\n");
636 pr_emerg(HW_ERR "Corrupted FP MCE info?\n");
/*
 * amd_decode_err_code - print the individual fields of an error code.
 * @ec: error code; amd_decode_mce() passes the low 16 bits of the MCE status
 *
 * Emits a "cache level" line and appends the mem/io, transaction type,
 * memory transaction and participating processor/timeout fields. The memory
 * transaction field is guarded by a memory-or-bus-error check.
 * NOTE(review): the conditions guarding the other fields are not visible.
 */
639 static inline void amd_decode_err_code(u16 ec)
642 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
645 pr_cont(", mem/io: %s", II_MSG(ec));
647 pr_cont(", tx: %s", TT_MSG(ec));
649 if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
650 pr_cont(", mem-tx: %s", R4_MSG(ec));
653 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
660 * Filter out unwanted MCE signatures here.
/*
 * amd_filter_mce - check an MCE against the unwanted signatures.
 * @m: the MCE record; m->status and m->bank are read
 *
 * The only visible signature is bank 4 with extended error code 0x5 while
 * report_gart_errors is false. amd_decode_mce() tests the result before it
 * prints anything.
 * NOTE(review): presumably returns true for records to be dropped; confirm
 * against the return statements.
 */
662 static bool amd_filter_mce(struct mce *m)
/* Extended error code: status bits [20:16], open-coded instead of using XEC(). */
664 u8 xec = (m->status >> 16) & 0x1f;
667 * NB GART TLB error reporting is disabled by default.
669 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
/*
 * amd_decode_mce - notifier callback that decodes one MCE record (exported).
 * @nb:   notifier block (not used in the visible lines)
 * @val:  notifier value (not used in the visible lines)
 * @data: pointer to the struct mce to decode
 *
 * Checks the record with amd_filter_mce() first, then prints the status flags
 * and the raw status value, the address when MCI_STATUS_ADDRV is set, the
 * decoding of one of the per-unit decoders, and finally the generic error
 * code fields via amd_decode_err_code().
 * NOTE(review): the return values, the bank-to-decoder dispatch and the
 * condition for the Deferred/Poison flags are not visible here.
 */
675 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
677 struct mce *m = (struct mce *)data;
678 struct cpuinfo_x86 *c = &boot_cpu_data;
681 if (amd_filter_mce(m))
/* Status summary; the bracket is closed by the "]: 0x..." pr_cont() below. */
684 pr_emerg(HW_ERR "CPU:%d\tMC%d_STATUS[%s|%s|%s|%s|%s",
686 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
687 ((m->status & MCI_STATUS_UC) ? "UE" : "CE"),
688 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
689 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
690 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
694 ((m->status & BIT_64(44)) ? "Deferred" : "-"),
695 ((m->status & BIT_64(43)) ? "Poison" : "-"));
697 /* do the two bits[14:13] of the upper status half, i.e. status bits [46:45], together */
698 ecc = (m->status >> 45) & 0x3;
/* ecc == 2 prints "CECC", any other value reaching here prints "UECC". */
700 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
702 pr_cont("]: 0x%016llx\n", m->status);
704 if (m->status & MCI_STATUS_ADDRV)
705 pr_emerg(HW_ERR "\tMC%d_ADDR: 0x%016llx\n", m->bank, m->addr);
709 amd_decode_dc_mce(m);
713 amd_decode_ic_mce(m);
718 amd_decode_cu_mce(m);
720 amd_decode_bu_mce(m);
724 amd_decode_ls_mce(m);
728 amd_decode_nb_mce(m);
732 amd_decode_fr_mce(m);
736 amd_decode_fp_mce(m);
/* Generic decoding of the low 16 status bits (the error code). */
743 amd_decode_err_code(m->status & 0xffff);
747 EXPORT_SYMBOL_GPL(amd_decode_mce);
/*
 * Notifier block that hooks amd_decode_mce() into the MCE decode chain; it is
 * registered in mce_amd_init() and unregistered in mce_amd_exit().
 */
749 static struct notifier_block amd_mce_dec_nb = {
750 .notifier_call = amd_decode_mce,
/*
 * mce_amd_init - set up the in-kernel AMD MCE decoder.
 *
 * Checks the boot CPU vendor and family, allocates fam_ops, installs a pair
 * of dc_mce/ic_mce callbacks, and registers amd_mce_dec_nb on the MCE decode
 * chain. Runs as an early initcall.
 * NOTE(review): the return values, the kzalloc() failure handling and the
 * family each callback pair belongs to are not visible here.
 */
753 static int __init mce_amd_init(void)
755 struct cpuinfo_x86 *c = &boot_cpu_data;
757 if (c->x86_vendor != X86_VENDOR_AMD)
/*
 * Families accepted by this check: 0xf-0x12, and 0x14 / 0x15 with a model
 * number of at most 0xf.
 */
760 if ((c->x86 < 0xf || c->x86 > 0x12) &&
761 (c->x86 != 0x14 || c->x86_model > 0xf) &&
762 (c->x86 != 0x15 || c->x86_model > 0xf))
/* Zeroed allocation: callbacks not assigned below stay NULL. */
765 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
771 fam_ops->dc_mce = k8_dc_mce;
772 fam_ops->ic_mce = k8_ic_mce;
776 fam_ops->dc_mce = f10h_dc_mce;
777 fam_ops->ic_mce = k8_ic_mce;
781 fam_ops->dc_mce = k8_dc_mce;
782 fam_ops->ic_mce = k8_ic_mce;
786 fam_ops->dc_mce = f12h_dc_mce;
787 fam_ops->ic_mce = k8_ic_mce;
/* The only setup that also changes nb_err_cpumask. */
791 nb_err_cpumask = 0x3;
792 fam_ops->dc_mce = f14h_dc_mce;
793 fam_ops->ic_mce = f14h_ic_mce;
798 fam_ops->dc_mce = f15h_dc_mce;
799 fam_ops->ic_mce = f15h_ic_mce;
/* NOTE(review): the family is printed in decimal, while the checks above use hex. */
803 printk(KERN_WARNING "Huh? What family is that: %d?!\n", c->x86);
808 pr_info("MCE: In-kernel MCE decoding enabled.\n");
810 mce_register_decode_chain(&amd_mce_dec_nb);
814 early_initcall(mce_amd_init);
/*
 * mce_amd_exit - module exit handler: removes amd_mce_dec_nb from the MCE
 * decode chain.
 * NOTE(review): fam_ops is allocated in mce_amd_init(); confirm that it is
 * freed in the rest of this function.
 */
817 static void __exit mce_amd_exit(void)
819 mce_unregister_decode_chain(&amd_mce_dec_nb);
/* Module metadata and exit hook; the init side is wired up via early_initcall(). */
823 MODULE_DESCRIPTION("AMD MCE decoder");
824 MODULE_ALIAS("edac-mce-amd");
825 MODULE_LICENSE("GPL");
826 module_exit(mce_amd_exit);