arch/x86/kernel/cpu/mcheck/mce_64.c

   1 /*
   2  * Machine check handler.
   3  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  * Rest from unknown author(s).
   5  * 2004 Andi Kleen. Rewrote most of it.
   6  */
   7
   8 #include <linux/init.h>
   9 #include <linux/types.h>
  10 #include <linux/kernel.h>
  11 #include <linux/sched.h>
  12 #include <linux/smp_lock.h>
  13 #include <linux/string.h>
  14 #include <linux/rcupdate.h>
  15 #include <linux/kallsyms.h>
  16 #include <linux/sysdev.h>
  17 #include <linux/miscdevice.h>
  18 #include <linux/fs.h>
  19 #include <linux/capability.h>
  20 #include <linux/cpu.h>
  21 #include <linux/percpu.h>
  22 #include <linux/poll.h>
  23 #include <linux/thread_info.h>
  24 #include <linux/ctype.h>
  25 #include <linux/kmod.h>
  26 #include <linux/kdebug.h>
  27 #include <asm/processor.h>
  28 #include <asm/msr.h>
  29 #include <asm/mce.h>
  30 #include <asm/uaccess.h>
  31 #include <asm/smp.h>
  32 #include <asm/idle.h>
  33
  34 #define MISC_MCELOG_MINOR 227
  35 #define NR_SYSFS_BANKS 6
  36
  37 atomic_t mce_entry;
  38
  39 static int mce_dont_init;
  40
  41 /*
  42  * Tolerant levels:
  43  *   0: always panic on uncorrected errors, log corrected errors
  44  *   1: panic or SIGBUS on uncorrected errors, log corrected errors
  45  *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
  46  *   3: never panic or SIGBUS, log all errors (for testing only)
  47  */
  48 static int tolerant = 1;
  49 static int banks;
  50 static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
  51 static unsigned long notify_user;
  52 static int rip_msr;
  53 static int mce_bootlog = -1;
  54 static atomic_t mce_events;
  55
  56 static char trigger[128];
  57 static char *trigger_argv[2] = { trigger, NULL };
  58
  59 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
  60
  61 /*
  62  * Lockless MCE logging infrastructure.
  63  * This avoids deadlocks on printk locks without having to break locks. Also
  64  * separate MCEs from kernel messages to avoid bogus bug reports.
  65  */
  66
  67 static struct mce_log mcelog = {
  68         MCE_LOG_SIGNATURE,
  69         MCE_LOG_LEN,
  70 };
  71
  72 void mce_log(struct mce *mce)
  73 {
  74         unsigned next, entry;
  75         atomic_inc(&mce_events);
  76         mce->finished = 0;
  77         wmb();
  78         for (;;) {
  79                 entry = rcu_dereference(mcelog.next);
  80                 for (;;) {
  81                         /* When the buffer fills up discard new entries. Assume
  82                            that the earlier errors are the more interesting. */
  83                         if (entry >= MCE_LOG_LEN) {
  84                                 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
  85                                 return;
  86                         }
  87                         /* Old left over entry. Skip. */
  88                         if (mcelog.entry[entry].finished) {
  89                                 entry++;
  90                                 continue;
  91                         }
  92                         break;
  93                 }
  94                 smp_rmb();
  95                 next = entry + 1;
  96                 if (cmpxchg(&mcelog.next, entry, next) == entry)
  97                         break;
  98         }
  99         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 100         wmb();
 101         mcelog.entry[entry].finished = 1;
 102         wmb();
 103
 104         set_bit(0, &notify_user);
 105 }
 106
 107 static void print_mce(struct mce *m)
 108 {
 109         printk(KERN_EMERG "\n"
 110                KERN_EMERG "HARDWARE ERROR\n"
 111                KERN_EMERG
 112                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 113                m->cpu, m->mcgstatus, m->bank, m->status);
 114         if (m->ip) {
 115                 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
 116                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 117                        m->cs, m->ip);
 118                 if (m->cs == __KERNEL_CS)
 119                         print_symbol("{%s}", m->ip);
 120                 printk("\n");
 121         }
 122         printk(KERN_EMERG "TSC %Lx ", m->tsc);
 123         if (m->addr)
 124                 printk("ADDR %Lx ", m->addr);
 125         if (m->misc)
 126                 printk("MISC %Lx ", m->misc);
 127         printk("\n");
 128         printk(KERN_EMERG "This is not a software problem!\n");
 129         printk(KERN_EMERG "Run through mcelog --ascii to decode "
 130                "and contact your hardware vendor\n");
 131 }
 132
 133 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 134 {
 135         int i;
 136
 137         oops_begin();
 138         for (i = 0; i < MCE_LOG_LEN; i++) {
 139                 unsigned long tsc = mcelog.entry[i].tsc;
 140
 141                 if (time_before(tsc, start))
 142                         continue;
 143                 print_mce(&mcelog.entry[i]);
 144                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 145                         backup = NULL;
 146         }
 147         if (backup)
 148                 print_mce(backup);
 149         panic(msg);
 150 }
 151
 152 static int mce_available(struct cpuinfo_x86 *c)
 153 {
 154         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 155 }
 156
 157 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 158 {
 159         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 160                 m->ip = regs->ip;
 161                 m->cs = regs->cs;
 162         } else {
 163                 m->ip = 0;
 164                 m->cs = 0;
 165         }
 166         if (rip_msr) {
 167                 /* Assume the RIP in the MSR is exact. Is this true? */
 168                 m->mcgstatus |= MCG_STATUS_EIPV;
 169                 rdmsrl(rip_msr, m->ip);
 170                 m->cs = 0;
 171         }
 172 }
 173
 174 /*
 175  * The actual machine check handler
 176  */
 177 void do_machine_check(struct pt_regs * regs, long error_code)
 178 {
 179         struct mce m, panicm;
 180         u64 mcestart = 0;
 181         int i;
 182         int panicm_found = 0;
 183         /*
 184          * If no_way_out gets set, there is no safe way to recover from this
 185          * MCE.  If tolerant is cranked up, we'll try anyway.
 186          */
 187         int no_way_out = 0;
 188         /*
 189          * If kill_it gets set, there might be a way to recover from this
 190          * error.
 191          */
 192         int kill_it = 0;
 193
 194         atomic_inc(&mce_entry);
 195
 196         if ((regs
 197              && notify_die(DIE_NMI, "machine check", regs, error_code,
 198                            18, SIGKILL) == NOTIFY_STOP)
 199             || !banks)
 200                 goto out2;
 201
 202         memset(&m, 0, sizeof(struct mce));
 203         m.cpu = smp_processor_id();
 204         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 205         /* if the restart IP is not valid, we're done for */
 206         if (!(m.mcgstatus & MCG_STATUS_RIPV))
 207                 no_way_out = 1;
 208
 209         rdtscll(mcestart);
 210         barrier();
 211
 212         for (i = 0; i < banks; i++) {
 213                 if (i < NR_SYSFS_BANKS && !bank[i])
 214                         continue;
 215
 216                 m.misc = 0;
 217                 m.addr = 0;
 218                 m.bank = i;
 219                 m.tsc = 0;
 220
 221                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 222                 if ((m.status & MCI_STATUS_VAL) == 0)
 223                         continue;
 224
 225                 if (m.status & MCI_STATUS_EN) {
 226                         /* if PCC was set, there's no way out */
 227                         no_way_out |= !!(m.status & MCI_STATUS_PCC);
 228                         /*
 229                          * If this error was uncorrectable and there was
 230                          * an overflow, we're in trouble.  If no overflow,
 231                          * we might get away with just killing a task.
 232                          */
 233                         if (m.status & MCI_STATUS_UC) {
 234                                 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
 235                                         no_way_out = 1;
 236                                 kill_it = 1;
 237                         }
 238                 }
 239
 240                 if (m.status & MCI_STATUS_MISCV)
 241                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
 242                 if (m.status & MCI_STATUS_ADDRV)
 243                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 244
 245                 mce_get_rip(&m, regs);
 246                 if (error_code >= 0)
 247                         rdtscll(m.tsc);
 248                 if (error_code != -2)
 249                         mce_log(&m);
 250
 251                 /* Did this bank cause the exception? */
 252                 /* Assume that the bank with uncorrectable errors did it,
 253                    and that there is only a single one. */
 254                 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
 255                         panicm = m;
 256                         panicm_found = 1;
 257                 }
 258
 259                 add_taint(TAINT_MACHINE_CHECK);
 260         }
 261
 262         /* Never do anything final in the polling timer */
 263         if (!regs)
 264                 goto out;
 265
 266         /* If we didn't find an uncorrectable error, pick
 267            the last one (shouldn't happen, just being safe). */
 268         if (!panicm_found)
 269                 panicm = m;
 270
 271         /*
 272          * If we have decided that we just CAN'T continue, and the user
 273          *  has not set tolerant to an insane level, give up and die.
 274          */
 275         if (no_way_out && tolerant < 3)
 276                 mce_panic("Machine check", &panicm, mcestart);
 277
 278         /*
 279          * If the error seems to be unrecoverable, something should be
 280          * done.  Try to kill as little as possible.  If we can kill just
 281          * one task, do that.  If the user has set the tolerance very
 282          * high, don't try to do anything at all.
 283          */
 284         if (kill_it && tolerant < 3) {
 285                 int user_space = 0;
 286
 287                 /*
 288                  * If the EIPV bit is set, it means the saved IP is the
 289                  * instruction which caused the MCE.
 290                  */
 291                 if (m.mcgstatus & MCG_STATUS_EIPV)
 292                         user_space = panicm.ip && (panicm.cs & 3);
 293
 294                 /*
 295                  * If we know that the error was in user space, send a
 296                  * SIGBUS.  Otherwise, panic if tolerance is low.
 297                  *
 298                  * force_sig() takes an awful lot of locks and has a slight
 299                  * risk of deadlocking.
 300                  */
 301                 if (user_space) {
 302                         force_sig(SIGBUS, current);
 303                 } else if (panic_on_oops || tolerant < 2) {
 304                         mce_panic("Uncorrected machine check",
 305                                 &panicm, mcestart);
 306                 }
 307         }
 308
 309         /* notify userspace ASAP */
 310         set_thread_flag(TIF_MCE_NOTIFY);
 311
 312  out:
 313         /* the last thing we do is clear state */
 314         for (i = 0; i < banks; i++)
 315                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 316         wrmsrl(MSR_IA32_MCG_STATUS, 0);
 317  out2:
 318         atomic_dec(&mce_entry);
 319 }
 320
 321 #ifdef CONFIG_X86_MCE_INTEL
 322 /***
 323  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 324  * @cpu: The CPU on which the event occurred.
 325  * @status: Event status information
 326  *
 327  * This function should be called by the thermal interrupt after the
 328  * event has been processed and the decision was made to log the event
 329  * further.
 330  *
 331  * The status parameter will be saved to the 'status' field of 'struct mce'
 332  * and historically has been the register value of the
 333  * MSR_IA32_THERMAL_STATUS (Intel) msr.
 334  */
 335 void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 336 {
 337         struct mce m;
 338
 339         memset(&m, 0, sizeof(m));
 340         m.cpu = cpu;
 341         m.bank = MCE_THERMAL_BANK;
 342         m.status = status;
 343         rdtscll(m.tsc);
 344         mce_log(&m);
 345 }
 346 #endif /* CONFIG_X86_MCE_INTEL */
 347
 348 /*
 349  * Periodic polling timer for "silent" machine check errors.  If the
 350  * poller finds an MCE, poll 2x faster.  When the poller finds no more
 351  * errors, poll 2x slower (up to check_interval seconds).
 352  */
 353
 354 static int check_interval = 5 * 60; /* 5 minutes */
 355 static int next_interval; /* in jiffies */
 356 static void mcheck_timer(struct work_struct *work);
 357 static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
 358
 359 static void mcheck_check_cpu(void *info)
 360 {
 361         if (mce_available(&current_cpu_data))
 362                 do_machine_check(NULL, 0);
 363 }
 364
 365 static void mcheck_timer(struct work_struct *work)
 366 {
 367         on_each_cpu(mcheck_check_cpu, NULL, 1);
 368
 369         /*
 370          * Alert userspace if needed.  If we logged an MCE, reduce the
 371          * polling interval, otherwise increase the polling interval.
 372          */
 373         if (mce_notify_user()) {
 374                 next_interval = max(next_interval/2, HZ/100);
 375         } else {
 376                 next_interval = min(next_interval * 2,
 377                                 (int)round_jiffies_relative(check_interval*HZ));
 378         }
 379
 380         schedule_delayed_work(&mcheck_work, next_interval);
 381 }
 382
 383 static void mce_do_trigger(struct work_struct *work)
 384 {
 385         call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 386 }
 387
 388 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
 389
 390 /*
 391  * Notify the user(s) about new machine check events.
 392  * Can be called from interrupt context, but not from machine check/NMI
 393  * context.
 394  */
 395 int mce_notify_user(void)
 396 {
 397         clear_thread_flag(TIF_MCE_NOTIFY);
 398         if (test_and_clear_bit(0, &notify_user)) {
 399                 static unsigned long last_print;
 400                 unsigned long now = jiffies;
 401
 402                 wake_up_interruptible(&mce_wait);
 403
 404                 /*
 405                  * There is no risk of missing notifications because
 406                  * work_pending is always cleared before the function is
 407                  * executed.
 408                  */
 409                 if (trigger[0] && !work_pending(&mce_trigger_work))
 410                         schedule_work(&mce_trigger_work);
 411
 412                 if (time_after_eq(now, last_print + (check_interval*HZ))) {
 413                         last_print = now;
 414                         printk(KERN_INFO "Machine check events logged\n");
 415                 }
 416
 417                 return 1;
 418         }
 419         return 0;
 420 }
 421
 422 /* see if the idle task needs to notify userspace */
 423 static int
 424 mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
 425 {
 426         /* IDLE_END should be safe - interrupts are back on */
 427         if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
 428                 mce_notify_user();
 429
 430         return NOTIFY_OK;
 431 }
 432
 433 static struct notifier_block mce_idle_notifier = {
 434         .notifier_call = mce_idle_callback,
 435 };
 436
 437 static __init int periodic_mcheck_init(void)
 438 {
 439         next_interval = check_interval * HZ;
 440         if (next_interval)
 441                 schedule_delayed_work(&mcheck_work,
 442                                       round_jiffies_relative(next_interval));
 443         idle_notifier_register(&mce_idle_notifier);
 444         return 0;
 445 }
 446 __initcall(periodic_mcheck_init);
 447
 448
 449 /*
 450  * Initialize Machine Checks for a CPU.
 451  */
 452 static void mce_init(void *dummy)
 453 {
 454         u64 cap;
 455         int i;
 456
 457         rdmsrl(MSR_IA32_MCG_CAP, cap);
 458         banks = cap & 0xff;
 459         if (banks > MCE_EXTENDED_BANK) {
 460                 banks = MCE_EXTENDED_BANK;
 461                 printk(KERN_INFO "MCE: warning: using only %d banks\n",
 462                        MCE_EXTENDED_BANK);
 463         }
 464         /* Use accurate RIP reporting if available. */
 465         if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 466                 rip_msr = MSR_IA32_MCG_EIP;
 467
 468         /* Log the machine checks left over from the previous reset.
 469            This also clears all registers */
 470         do_machine_check(NULL, mce_bootlog ? -1 : -2);
 471
 472         set_in_cr4(X86_CR4_MCE);
 473
 474         if (cap & MCG_CTL_P)
 475                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 476
 477         for (i = 0; i < banks; i++) {
 478                 if (i < NR_SYSFS_BANKS)
 479                         wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 480                 else
 481                         wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
 482
 483                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 484         }
 485 }
 486
 487 /* Add per CPU specific workarounds here */
 488 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 489 {
 490         /* This should be disabled by the BIOS, but isn't always */
 491         if (c->x86_vendor == X86_VENDOR_AMD) {
 492                 if(c->x86 == 15)
 493                         /* disable GART TBL walk error reporting, which trips off
 494                            incorrectly with the IOMMU & 3ware & Cerberus. */
 495                         clear_bit(10, &bank[4]);
 496                 if(c->x86 <= 17 && mce_bootlog < 0)
 497                         /* Lots of broken BIOS around that don't clear them
 498                            by default and leave crap in there. Don't log. */
 499                         mce_bootlog = 0;
 500         }
 501
 502 }
 503
 504 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 505 {
 506         switch (c->x86_vendor) {
 507         case X86_VENDOR_INTEL:
 508                 mce_intel_feature_init(c);
 509                 break;
 510         case X86_VENDOR_AMD:
 511                 mce_amd_feature_init(c);
 512                 break;
 513         default:
 514                 break;
 515         }
 516 }
 517
 518 /*
 519  * Called for each booted CPU to set up machine checks.
 520  * Must be called with preempt off.
 521  */
 522 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 523 {
 524         mce_cpu_quirks(c);
 525
 526         if (mce_dont_init ||
 527             !mce_available(c))
 528                 return;
 529
 530         mce_init(NULL);
 531         mce_cpu_features(c);
 532 }
 533
 534 /*
 535  * Character device to read and clear the MCE log.
 536  */
 537
 538 static DEFINE_SPINLOCK(mce_state_lock);
 539 static int open_count;  /* #times opened */
 540 static int open_exclu;  /* already open exclusive? */
 541
 542 static int mce_open(struct inode *inode, struct file *file)
 543 {
 544         lock_kernel();
 545         spin_lock(&mce_state_lock);
 546
 547         if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 548                 spin_unlock(&mce_state_lock);
 549                 unlock_kernel();
 550                 return -EBUSY;
 551         }
 552
 553         if (file->f_flags & O_EXCL)
 554                 open_exclu = 1;
 555         open_count++;
 556
 557         spin_unlock(&mce_state_lock);
 558         unlock_kernel();
 559
 560         return nonseekable_open(inode, file);
 561 }
 562
 563 static int mce_release(struct inode *inode, struct file *file)
 564 {
 565         spin_lock(&mce_state_lock);
 566
 567         open_count--;
 568         open_exclu = 0;
 569
 570         spin_unlock(&mce_state_lock);
 571
 572         return 0;
 573 }
 574
 575 static void collect_tscs(void *data)
 576 {
 577         unsigned long *cpu_tsc = (unsigned long *)data;
 578
 579         rdtscll(cpu_tsc[smp_processor_id()]);
 580 }
 581
 582 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 583                         loff_t *off)
 584 {
 585         unsigned long *cpu_tsc;
 586         static DEFINE_MUTEX(mce_read_mutex);
 587         unsigned next;
 588         char __user *buf = ubuf;
 589         int i, err;
 590
 591         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
 592         if (!cpu_tsc)
 593                 return -ENOMEM;
 594
 595         mutex_lock(&mce_read_mutex);
 596         next = rcu_dereference(mcelog.next);
 597
 598         /* Only supports full reads right now */
 599         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
 600                 mutex_unlock(&mce_read_mutex);
 601                 kfree(cpu_tsc);
 602                 return -EINVAL;
 603         }
 604
 605         err = 0;
 606         for (i = 0; i < next; i++) {
 607                 unsigned long start = jiffies;
 608
 609                 while (!mcelog.entry[i].finished) {
 610                         if (time_after_eq(jiffies, start + 2)) {
 611                                 memset(mcelog.entry + i,0, sizeof(struct mce));
 612                                 goto timeout;
 613                         }
 614                         cpu_relax();
 615                 }
 616                 smp_rmb();
 617                 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
 618                 buf += sizeof(struct mce);
 619  timeout:
 620                 ;
 621         }
 622
 623         memset(mcelog.entry, 0, next * sizeof(struct mce));
 624         mcelog.next = 0;
 625
 626         synchronize_sched();
 627
 628         /*
 629          * Collect entries that were still getting written before the
 630          * synchronize.
 631          */
 632         on_each_cpu(collect_tscs, cpu_tsc, 1);
 633         for (i = next; i < MCE_LOG_LEN; i++) {
 634                 if (mcelog.entry[i].finished &&
 635                     mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
 636                         err |= copy_to_user(buf, mcelog.entry+i,
 637                                             sizeof(struct mce));
 638                         smp_rmb();
 639                         buf += sizeof(struct mce);
 640                         memset(&mcelog.entry[i], 0, sizeof(struct mce));
 641                 }
 642         }
 643         mutex_unlock(&mce_read_mutex);
 644         kfree(cpu_tsc);
 645         return err ? -EFAULT : buf - ubuf;
 646 }
 647
 648 static unsigned int mce_poll(struct file *file, poll_table *wait)
 649 {
 650         poll_wait(file, &mce_wait, wait);
 651         if (rcu_dereference(mcelog.next))
 652                 return POLLIN | POLLRDNORM;
 653         return 0;
 654 }
 655
 656 static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 657 {
 658         int __user *p = (int __user *)arg;
 659
 660         if (!capable(CAP_SYS_ADMIN))
 661                 return -EPERM;
 662         switch (cmd) {
 663         case MCE_GET_RECORD_LEN:
 664                 return put_user(sizeof(struct mce), p);
 665         case MCE_GET_LOG_LEN:
 666                 return put_user(MCE_LOG_LEN, p);
 667         case MCE_GETCLEAR_FLAGS: {
 668                 unsigned flags;
 669
 670                 do {
 671                         flags = mcelog.flags;
 672                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 673                 return put_user(flags, p);
 674         }
 675         default:
 676                 return -ENOTTY;
 677         }
 678 }
 679
 680 static const struct file_operations mce_chrdev_ops = {
 681         .open = mce_open,
 682         .release = mce_release,
 683         .read = mce_read,
 684         .poll = mce_poll,
 685         .unlocked_ioctl = mce_ioctl,
 686 };
 687
 688 static struct miscdevice mce_log_device = {
 689         MISC_MCELOG_MINOR,
 690         "mcelog",
 691         &mce_chrdev_ops,
 692 };
 693
 694 /*
 695  * Old style boot options parsing. Only for compatibility.
 696  */
 697 static int __init mcheck_disable(char *str)
 698 {
 699         mce_dont_init = 1;
 700         return 1;
 701 }
 702
 703 /* mce=off disables machine check. Note you can re-enable it later
 704    using sysfs.
 705    mce=TOLERANCELEVEL (number, see above)
 706    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 707    mce=nobootlog Don't log MCEs from before booting. */
 708 static int __init mcheck_enable(char *str)
 709 {
 710         if (!strcmp(str, "off"))
 711                 mce_dont_init = 1;
 712         else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
 713                 mce_bootlog = str[0] == 'b';
 714         else if (isdigit(str[0]))
 715                 get_option(&str, &tolerant);
 716         else
 717                 printk("mce= argument %s ignored. Please use /sys", str);
 718         return 1;
 719 }
 720
 721 __setup("nomce", mcheck_disable);
 722 __setup("mce=", mcheck_enable);
 723
 724 /*
 725  * Sysfs support
 726  */
 727
 728 /*
 729  * Disable machine checks on suspend and shutdown. We can't really handle
 730  * them later.
 731  */
 732 static int mce_disable(void)
 733 {
 734         int i;
 735
 736         for (i = 0; i < banks; i++)
 737                 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
 738         return 0;
 739 }
 740
 741 static int mce_suspend(struct sys_device *dev, pm_message_t state)
 742 {
 743         return mce_disable();
 744 }
 745
 746 static int mce_shutdown(struct sys_device *dev)
 747 {
 748         return mce_disable();
 749 }
 750
 751 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 752    Only one CPU is active at this time, the others get readded later using
 753    CPU hotplug. */
 754 static int mce_resume(struct sys_device *dev)
 755 {
 756         mce_init(NULL);
 757         mce_cpu_features(&current_cpu_data);
 758         return 0;
 759 }
 760
 761 /* Reinit MCEs after user configuration changes */
 762 static void mce_restart(void)
 763 {
 764         if (next_interval)
 765                 cancel_delayed_work(&mcheck_work);
 766         /* Timer race is harmless here */
 767         on_each_cpu(mce_init, NULL, 1);
 768         next_interval = check_interval * HZ;
 769         if (next_interval)
 770                 schedule_delayed_work(&mcheck_work,
 771                                       round_jiffies_relative(next_interval));
 772 }
 773
 774 static struct sysdev_class mce_sysclass = {
 775         .suspend = mce_suspend,
 776         .shutdown = mce_shutdown,
 777         .resume = mce_resume,
 778         .name = "machinecheck",
 779 };
 780
 781 DEFINE_PER_CPU(struct sys_device, device_mce);
 782 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
 783
 784 /* Why are there no generic functions for this? */
 785 #define ACCESSOR(name, var, start) \
 786         static ssize_t show_ ## name(struct sys_device *s,              \
 787                                      struct sysdev_attribute *attr,     \
 788                                      char *buf) {                       \
 789                 return sprintf(buf, "%lx\n", (unsigned long)var);       \
 790         }                                                               \
 791         static ssize_t set_ ## name(struct sys_device *s,               \
 792                                     struct sysdev_attribute *attr,      \
 793                                     const char *buf, size_t siz) {      \
 794                 char *end;                                              \
 795                 unsigned long new = simple_strtoul(buf, &end, 0);       \
 796                 if (end == buf) return -EINVAL;                         \
 797                 var = new;                                              \
 798                 start;                                                  \
 799                 return end-buf;                                         \
 800         }                                                               \
 801         static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 802
 803 /*
 804  * TBD should generate these dynamically based on number of available banks.
 805  * Have only 6 contol banks in /sysfs until then.
 806  */
 807 ACCESSOR(bank0ctl,bank[0],mce_restart())
 808 ACCESSOR(bank1ctl,bank[1],mce_restart())
 809 ACCESSOR(bank2ctl,bank[2],mce_restart())
 810 ACCESSOR(bank3ctl,bank[3],mce_restart())
 811 ACCESSOR(bank4ctl,bank[4],mce_restart())
 812 ACCESSOR(bank5ctl,bank[5],mce_restart())
 813
 814 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 815                                 char *buf)
 816 {
 817         strcpy(buf, trigger);
 818         strcat(buf, "\n");
 819         return strlen(trigger) + 1;
 820 }
 821
 822 static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 823                                 const char *buf,size_t siz)
 824 {
 825         char *p;
 826         int len;
 827         strncpy(trigger, buf, sizeof(trigger));
 828         trigger[sizeof(trigger)-1] = 0;
 829         len = strlen(trigger);
 830         p = strchr(trigger, '\n');
 831         if (*p) *p = 0;
 832         return len;
 833 }
 834
 835 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 836 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 837 ACCESSOR(check_interval,check_interval,mce_restart())
 838 static struct sysdev_attribute *mce_attributes[] = {
 839         &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
 840         &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
 841         &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 842         NULL
 843 };
 844
 845 static cpumask_t mce_device_initialized = CPU_MASK_NONE;
 846
 847 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 848 static __cpuinit int mce_create_device(unsigned int cpu)
 849 {
 850         int err;
 851         int i;
 852
 853         if (!mce_available(&boot_cpu_data))
 854                 return -EIO;
 855
 856         memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
 857         per_cpu(device_mce,cpu).id = cpu;
 858         per_cpu(device_mce,cpu).cls = &mce_sysclass;
 859
 860         err = sysdev_register(&per_cpu(device_mce,cpu));
 861         if (err)
 862                 return err;
 863
 864         for (i = 0; mce_attributes[i]; i++) {
 865                 err = sysdev_create_file(&per_cpu(device_mce,cpu),
 866                                          mce_attributes[i]);
 867                 if (err)
 868                         goto error;
 869         }
 870         cpu_set(cpu, mce_device_initialized);
 871
 872         return 0;
 873 error:
 874         while (i--) {
 875                 sysdev_remove_file(&per_cpu(device_mce,cpu),
 876                                    mce_attributes[i]);
 877         }
 878         sysdev_unregister(&per_cpu(device_mce,cpu));
 879
 880         return err;
 881 }
 882
 883 static __cpuinit void mce_remove_device(unsigned int cpu)
 884 {
 885         int i;
 886
 887         if (!cpu_isset(cpu, mce_device_initialized))
 888                 return;
 889
 890         for (i = 0; mce_attributes[i]; i++)
 891                 sysdev_remove_file(&per_cpu(device_mce,cpu),
 892                         mce_attributes[i]);
 893         sysdev_unregister(&per_cpu(device_mce,cpu));
 894         cpu_clear(cpu, mce_device_initialized);
 895 }
 896
 897 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 898 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 899                                       unsigned long action, void *hcpu)
 900 {
 901         unsigned int cpu = (unsigned long)hcpu;
 902
 903         switch (action) {
 904         case CPU_ONLINE:
 905         case CPU_ONLINE_FROZEN:
 906                 mce_create_device(cpu);
 907                 if (threshold_cpu_callback)
 908                         threshold_cpu_callback(action, cpu);
 909                 break;
 910         case CPU_DEAD:
 911         case CPU_DEAD_FROZEN:
 912                 if (threshold_cpu_callback)
 913                         threshold_cpu_callback(action, cpu);
 914                 mce_remove_device(cpu);
 915                 break;
 916         }
 917         return NOTIFY_OK;
 918 }
 919
 920 static struct notifier_block mce_cpu_notifier __cpuinitdata = {
 921         .notifier_call = mce_cpu_callback,
 922 };
 923
 924 static __init int mce_init_device(void)
 925 {
 926         int err;
 927         int i = 0;
 928
 929         if (!mce_available(&boot_cpu_data))
 930                 return -EIO;
 931         err = sysdev_class_register(&mce_sysclass);
 932         if (err)
 933                 return err;
 934
 935         for_each_online_cpu(i) {
 936                 err = mce_create_device(i);
 937                 if (err)
 938                         return err;
 939         }
 940
 941         register_hotcpu_notifier(&mce_cpu_notifier);
 942         misc_register(&mce_log_device);
 943         return err;
 944 }
 945
 946 device_initcall(mce_init_device);