sparc64: Measure receiver forward progress to avoid send mondo timeout

author Jane Chu <jane.chu@oracle.com>

Tue, 11 Jul 2017 18:00:54 +0000 (12:00 -0600)

committer David S. Miller <davem@davemloft.net>

Fri, 14 Jul 2017 18:18:02 +0000 (11:18 -0700)
author Jane Chu <jane.chu@oracle.com>
Tue, 11 Jul 2017 18:00:54 +0000 (12:00 -0600)
committer David S. Miller <davem@davemloft.net>
Fri, 14 Jul 2017 18:18:02 +0000 (11:18 -0700)
diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h

index ec9c04de3664910d81b7a55bbb09084d5e235d39..ff05992dae7a352597bf99fcd2d83500ad0c2995 100644 (file)
--- a/arch/sparc/include/asm/trap_block.h
+++ b/arch/sparc/include/asm/trap_block.h
@@ -54,6 +54,7 @@ extern struct trap_per_cpu trap_block[NR_CPUS];
  void init_cur_cpu_trap(struct thread_info *);
  void setup_tba(void);
  extern int ncpus_probed;
+extern u64 cpu_mondo_counter[NR_CPUS];
  
  unsigned long real_hard_smp_processor_id(void);
  
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c

index fdf31040a7dc5cc460de6d60c9724a60933b38dc..3218bc43302e1cbbfc48420d868f5148a8d61085 100644 (file)
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -622,22 +622,48 @@ retry:
         }
  }
  
-/* Multi-cpu list version.  */
+#define        CPU_MONDO_COUNTER(cpuid)        (cpu_mondo_counter[cpuid])
+#define        MONDO_USEC_WAIT_MIN             2
+#define        MONDO_USEC_WAIT_MAX             100
+#define        MONDO_RETRY_LIMIT               500000
+
+/* Multi-cpu list version.
+ *
+ * Deliver xcalls to 'cnt' number of cpus in 'cpu_list'.
+ * Sometimes not all cpus receive the mondo, requiring us to re-send
+ * the mondo until all cpus have received, or cpus are truly stuck
+ * unable to receive mondo, and we timeout.
+ * Occasionally a target cpu strand is borrowed briefly by hypervisor to
+ * perform guest service, such as PCIe error handling. Consider the
+ * service time, 1 second overall wait is reasonable for 1 cpu.
+ * Here two in-between mondo check wait time are defined: 2 usec for
+ * single cpu quick turn around and up to 100usec for large cpu count.
+ * Deliver mondo to large number of cpus could take longer, we adjusts
+ * the retry count as long as target cpus are making forward progress.
+ */
  static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
  {
-       int retries, this_cpu, prev_sent, i, saw_cpu_error;
+       int this_cpu, tot_cpus, prev_sent, i, rem;
+       int usec_wait, retries, tot_retries;
+       u16 first_cpu = 0xffff;
+       unsigned long xc_rcvd = 0;
         unsigned long status;
+       int ecpuerror_id = 0;
+       int enocpu_id = 0;
         u16 *cpu_list;
+       u16 cpu;
  
         this_cpu = smp_processor_id();
-
         cpu_list = __va(tb->cpu_list_pa);
-
-       saw_cpu_error = 0;
-       retries = 0;
+       usec_wait = cnt * MONDO_USEC_WAIT_MIN;
+       if (usec_wait > MONDO_USEC_WAIT_MAX)
+               usec_wait = MONDO_USEC_WAIT_MAX;
+       retries = tot_retries = 0;
+       tot_cpus = cnt;
         prev_sent = 0;
+
         do {
-               int forward_progress, n_sent;
+               int n_sent, mondo_delivered, target_cpu_busy;
  
                 status = sun4v_cpu_mondo_send(cnt,
                                               tb->cpu_list_pa,
@@ -645,94 +671,113 @@ static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
  
                 /* HV_EOK means all cpus received the xcall, we're done.  */
                 if (likely(status == HV_EOK))
-                       break;
+                       goto xcall_done;
+
+               /* If not these non-fatal errors, panic */
+               if (unlikely((status != HV_EWOULDBLOCK) &&
+                       (status != HV_ECPUERROR) &&
+                       (status != HV_ENOCPU)))
+                       goto fatal_errors;
  
                 /* First, see if we made any forward progress.
+                *
+                * Go through the cpu_list, count the target cpus that have
+                * received our mondo (n_sent), and those that did not (rem).
+                * Re-pack cpu_list with the cpus remain to be retried in the
+                * front - this simplifies tracking the truly stalled cpus.
                  *
                  * The hypervisor indicates successful sends by setting
                  * cpu list entries to the value 0xffff.
+                *
+                * EWOULDBLOCK means some target cpus did not receive the
+                * mondo and retry usually helps.
+                *
+                * ECPUERROR means at least one target cpu is in error state,
+                * it's usually safe to skip the faulty cpu and retry.
+                *
+                * ENOCPU means one of the target cpu doesn't belong to the
+                * domain, perhaps offlined which is unexpected, but not
+                * fatal and it's okay to skip the offlined cpu.
                  */
+               rem = 0;
                 n_sent = 0;
                 for (i = 0; i < cnt; i++) {
-                       if (likely(cpu_list[i] == 0xffff))
+                       cpu = cpu_list[i];
+                       if (likely(cpu == 0xffff)) {
                                 n_sent++;
+                       } else if ((status == HV_ECPUERROR) &&
+                               (sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) {
+                               ecpuerror_id = cpu + 1;
+                       } else if (status == HV_ENOCPU && !cpu_online(cpu)) {
+                               enocpu_id = cpu + 1;
+                       } else {
+                               cpu_list[rem++] = cpu;
+                       }
                 }
  
-               forward_progress = 0;
-               if (n_sent > prev_sent)
-                       forward_progress = 1;
+               /* No cpu remained, we're done. */
+               if (rem == 0)
+                       break;
  
-               prev_sent = n_sent;
+               /* Otherwise, update the cpu count for retry. */
+               cnt = rem;
  
-               /* If we get a HV_ECPUERROR, then one or more of the cpus
-                * in the list are in error state.  Use the cpu_state()
-                * hypervisor call to find out which cpus are in error state.
+               /* Record the overall number of mondos received by the
+                * first of the remaining cpus.
                  */
-               if (unlikely(status == HV_ECPUERROR)) {
-                       for (i = 0; i < cnt; i++) {
-                               long err;
-                               u16 cpu;
+               if (first_cpu != cpu_list[0]) {
+                       first_cpu = cpu_list[0];
+                       xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
+               }
  
-                               cpu = cpu_list[i];
-                               if (cpu == 0xffff)
-                                       continue;
+               /* Was any mondo delivered successfully? */
+               mondo_delivered = (n_sent > prev_sent);
+               prev_sent = n_sent;
  
-                               err = sun4v_cpu_state(cpu);
-                               if (err == HV_CPU_STATE_ERROR) {
-                                       saw_cpu_error = (cpu + 1);
-                                       cpu_list[i] = 0xffff;
-                               }
-                       }
-               } else if (unlikely(status != HV_EWOULDBLOCK))
-                       goto fatal_mondo_error;
+               /* or, was any target cpu busy processing other mondos? */
+               target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu));
+               xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
  
-               /* Don't bother rewriting the CPU list, just leave the
-                * 0xffff and non-0xffff entries in there and the
-                * hypervisor will do the right thing.
-                *
-                * Only advance timeout state if we didn't make any
-                * forward progress.
+               /* Retry count is for no progress. If we're making progress,
+                * reset the retry count.
                  */
-               if (unlikely(!forward_progress)) {
-                       if (unlikely(++retries > 10000))
-                               goto fatal_mondo_timeout;
-
-                       /* Delay a little bit to let other cpus catch up
-                        * on their cpu mondo queue work.
-                        */
-                       udelay(2 * cnt);
+               if (likely(mondo_delivered || target_cpu_busy)) {
+                       tot_retries += retries;
+                       retries = 0;
+               } else if (unlikely(retries > MONDO_RETRY_LIMIT)) {
+                       goto fatal_mondo_timeout;
                 }
-       } while (1);
  
-       if (unlikely(saw_cpu_error))
-               goto fatal_mondo_cpu_error;
+               /* Delay a little bit to let other cpus catch up on
+                * their cpu mondo queue work.
+                */
+               if (!mondo_delivered)
+                       udelay(usec_wait);
  
-       return;
+               retries++;
+       } while (1);
  
-fatal_mondo_cpu_error:
-       printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
-              "(including %d) were in error state\n",
-              this_cpu, saw_cpu_error - 1);
+xcall_done:
+       if (unlikely(ecpuerror_id > 0)) {
+               pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n",
+                      this_cpu, ecpuerror_id - 1);
+       } else if (unlikely(enocpu_id > 0)) {
+               pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n",
+                      this_cpu, enocpu_id - 1);
+       }
         return;
  
+fatal_errors:
+       /* fatal errors include bad alignment, etc */
+       pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n",
+              this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
+       panic("Unexpected SUN4V mondo error %lu\n", status);
+
  fatal_mondo_timeout:
-       printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
-              " progress after %d retries.\n",
-              this_cpu, retries);
-       goto dump_cpu_list_and_out;
-
-fatal_mondo_error:
-       printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
-              this_cpu, status);
-       printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
-              "mondo_block_pa(%lx)\n",
-              this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
-
-dump_cpu_list_and_out:
-       printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
-       for (i = 0; i < cnt; i++)
-               printk("%u ", cpu_list[i]);
-       printk("]\n");
+       /* some cpus being non-responsive to the cpu mondo */
+       pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n",
+              this_cpu, first_cpu, (tot_retries + retries), tot_cpus);
+       panic("SUN4V mondo timeout panic\n");
  }
  
  static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
diff --git a/arch/sparc/kernel/sun4v_ivec.S b/arch/sparc/kernel/sun4v_ivec.S

index 559bc5e9c199232d092ec425d705a016b639db9a..34631995859afb2c273f3087796ff550371bc634 100644 (file)
--- a/arch/sparc/kernel/sun4v_ivec.S
+++ b/arch/sparc/kernel/sun4v_ivec.S
@@ -26,6 +26,21 @@ sun4v_cpu_mondo:
         ldxa    [%g0] ASI_SCRATCHPAD, %g4
         sub     %g4, TRAP_PER_CPU_FAULT_INFO, %g4
  
+       /* Get smp_processor_id() into %g3 */
+       sethi   %hi(trap_block), %g5
+       or      %g5, %lo(trap_block), %g5
+       sub     %g4, %g5, %g3
+       srlx    %g3, TRAP_BLOCK_SZ_SHIFT, %g3
+
+       /* Increment cpu_mondo_counter[smp_processor_id()] */
+       sethi   %hi(cpu_mondo_counter), %g5
+       or      %g5, %lo(cpu_mondo_counter), %g5
+       sllx    %g3, 3, %g3
+       add     %g5, %g3, %g5
+       ldx     [%g5], %g3
+       add     %g3, 1, %g3
+       stx     %g3, [%g5]
+
         /* Get CPU mondo queue base phys address into %g7.  */
         ldx     [%g4 + TRAP_PER_CPU_CPU_MONDO_PA], %g7
  
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c

index 196ee5eb4d489b156d677f079f545e6ff792289d..ad31af1dd726575986c87bba514b345f291342ec 100644 (file)
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2733,6 +2733,7 @@ void do_getpsr(struct pt_regs *regs)
         }
  }
  
+u64 cpu_mondo_counter[NR_CPUS] = {0};
  struct trap_per_cpu trap_block[NR_CPUS];
  EXPORT_SYMBOL(trap_block);
author	Jane Chu <jane.chu@oracle.com>
	Tue, 11 Jul 2017 18:00:54 +0000 (12:00 -0600)
committer	David S. Miller <davem@davemloft.net>
	Fri, 14 Jul 2017 18:18:02 +0000 (11:18 -0700)
arch/sparc/include/asm/trap_block.h		patch \| blob \| history
arch/sparc/kernel/smp_64.c		patch \| blob \| history
arch/sparc/kernel/sun4v_ivec.S		patch \| blob \| history
arch/sparc/kernel/traps_64.c		patch \| blob \| history