drivers/infiniband/hw/hfi1/affinity.c

   1 /*
   2  * Copyright(c) 2015, 2016 Intel Corporation.
   3  *
   4  * This file is provided under a dual BSD/GPLv2 license.  When using or
   5  * redistributing this file, you may do so under either license.
   6  *
   7  * GPL LICENSE SUMMARY
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of version 2 of the GNU General Public License as
  11  * published by the Free Software Foundation.
  12  *
  13  * This program is distributed in the hope that it will be useful, but
  14  * WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * General Public License for more details.
  17  *
  18  * BSD LICENSE
  19  *
  20  * Redistribution and use in source and binary forms, with or without
  21  * modification, are permitted provided that the following conditions
  22  * are met:
  23  *
  24  *  - Redistributions of source code must retain the above copyright
  25  *    notice, this list of conditions and the following disclaimer.
  26  *  - Redistributions in binary form must reproduce the above copyright
  27  *    notice, this list of conditions and the following disclaimer in
  28  *    the documentation and/or other materials provided with the
  29  *    distribution.
  30  *  - Neither the name of Intel Corporation nor the names of its
  31  *    contributors may be used to endorse or promote products derived
  32  *    from this software without specific prior written permission.
  33  *
  34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45  *
  46  */
  47 #include <linux/topology.h>
  48 #include <linux/cpumask.h>
  49 #include <linux/module.h>
  50
  51 #include "hfi.h"
  52 #include "affinity.h"
  53 #include "sdma.h"
  54 #include "trace.h"
  55
/*
 * Human-readable names of the IRQ types, indexed by enum irq_type.
 * The table is looked up with msix->type when logging which CPU an
 * MSI-X vector was assigned to, so the entries must stay in the same
 * order as the enum constants.
 */
static const char * const irq_type_names[] = {
        "SDMA",
        "RCVCTXT",
        "GENERAL",
        "OTHER",
};
  63
  64 static inline void init_cpu_mask_set(struct cpu_mask_set *set)
  65 {
  66         cpumask_clear(&set->mask);
  67         cpumask_clear(&set->used);
  68         set->gen = 0;
  69 }
  70
  71 /* Initialize non-HT cpu cores mask */
  72 int init_real_cpu_mask(struct hfi1_devdata *dd)
  73 {
  74         struct hfi1_affinity *info;
  75         int possible, curr_cpu, i, ht;
  76
  77         info = kzalloc(sizeof(*info), GFP_KERNEL);
  78         if (!info)
  79                 return -ENOMEM;
  80
  81         cpumask_clear(&info->real_cpu_mask);
  82
  83         /* Start with cpu online mask as the real cpu mask */
  84         cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
  85
  86         /*
  87          * Remove HT cores from the real cpu mask.  Do this in two steps below.
  88          */
  89         possible = cpumask_weight(&info->real_cpu_mask);
  90         ht = cpumask_weight(topology_sibling_cpumask(
  91                                         cpumask_first(&info->real_cpu_mask)));
  92         /*
  93          * Step 1.  Skip over the first N HT siblings and use them as the
  94          * "real" cores.  Assumes that HT cores are not enumerated in
  95          * succession (except in the single core case).
  96          */
  97         curr_cpu = cpumask_first(&info->real_cpu_mask);
  98         for (i = 0; i < possible / ht; i++)
  99                 curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
 100         /*
 101          * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
 102          * skip any gaps.
 103          */
 104         for (; i < possible; i++) {
 105                 cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
 106                 curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
 107         }
 108
 109         dd->affinity = info;
 110         return 0;
 111 }
 112
 113 /*
 114  * Interrupt affinity.
 115  *
 116  * non-rcv avail gets a default mask that
 117  * starts as possible cpus with threads reset
 118  * and each rcv avail reset.
 119  *
 120  * rcv avail gets node relative 1 wrapping back
 121  * to the node relative 1 as necessary.
 122  *
 123  */
 124 void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 125 {
 126         int node = pcibus_to_node(dd->pcidev->bus);
 127         struct hfi1_affinity *info = dd->affinity;
 128         const struct cpumask *local_mask;
 129         int curr_cpu, possible, i;
 130
 131         if (node < 0)
 132                 node = numa_node_id();
 133         dd->node = node;
 134
 135         spin_lock_init(&info->lock);
 136
 137         init_cpu_mask_set(&info->def_intr);
 138         init_cpu_mask_set(&info->rcv_intr);
 139         init_cpu_mask_set(&info->proc);
 140
 141         local_mask = cpumask_of_node(dd->node);
 142         if (cpumask_first(local_mask) >= nr_cpu_ids)
 143                 local_mask = topology_core_cpumask(0);
 144         /* Use the "real" cpu mask of this node as the default */
 145         cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
 146
 147         /*  fill in the receive list */
 148         possible = cpumask_weight(&info->def_intr.mask);
 149         curr_cpu = cpumask_first(&info->def_intr.mask);
 150         if (possible == 1) {
 151                 /*  only one CPU, everyone will use it */
 152                 cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
 153         } else {
 154                 /*
 155                  * Retain the first CPU in the default list for the control
 156                  * context.
 157                  */
 158                 curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
 159                 /*
 160                  * Remove the remaining kernel receive queues from
 161                  * the default list and add them to the receive list.
 162                  */
 163                 for (i = 0; i < dd->n_krcv_queues - 1; i++) {
 164                         cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
 165                         cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
 166                         curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
 167                         if (curr_cpu >= nr_cpu_ids)
 168                                 break;
 169                 }
 170         }
 171
 172         cpumask_copy(&info->proc.mask, cpu_online_mask);
 173 }
 174
 175 void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
 176 {
 177         kfree(dd->affinity);
 178 }
 179
 180 int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 181 {
 182         int ret;
 183         cpumask_var_t diff;
 184         struct cpu_mask_set *set;
 185         struct sdma_engine *sde = NULL;
 186         struct hfi1_ctxtdata *rcd = NULL;
 187         char extra[64];
 188         int cpu = -1;
 189
 190         extra[0] = '\0';
 191         cpumask_clear(&msix->mask);
 192
 193         ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
 194         if (!ret)
 195                 return -ENOMEM;
 196
 197         switch (msix->type) {
 198         case IRQ_SDMA:
 199                 sde = (struct sdma_engine *)msix->arg;
 200                 scnprintf(extra, 64, "engine %u", sde->this_idx);
 201                 /* fall through */
 202         case IRQ_GENERAL:
 203                 set = &dd->affinity->def_intr;
 204                 break;
 205         case IRQ_RCVCTXT:
 206                 rcd = (struct hfi1_ctxtdata *)msix->arg;
 207                 if (rcd->ctxt == HFI1_CTRL_CTXT) {
 208                         set = &dd->affinity->def_intr;
 209                         cpu = cpumask_first(&set->mask);
 210                 } else {
 211                         set = &dd->affinity->rcv_intr;
 212                 }
 213                 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
 214                 break;
 215         default:
 216                 dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
 217                 return -EINVAL;
 218         }
 219
 220         /*
 221          * The control receive context is placed on a particular CPU, which
 222          * is set above.  Skip accounting for it.  Everything else finds its
 223          * CPU here.
 224          */
 225         if (cpu == -1) {
 226                 spin_lock(&dd->affinity->lock);
 227                 if (cpumask_equal(&set->mask, &set->used)) {
 228                         /*
 229                          * We've used up all the CPUs, bump up the generation
 230                          * and reset the 'used' map
 231                          */
 232                         set->gen++;
 233                         cpumask_clear(&set->used);
 234                 }
 235                 cpumask_andnot(diff, &set->mask, &set->used);
 236                 cpu = cpumask_first(diff);
 237                 cpumask_set_cpu(cpu, &set->used);
 238                 spin_unlock(&dd->affinity->lock);
 239         }
 240
 241         switch (msix->type) {
 242         case IRQ_SDMA:
 243                 sde->cpu = cpu;
 244                 break;
 245         case IRQ_GENERAL:
 246         case IRQ_RCVCTXT:
 247         case IRQ_OTHER:
 248                 break;
 249         }
 250
 251         cpumask_set_cpu(cpu, &msix->mask);
 252         dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
 253                     msix->msix.vector, irq_type_names[msix->type],
 254                     extra, cpu);
 255         irq_set_affinity_hint(msix->msix.vector, &msix->mask);
 256
 257         free_cpumask_var(diff);
 258         return 0;
 259 }
 260
 261 void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
 262                            struct hfi1_msix_entry *msix)
 263 {
 264         struct cpu_mask_set *set = NULL;
 265         struct hfi1_ctxtdata *rcd;
 266
 267         switch (msix->type) {
 268         case IRQ_SDMA:
 269         case IRQ_GENERAL:
 270                 set = &dd->affinity->def_intr;
 271                 break;
 272         case IRQ_RCVCTXT:
 273                 rcd = (struct hfi1_ctxtdata *)msix->arg;
 274                 /* only do accounting for non control contexts */
 275                 if (rcd->ctxt != HFI1_CTRL_CTXT)
 276                         set = &dd->affinity->rcv_intr;
 277                 break;
 278         default:
 279                 return;
 280         }
 281
 282         if (set) {
 283                 spin_lock(&dd->affinity->lock);
 284                 cpumask_andnot(&set->used, &set->used, &msix->mask);
 285                 if (cpumask_empty(&set->used) && set->gen) {
 286                         set->gen--;
 287                         cpumask_copy(&set->used, &set->mask);
 288                 }
 289                 spin_unlock(&dd->affinity->lock);
 290         }
 291
 292         irq_set_affinity_hint(msix->msix.vector, NULL);
 293         cpumask_clear(&msix->mask);
 294 }
 295
 296 int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 297 {
 298         int cpu = -1, ret;
 299         cpumask_var_t diff, mask, intrs;
 300         const struct cpumask *node_mask,
 301                 *proc_mask = tsk_cpus_allowed(current);
 302         struct cpu_mask_set *set = &dd->affinity->proc;
 303
 304         /*
 305          * check whether process/context affinity has already
 306          * been set
 307          */
 308         if (cpumask_weight(proc_mask) == 1) {
 309                 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
 310                           current->pid, current->comm,
 311                           cpumask_pr_args(proc_mask));
 312                 /*
 313                  * Mark the pre-set CPU as used. This is atomic so we don't
 314                  * need the lock
 315                  */
 316                 cpu = cpumask_first(proc_mask);
 317                 cpumask_set_cpu(cpu, &set->used);
 318                 goto done;
 319         } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
 320                 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
 321                           current->pid, current->comm,
 322                           cpumask_pr_args(proc_mask));
 323                 goto done;
 324         }
 325
 326         /*
 327          * The process does not have a preset CPU affinity so find one to
 328          * recommend. We prefer CPUs on the same NUMA as the device.
 329          */
 330
 331         ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
 332         if (!ret)
 333                 goto done;
 334         ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
 335         if (!ret)
 336                 goto free_diff;
 337         ret = zalloc_cpumask_var(&intrs, GFP_KERNEL);
 338         if (!ret)
 339                 goto free_mask;
 340
 341         spin_lock(&dd->affinity->lock);
 342         /*
 343          * If we've used all available CPUs, clear the mask and start
 344          * overloading.
 345          */
 346         if (cpumask_equal(&set->mask, &set->used)) {
 347                 set->gen++;
 348                 cpumask_clear(&set->used);
 349         }
 350
 351         /* CPUs used by interrupt handlers */
 352         cpumask_copy(intrs, (dd->affinity->def_intr.gen ?
 353                              &dd->affinity->def_intr.mask :
 354                              &dd->affinity->def_intr.used));
 355         cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ?
 356                                   &dd->affinity->rcv_intr.mask :
 357                                   &dd->affinity->rcv_intr.used));
 358         hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
 359                   cpumask_pr_args(intrs));
 360
 361         /*
 362          * If we don't have a NUMA node requested, preference is towards
 363          * device NUMA node
 364          */
 365         if (node == -1)
 366                 node = dd->node;
 367         node_mask = cpumask_of_node(node);
 368         hfi1_cdbg(PROC, "device on NUMA %u, CPUs %*pbl", node,
 369                   cpumask_pr_args(node_mask));
 370
 371         /* diff will hold all unused cpus */
 372         cpumask_andnot(diff, &set->mask, &set->used);
 373         hfi1_cdbg(PROC, "unused CPUs (all) %*pbl", cpumask_pr_args(diff));
 374
 375         /* get cpumask of available CPUs on preferred NUMA */
 376         cpumask_and(mask, diff, node_mask);
 377         hfi1_cdbg(PROC, "available cpus on NUMA %*pbl", cpumask_pr_args(mask));
 378
 379         /*
 380          * At first, we don't want to place processes on the same
 381          * CPUs as interrupt handlers.
 382          */
 383         cpumask_andnot(diff, mask, intrs);
 384         if (!cpumask_empty(diff))
 385                 cpumask_copy(mask, diff);
 386
 387         /*
 388          * if we don't have a cpu on the preferred NUMA, get
 389          * the list of the remaining available CPUs
 390          */
 391         if (cpumask_empty(mask)) {
 392                 cpumask_andnot(diff, &set->mask, &set->used);
 393                 cpumask_andnot(mask, diff, node_mask);
 394         }
 395         hfi1_cdbg(PROC, "possible CPUs for process %*pbl",
 396                   cpumask_pr_args(mask));
 397
 398         cpu = cpumask_first(mask);
 399         if (cpu >= nr_cpu_ids) /* empty */
 400                 cpu = -1;
 401         else
 402                 cpumask_set_cpu(cpu, &set->used);
 403         spin_unlock(&dd->affinity->lock);
 404
 405         free_cpumask_var(intrs);
 406 free_mask:
 407         free_cpumask_var(mask);
 408 free_diff:
 409         free_cpumask_var(diff);
 410 done:
 411         return cpu;
 412 }
 413
 414 void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu)
 415 {
 416         struct cpu_mask_set *set = &dd->affinity->proc;
 417
 418         if (cpu < 0)
 419                 return;
 420         spin_lock(&dd->affinity->lock);
 421         cpumask_clear_cpu(cpu, &set->used);
 422         if (cpumask_empty(&set->used) && set->gen) {
 423                 set->gen--;
 424                 cpumask_copy(&set->used, &set->mask);
 425         }
 426         spin_unlock(&dd->affinity->lock);
 427 }
 428