1 /*
2  * Copyright(c) 2015-2017 Intel Corporation.
3  *
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
5  * redistributing this file, you may do so under either license.
6  *
7  * GPL LICENSE SUMMARY
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of version 2 of the GNU General Public License as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * BSD LICENSE
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  *  - Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  *  - Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  *  - Neither the name of Intel Corporation nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45  *
46  */
47 #include <asm/page.h>
48 #include <linux/string.h>
49
50 #include "user_exp_rcv.h"
51 #include "trace.h"
52 #include "mmu_rb.h"
53
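/*
 * One hardware group of RcvArray entries: @base is the first RcvArray
 * index in the group, @size the number of entries in it, @used how many
 * of them are currently programmed, and @map a bitmask of the entries
 * within the group that are in use.
 */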
54 struct tid_group {
55         struct list_head list;
56         u32 base;
57         u8 size;
58         u8 used;
59         u8 map;
60 };
61
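/*
 * Per-chunk state for one programmed RcvArray entry: the pinned pages
 * backing it, its DMA mapping, and the MMU notifier node used to
 * invalidate it when the user mapping goes away.
 */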
62 struct tid_rb_node {
63         struct mmu_rb_node mmu;
64         unsigned long phys;
65         struct tid_group *grp;
66         u32 rcventry;
67         dma_addr_t dma_addr;
68         bool freed;
69         unsigned npages;
70         struct page *pages[0];
71 };
72
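/* A physically contiguous chunk: @count pages starting at pages[@idx]. */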
73 struct tid_pageset {
74         u16 idx;
75         u16 count;
76 };
77
78 #define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
79
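/*
 * Number of pages spanned by the user buffer [vaddr, vaddr + len).
 * For example, with 4 KiB pages, vaddr = 0x10000f00 and len = 0x2200
 * touch pages 0x10000000 through 0x10003000, so the macro yields 4.
 */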
80 #define num_user_pages(vaddr, len)                                     \
81         (1 + (((((unsigned long)(vaddr) +                              \
82                  (unsigned long)(len) - 1) & PAGE_MASK) -              \
83                ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
84
85 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
86                             struct exp_tid_set *set,
87                             struct hfi1_filedata *fd);
88 static u32 find_phys_blocks(struct page **pages, unsigned npages,
89                             struct tid_pageset *list);
90 static int set_rcvarray_entry(struct hfi1_filedata *fd, unsigned long vaddr,
91                               u32 rcventry, struct tid_group *grp,
92                               struct page **pages, unsigned npages);
93 static int tid_rb_insert(void *arg, struct mmu_rb_node *node);
94 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
95                                     struct tid_rb_node *tnode);
96 static void tid_rb_remove(void *arg, struct mmu_rb_node *node);
97 static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
98 static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr,
99                             struct tid_group *grp, struct tid_pageset *sets,
100                             unsigned start, u16 count, struct page **pages,
101                             u32 *tidlist, unsigned *tididx, unsigned *pmapped);
102 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
103                               struct tid_group **grp);
104 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
105
106 static struct mmu_rb_ops tid_rb_ops = {
107         .insert = tid_rb_insert,
108         .remove = tid_rb_remove,
109         .invalidate = tid_rb_invalidate
110 };
111
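/*
 * Encode an RcvArray entry offset as the IDX/CTRL pair used in TID info
 * words: entries are paired, IDX selects the pair and CTRL is 1 for the
 * even entry or 2 for the odd entry of that pair.
 */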
112 static inline u32 rcventry2tidinfo(u32 rcventry)
113 {
114         u32 pair = rcventry & ~0x1;
115
116         return EXP_TID_SET(IDX, pair >> 1) |
117                 EXP_TID_SET(CTRL, 1 << (rcventry - pair));
118 }
119
120 static inline void exp_tid_group_init(struct exp_tid_set *set)
121 {
122         INIT_LIST_HEAD(&set->list);
123         set->count = 0;
124 }
125
126 static inline void tid_group_remove(struct tid_group *grp,
127                                     struct exp_tid_set *set)
128 {
129         list_del_init(&grp->list);
130         set->count--;
131 }
132
133 static inline void tid_group_add_tail(struct tid_group *grp,
134                                       struct exp_tid_set *set)
135 {
136         list_add_tail(&grp->list, &set->list);
137         set->count++;
138 }
139
140 static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
141 {
142         struct tid_group *grp =
143                 list_first_entry(&set->list, struct tid_group, list);
144         list_del_init(&grp->list);
145         set->count--;
146         return grp;
147 }
148
149 static inline void tid_group_move(struct tid_group *group,
150                                   struct exp_tid_set *s1,
151                                   struct exp_tid_set *s2)
152 {
153         tid_group_remove(group, s1);
154         tid_group_add_tail(group, s2);
155 }
156
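/*
 * Carve the context's expected RcvArray entries into groups of
 * rcv_entries.group_size and queue them all on tid_group_list, ready to
 * be handed out by hfi1_user_exp_rcv_setup().
 */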
157 int hfi1_user_exp_rcv_grp_init(struct hfi1_filedata *fd)
158 {
159         struct hfi1_ctxtdata *uctxt = fd->uctxt;
160         struct hfi1_devdata *dd = fd->dd;
161         u32 tidbase;
162         u32 i;
163         struct tid_group *grp, *gptr;
164
165         exp_tid_group_init(&uctxt->tid_group_list);
166         exp_tid_group_init(&uctxt->tid_used_list);
167         exp_tid_group_init(&uctxt->tid_full_list);
168
169         tidbase = uctxt->expected_base;
170         for (i = 0; i < uctxt->expected_count /
171                      dd->rcv_entries.group_size; i++) {
172                 grp = kzalloc(sizeof(*grp), GFP_KERNEL);
173                 if (!grp)
174                         goto grp_failed;
175
176                 grp->size = dd->rcv_entries.group_size;
177                 grp->base = tidbase;
178                 tid_group_add_tail(grp, &uctxt->tid_group_list);
179                 tidbase += dd->rcv_entries.group_size;
180         }
181
182         return 0;
183
184 grp_failed:
185         list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
186                                  list) {
187                 list_del_init(&grp->list);
188                 kfree(grp);
189         }
190
191         return -ENOMEM;
192 }
193
194 /*
195  * Initialize context and file private data needed for Expected
196  * receive caching. This needs to be done after the context has
197  * been configured with the eager/expected RcvEntry counts.
198  */
199 int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd)
200 {
201         struct hfi1_ctxtdata *uctxt = fd->uctxt;
202         struct hfi1_devdata *dd = uctxt->dd;
203         int ret = 0;
204
205         spin_lock_init(&fd->tid_lock);
206         spin_lock_init(&fd->invalid_lock);
207
208         fd->entry_to_rb = kcalloc(uctxt->expected_count,
209                                   sizeof(struct rb_node *),
210                                   GFP_KERNEL);
211         if (!fd->entry_to_rb)
212                 return -ENOMEM;
213
214         if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
215                 fd->invalid_tid_idx = 0;
216                 fd->invalid_tids = kcalloc(uctxt->expected_count,
217                                            sizeof(*fd->invalid_tids),
218                                            GFP_KERNEL);
219                 if (!fd->invalid_tids) {
220                         kfree(fd->entry_to_rb);
221                         fd->entry_to_rb = NULL;
222                         return -ENOMEM;
223                 }
224
225                 /*
226                  * Register MMU notifier callbacks. If the registration
227                  * fails, continue without TID caching for this context.
228                  */
229                 ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
230                                            dd->pport->hfi1_wq,
231                                            &fd->handler);
232                 if (ret) {
233                         dd_dev_info(dd,
234                                     "Failed MMU notifier registration %d\n",
235                                     ret);
236                         ret = 0;
237                 }
238         }
239
240         /*
241          * PSM does not have a good way to separate, count, and
242          * effectively enforce a limit on RcvArray entries used by
243          * subctxts (when context sharing is used) when TID caching
244          * is enabled. To help with that, we calculate a per-process
245          * RcvArray entry share and enforce that.
246          * If TID caching is not in use, PSM deals with usage on its
247          * own. In that case, we allow any subctxt to take all of the
248          * entries.
249          *
250          * Make sure that we set the tid counts only after successful
251          * init.
252          */
253         spin_lock(&fd->tid_lock);
254         if (uctxt->subctxt_cnt && fd->handler) {
255                 u16 remainder;
256
257                 fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
258                 remainder = uctxt->expected_count % uctxt->subctxt_cnt;
259                 if (remainder && fd->subctxt < remainder)
260                         fd->tid_limit++;
261         } else {
262                 fd->tid_limit = uctxt->expected_count;
263         }
264         spin_unlock(&fd->tid_lock);
265
266         return ret;
267 }
268
269 void hfi1_user_exp_rcv_grp_free(struct hfi1_ctxtdata *uctxt)
270 {
271         struct tid_group *grp, *gptr;
272
273         list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
274                                  list) {
275                 list_del_init(&grp->list);
276                 kfree(grp);
277         }
278         hfi1_clear_tids(uctxt);
279 }
280
281 void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
282 {
283         struct hfi1_ctxtdata *uctxt = fd->uctxt;
284
285         /*
286  * The notifier would have been removed when the process's mm
287          * was freed.
288          */
289         if (fd->handler) {
290                 hfi1_mmu_rb_unregister(fd->handler);
291         } else {
292                 if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
293                         unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
294                 if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
295                         unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
296         }
297
298         kfree(fd->invalid_tids);
299         fd->invalid_tids = NULL;
300
301         kfree(fd->entry_to_rb);
302         fd->entry_to_rb = NULL;
303 }
304
305 /*
306  * Write an "empty" RcvArray entry.
307  * This function exists so the TID registration code can use it
308  * to write to unused/unneeded entries and still take advantage
309  * of the WC performance improvements. The HFI will ignore this
310  * write to the RcvArray entry.
311  */
312 static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
313 {
314         /*
315          * Doing the WC fill writes only makes sense if the device is
316          * present and the RcvArray has been mapped as WC memory.
317          */
318         if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
319                 writeq(0, dd->rcvarray_wc + (index * 8));
320 }
321
322 /*
323  * RcvArray entry allocation for Expected Receives is done by the
324  * following algorithm:
325  *
326  * The context keeps 3 lists of groups of RcvArray entries:
327  *   1. List of empty groups - tid_group_list
328  *      This list is created during user context creation and
329  *      contains elements which describe sets (of 8) of empty
330  *      RcvArray entries.
331  *   2. List of partially used groups - tid_used_list
332  *      This list contains sets of RcvArray entries which are
333  *      not completely used up. Another mapping request could
334  *      use some or all of the remaining entries.
335  *   3. List of full groups - tid_full_list
336  *      This is the list where sets that are completely used
337  *      up go.
338  *
339  * An attempt to optimize the usage of RcvArray entries is
340  * made by finding all sets of physically contiguous pages in a
341  * user's buffer.
342  * These physically contiguous sets are further split into
343  * sizes supported by the receive engine of the HFI. The
344  * resulting sets of pages are stored in struct tid_pageset,
345  * which describes the sets as:
346  *    * .count - number of pages in this set
347  *    * .idx - starting index into struct page ** array
348  *                    of this set
349  *
350  * From this point on, the algorithm deals with the page sets
351  * described above. The number of pagesets is divided by the
352  * RcvArray group size to produce the number of full groups
353  * needed.
354  *
355  * Groups from the 3 lists are manipulated using the following
356  * rules:
357  *   1. For each set of 8 pagesets, a complete group from
358  *      tid_group_list is taken, programmed, and moved to
359  *      the tid_full_list list.
360  *   2. For all remaining pagesets:
361  *      2.1 If the tid_used_list is empty and the tid_group_list
362  *          is empty, stop processing pagesets and return only
363  *          what has been programmed up to this point.
364  *      2.2 If the tid_used_list is empty and the tid_group_list
365  *          is not empty, move a group from tid_group_list to
366  *          tid_used_list.
367  *      2.3 For each group in tid_used_list, program as much as
368  *          can fit into the group. If the group becomes fully
369  *          used, move it to tid_full_list.
370  */
371 int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
372                             struct hfi1_tid_info *tinfo)
373 {
374         int ret = 0, need_group = 0, pinned;
375         struct hfi1_ctxtdata *uctxt = fd->uctxt;
376         struct hfi1_devdata *dd = uctxt->dd;
377         unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
378                 tididx = 0, mapped, mapped_pages = 0;
379         unsigned long vaddr = tinfo->vaddr;
380         struct page **pages = NULL;
381         u32 *tidlist = NULL;
382         struct tid_pageset *pagesets = NULL;
383
384         /* Get the number of pages the user buffer spans */
385         npages = num_user_pages(vaddr, tinfo->length);
386         if (!npages)
387                 return -EINVAL;
388
389         if (npages > uctxt->expected_count) {
390                 dd_dev_err(dd, "Expected buffer too big\n");
391                 return -EINVAL;
392         }
393
394         /* Verify that access is OK for the user buffer */
395         if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
396                        npages * PAGE_SIZE)) {
397                 dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
398                            (void *)vaddr, npages);
399                 return -EFAULT;
400         }
401
402         pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
403                            GFP_KERNEL);
404         if (!pagesets)
405                 return -ENOMEM;
406
407         /* Allocate the array of struct page pointers needed for pinning */
408         pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
409         if (!pages) {
410                 ret = -ENOMEM;
411                 goto bail;
412         }
413
414         /*
415          * Pin all the pages of the user buffer. If we can't pin all the
416          * pages, accept the amount pinned so far and program only that.
417          * User space knows how to deal with partially programmed buffers.
418          */
419         if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
420                 ret = -ENOMEM;
421                 goto bail;
422         }
423
424         pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
425         if (pinned <= 0) {
426                 ret = pinned;
427                 goto bail;
428         }
429         fd->tid_n_pinned += pinned;
430
431         /* Find sets of physically contiguous pages */
432         npagesets = find_phys_blocks(pages, pinned, pagesets);
433
434         /*
435          * We don't need to access this under a lock since tid_used is per
436          * process and the same process cannot be in hfi1_user_exp_rcv_clear()
437          * and hfi1_user_exp_rcv_setup() at the same time.
438          */
439         spin_lock(&fd->tid_lock);
440         if (fd->tid_used + npagesets > fd->tid_limit)
441                 pageset_count = fd->tid_limit - fd->tid_used;
442         else
443                 pageset_count = npagesets;
444         spin_unlock(&fd->tid_lock);
445
446         if (!pageset_count)
447                 goto bail;
448
449         ngroups = pageset_count / dd->rcv_entries.group_size;
450         tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
451         if (!tidlist) {
452                 ret = -ENOMEM;
453                 goto nomem;
454         }
455
456         tididx = 0;
457
458         /*
459          * From this point on, we are going to be using shared (between master
460          * and subcontexts) context resources. We need to take the lock.
461          */
462         mutex_lock(&uctxt->exp_lock);
463         /*
464          * The first step is to program the RcvArray entries which are complete
465          * groups.
466          */
467         while (ngroups && uctxt->tid_group_list.count) {
468                 struct tid_group *grp =
469                         tid_group_pop(&uctxt->tid_group_list);
470
471                 ret = program_rcvarray(fd, vaddr, grp, pagesets,
472                                        pageidx, dd->rcv_entries.group_size,
473                                        pages, tidlist, &tididx, &mapped);
474                 /*
475                  * If there was a failure to program the RcvArray
476                  * entries for the entire group, reset the grp fields
477                  * and add the grp back to the free group list.
478                  */
479                 if (ret <= 0) {
480                         tid_group_add_tail(grp, &uctxt->tid_group_list);
481                         hfi1_cdbg(TID,
482                                   "Failed to program RcvArray group %d", ret);
483                         goto unlock;
484                 }
485
486                 tid_group_add_tail(grp, &uctxt->tid_full_list);
487                 ngroups--;
488                 pageidx += ret;
489                 mapped_pages += mapped;
490         }
491
492         while (pageidx < pageset_count) {
493                 struct tid_group *grp, *ptr;
494                 /*
495                  * If we don't have any partially used tid groups, check
496                  * if we have empty groups. If so, take one from there and
497                  * put in the partially used list.
498                  */
499                 if (!uctxt->tid_used_list.count || need_group) {
500                         if (!uctxt->tid_group_list.count)
501                                 goto unlock;
502
503                         grp = tid_group_pop(&uctxt->tid_group_list);
504                         tid_group_add_tail(grp, &uctxt->tid_used_list);
505                         need_group = 0;
506                 }
507                 /*
508                  * There is an optimization opportunity here - instead of
509                  * fitting as many page sets as we can, check for a group
510                  * later on in the list that could fit all of them.
511                  */
512                 list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
513                                          list) {
514                         unsigned use = min_t(unsigned, pageset_count - pageidx,
515                                              grp->size - grp->used);
516
517                         ret = program_rcvarray(fd, vaddr, grp, pagesets,
518                                                pageidx, use, pages, tidlist,
519                                                &tididx, &mapped);
520                         if (ret < 0) {
521                                 hfi1_cdbg(TID,
522                                           "Failed to program RcvArray entries %d",
523                                           ret);
524                                 ret = -EFAULT;
525                                 goto unlock;
526                         } else if (ret > 0) {
527                                 if (grp->used == grp->size)
528                                         tid_group_move(grp,
529                                                        &uctxt->tid_used_list,
530                                                        &uctxt->tid_full_list);
531                                 pageidx += ret;
532                                 mapped_pages += mapped;
533                                 need_group = 0;
534                                 /* Check if we are done so we break out early */
535                                 if (pageidx >= pageset_count)
536                                         break;
537                         } else if (WARN_ON(ret == 0)) {
538                                 /*
539                                  * If ret is 0, we did not program any entries
540                                  * into this group, which can only happen if
541                                  * we've screwed up the accounting somewhere.
542                                  * Warn and try to continue.
543                                  */
544                                 need_group = 1;
545                         }
546                 }
547         }
548 unlock:
549         mutex_unlock(&uctxt->exp_lock);
550 nomem:
551         hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
552                   mapped_pages, ret);
553         if (tididx) {
554                 spin_lock(&fd->tid_lock);
555                 fd->tid_used += tididx;
556                 spin_unlock(&fd->tid_lock);
557                 tinfo->tidcnt = tididx;
558                 tinfo->length = mapped_pages * PAGE_SIZE;
559
560                 if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
561                                  tidlist, sizeof(tidlist[0]) * tididx)) {
562                         /*
563                          * On failure to copy to the user level, we need to undo
564                          * everything done so far so we don't leak resources.
565                          */
566                         tinfo->tidlist = (unsigned long)&tidlist;
567                         hfi1_user_exp_rcv_clear(fd, tinfo);
568                         tinfo->tidlist = 0;
569                         ret = -EFAULT;
570                         goto bail;
571                 }
572         }
573
574         /*
575          * If not everything was mapped (due to insufficient RcvArray entries,
576  * for example), unpin all unmapped pages so we can pin them next time.
577          */
578         if (mapped_pages != pinned) {
579                 hfi1_release_user_pages(fd->mm, &pages[mapped_pages],
580                                         pinned - mapped_pages,
581                                         false);
582                 fd->tid_n_pinned -= pinned - mapped_pages;
583         }
584 bail:
585         kfree(pagesets);
586         kfree(pages);
587         kfree(tidlist);
588         return ret > 0 ? 0 : ret;
589 }
590
591 int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
592                             struct hfi1_tid_info *tinfo)
593 {
594         int ret = 0;
595         struct hfi1_ctxtdata *uctxt = fd->uctxt;
596         u32 *tidinfo;
597         unsigned tididx;
598
599         if (unlikely(tinfo->tidcnt > fd->tid_used))
600                 return -EINVAL;
601
602         tidinfo = memdup_user((void __user *)(unsigned long)tinfo->tidlist,
603                               sizeof(tidinfo[0]) * tinfo->tidcnt);
604         if (IS_ERR(tidinfo))
605                 return PTR_ERR(tidinfo);
606
607         mutex_lock(&uctxt->exp_lock);
608         for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
609                 ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
610                 if (ret) {
611                         hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
612                                   ret);
613                         break;
614                 }
615         }
616         spin_lock(&fd->tid_lock);
617         fd->tid_used -= tididx;
618         spin_unlock(&fd->tid_lock);
619         tinfo->tidcnt = tididx;
620         mutex_unlock(&uctxt->exp_lock);
621
622         kfree(tidinfo);
623         return ret;
624 }
625
626 int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
627                               struct hfi1_tid_info *tinfo)
628 {
629         struct hfi1_ctxtdata *uctxt = fd->uctxt;
630         unsigned long *ev = uctxt->dd->events +
631                 (((uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) *
632                   HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
633         u32 *array;
634         int ret = 0;
635
636         if (!fd->invalid_tids)
637                 return -EINVAL;
638
639         /*
640          * copy_to_user() can sleep, which will leave the invalid_lock
641          * locked and cause the MMU notifier to be blocked on the lock
642          * for a long time.
643          * Copy the data to a local buffer so we can release the lock.
644          */
645         array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
646         if (!array)
647                 return -EFAULT;
648
649         spin_lock(&fd->invalid_lock);
650         if (fd->invalid_tid_idx) {
651                 memcpy(array, fd->invalid_tids, sizeof(*array) *
652                        fd->invalid_tid_idx);
653                 memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
654                        fd->invalid_tid_idx);
655                 tinfo->tidcnt = fd->invalid_tid_idx;
656                 fd->invalid_tid_idx = 0;
657                 /*
658                  * Reset the user flag while still holding the lock.
659                  * Otherwise, PSM can miss events.
660                  */
661                 clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
662         } else {
663                 tinfo->tidcnt = 0;
664         }
665         spin_unlock(&fd->invalid_lock);
666
667         if (tinfo->tidcnt) {
668                 if (copy_to_user((void __user *)tinfo->tidlist,
669                                  array, sizeof(*array) * tinfo->tidcnt))
670                         ret = -EFAULT;
671         }
672         kfree(array);
673
674         return ret;
675 }
676
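/*
 * Scan the pinned pages for runs of physically contiguous pages and
 * split each run into chunks the hardware can handle (power-of-two
 * sized, at most MAX_EXPECTED_BUFFER). Each chunk is recorded in 'list'
 * and the number of resulting pagesets is returned.
 */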
677 static u32 find_phys_blocks(struct page **pages, unsigned npages,
678                             struct tid_pageset *list)
679 {
680         unsigned pagecount, pageidx, setcount = 0, i;
681         unsigned long pfn, this_pfn;
682
683         if (!npages)
684                 return 0;
685
686         /*
687          * Look for sets of physically contiguous pages in the user buffer.
688          * This will allow us to optimize Expected RcvArray entry usage by
689          * using the bigger supported sizes.
690          */
691         pfn = page_to_pfn(pages[0]);
692         for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
693                 this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
694
695                 /*
696                  * If the PFNs are not sequential, pages are not physically
697                  * contiguous.
698                  */
699                 if (this_pfn != ++pfn) {
700                         /*
701                          * At this point we have to loop over the set of
702                          * physically contiguous pages and break them down into
703                          * sizes supported by the HW.
704                          * There are two main constraints:
705                          *     1. The max buffer size is MAX_EXPECTED_BUFFER.
706                          *        If the total set size is bigger than that,
707                          *        program only a MAX_EXPECTED_BUFFER chunk.
708                          *     2. The buffer size has to be a power of two. If
709                          *        it is not, round down to the closest power of
710                          *        2 and program that size.
711                          */
712                         while (pagecount) {
713                                 int maxpages = pagecount;
714                                 u32 bufsize = pagecount * PAGE_SIZE;
715
716                                 if (bufsize > MAX_EXPECTED_BUFFER)
717                                         maxpages =
718                                                 MAX_EXPECTED_BUFFER >>
719                                                 PAGE_SHIFT;
720                                 else if (!is_power_of_2(bufsize))
721                                         maxpages =
722                                                 rounddown_pow_of_two(bufsize) >>
723                                                 PAGE_SHIFT;
724
725                                 list[setcount].idx = pageidx;
726                                 list[setcount].count = maxpages;
727                                 pagecount -= maxpages;
728                                 pageidx += maxpages;
729                                 setcount++;
730                         }
731                         pageidx = i;
732                         pagecount = 1;
733                         pfn = this_pfn;
734                 } else {
735                         pagecount++;
736                 }
737         }
738         return setcount;
739 }
740
741 /**
742  * program_rcvarray() - program an RcvArray group with receive buffers
743  * @fd: filedata pointer
744  * @vaddr: starting user virtual address
745  * @grp: RcvArray group
746  * @sets: array of struct tid_pageset holding information on physically
747  *        contiguous chunks from the user buffer
748  * @start: starting index into sets array
749  * @count: number of struct tid_pageset's to program
750  * @pages: an array of struct page * for the user buffer
751  * @tidlist: the array of u32 elements where the information about the
752  *           programmed RcvArray entries is to be encoded.
753  * @tididx: starting offset into tidlist
754  * @pmapped: (output parameter) number of pages programmed into the RcvArray
755  *           entries.
756  *
757  * This function will program up to 'count' number of RcvArray entries from the
758  * group 'grp'. To make best use of write-combining writes, the function will
759  * perform writes to the unused RcvArray entries which will be ignored by the
760  * HW. Each RcvArray entry will be programmed with a physically contiguous
761  * buffer chunk from the user's virtual buffer.
762  *
763  * Return:
764  * -EINVAL if the requested count is larger than the size of the group,
765  * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
766  * number of RcvArray entries programmed.
767  */
768 static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr,
769                             struct tid_group *grp,
770                             struct tid_pageset *sets,
771                             unsigned start, u16 count, struct page **pages,
772                             u32 *tidlist, unsigned *tididx, unsigned *pmapped)
773 {
774         struct hfi1_ctxtdata *uctxt = fd->uctxt;
775         struct hfi1_devdata *dd = uctxt->dd;
776         u16 idx;
777         u32 tidinfo = 0, rcventry, useidx = 0;
778         int mapped = 0;
779
780         /* Count should never be larger than the group size */
781         if (count > grp->size)
782                 return -EINVAL;
783
784         /* Find the first unused entry in the group */
785         for (idx = 0; idx < grp->size; idx++) {
786                 if (!(grp->map & (1 << idx))) {
787                         useidx = idx;
788                         break;
789                 }
790                 rcv_array_wc_fill(dd, grp->base + idx);
791         }
792
793         idx = 0;
794         while (idx < count) {
795                 u16 npages, pageidx, setidx = start + idx;
796                 int ret = 0;
797
798                 /*
799                  * If this entry in the group is used, move to the next one.
800                  * If we go past the end of the group, exit the loop.
801                  */
802                 if (useidx >= grp->size) {
803                         break;
804                 } else if (grp->map & (1 << useidx)) {
805                         rcv_array_wc_fill(dd, grp->base + useidx);
806                         useidx++;
807                         continue;
808                 }
809
810                 rcventry = grp->base + useidx;
811                 npages = sets[setidx].count;
812                 pageidx = sets[setidx].idx;
813
814                 ret = set_rcvarray_entry(fd, vaddr + (pageidx * PAGE_SIZE),
815                                          rcventry, grp, pages + pageidx,
816                                          npages);
817                 if (ret)
818                         return ret;
819                 mapped += npages;
820
821                 tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
822                         EXP_TID_SET(LEN, npages);
823                 tidlist[(*tididx)++] = tidinfo;
824                 grp->used++;
825                 grp->map |= 1 << useidx++;
826                 idx++;
827         }
828
829         /* Fill the rest of the group with "blank" writes */
830         for (; useidx < grp->size; useidx++)
831                 rcv_array_wc_fill(dd, grp->base + useidx);
832         *pmapped = mapped;
833         return idx;
834 }
835
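/*
 * Program a single RcvArray entry: allocate a tracking node, DMA map
 * the physically contiguous chunk, insert the node into the TID cache
 * (or the lookup table when caching is off), and hand the mapping to
 * the hardware via hfi1_put_tid().
 */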
836 static int set_rcvarray_entry(struct hfi1_filedata *fd, unsigned long vaddr,
837                               u32 rcventry, struct tid_group *grp,
838                               struct page **pages, unsigned npages)
839 {
840         int ret;
841         struct hfi1_ctxtdata *uctxt = fd->uctxt;
842         struct tid_rb_node *node;
843         struct hfi1_devdata *dd = uctxt->dd;
844         dma_addr_t phys;
845
846         /*
847          * Allocate the node first so we can handle a potential
848          * failure before we've programmed anything.
849          */
850         node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
851                        GFP_KERNEL);
852         if (!node)
853                 return -ENOMEM;
854
855         phys = pci_map_single(dd->pcidev,
856                               __va(page_to_phys(pages[0])),
857                               npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
858         if (dma_mapping_error(&dd->pcidev->dev, phys)) {
859                 dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
860                            phys);
861                 kfree(node);
862                 return -EFAULT;
863         }
864
865         node->mmu.addr = vaddr;
866         node->mmu.len = npages * PAGE_SIZE;
867         node->phys = page_to_phys(pages[0]);
868         node->npages = npages;
869         node->rcventry = rcventry;
870         node->dma_addr = phys;
871         node->grp = grp;
872         node->freed = false;
873         memcpy(node->pages, pages, sizeof(struct page *) * npages);
874
875         if (!fd->handler)
876                 ret = tid_rb_insert(fd, &node->mmu);
877         else
878                 ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
879
880         if (ret) {
881                 hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
882                           node->rcventry, node->mmu.addr, node->phys, ret);
883                 pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
884                                  PCI_DMA_FROMDEVICE);
885                 kfree(node);
886                 return -EFAULT;
887         }
888         hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
889         trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
890                                node->mmu.addr, node->phys, phys);
891         return 0;
892 }
893
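/*
 * Decode a TID info word back into its RcvArray entry, validate it
 * against what was programmed for this file, and remove the matching
 * node either directly or through the MMU RB cache.
 */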
894 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
895                               struct tid_group **grp)
896 {
897         struct hfi1_ctxtdata *uctxt = fd->uctxt;
898         struct hfi1_devdata *dd = uctxt->dd;
899         struct tid_rb_node *node;
900         u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
901         u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
902
903         if (tididx >= uctxt->expected_count) {
904                 dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
905                            tididx, uctxt->ctxt);
906                 return -EINVAL;
907         }
908
909         if (tidctrl == 0x3)
910                 return -EINVAL;
911
912         rcventry = tididx + (tidctrl - 1);
913
914         node = fd->entry_to_rb[rcventry];
915         if (!node || node->rcventry != (uctxt->expected_base + rcventry))
916                 return -EBADF;
917
918         if (grp)
919                 *grp = node->grp;
920
921         if (!fd->handler)
922                 cacheless_tid_rb_remove(fd, node);
923         else
924                 hfi1_mmu_rb_remove(fd->handler, &node->mmu);
925
926         return 0;
927 }
928
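/*
 * Tear down one programmed entry: invalidate the RcvArray entry, unmap
 * and unpin the pages, and move the group back toward the used/free
 * lists as its occupancy drops.
 */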
929 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
930 {
931         struct hfi1_ctxtdata *uctxt = fd->uctxt;
932         struct hfi1_devdata *dd = uctxt->dd;
933
934         trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
935                                  node->npages, node->mmu.addr, node->phys,
936                                  node->dma_addr);
937
938         hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
939         /*
940          * Make sure device has seen the write before we unpin the
941          * pages.
942          */
943         flush_wc();
944
945         pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
946                          PCI_DMA_FROMDEVICE);
947         hfi1_release_user_pages(fd->mm, node->pages, node->npages, true);
948         fd->tid_n_pinned -= node->npages;
949
950         node->grp->used--;
951         node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
952
953         if (node->grp->used == node->grp->size - 1)
954                 tid_group_move(node->grp, &uctxt->tid_full_list,
955                                &uctxt->tid_used_list);
956         else if (!node->grp->used)
957                 tid_group_move(node->grp, &uctxt->tid_used_list,
958                                &uctxt->tid_group_list);
959         kfree(node);
960 }
961
962 /*
963  * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
964  * clearing nodes in the non-cached case.
965  */
966 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
967                             struct exp_tid_set *set,
968                             struct hfi1_filedata *fd)
969 {
970         struct tid_group *grp, *ptr;
971         int i;
972
973         list_for_each_entry_safe(grp, ptr, &set->list, list) {
974                 list_del_init(&grp->list);
975
976                 for (i = 0; i < grp->size; i++) {
977                         if (grp->map & (1 << i)) {
978                                 u16 rcventry = grp->base + i;
979                                 struct tid_rb_node *node;
980
981                                 node = fd->entry_to_rb[rcventry -
982                                                           uctxt->expected_base];
983                                 if (!node || node->rcventry != rcventry)
984                                         continue;
985
986                                 cacheless_tid_rb_remove(fd, node);
987                         }
988                 }
989         }
990 }
991
992 /*
993  * Always return 0 from this function.  A non-zero return indicates that the
994  * remove operation will be called and that memory should be unpinned.
995  * However, the driver cannot unpin out from under PSM.  Instead, retain the
996  * memory (by returning 0) and inform PSM that the memory is going away.  PSM
997  * will call back later when it has removed the memory from its list.
998  */
999 static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
1000 {
1001         struct hfi1_filedata *fdata = arg;
1002         struct hfi1_ctxtdata *uctxt = fdata->uctxt;
1003         struct tid_rb_node *node =
1004                 container_of(mnode, struct tid_rb_node, mmu);
1005
1006         if (node->freed)
1007                 return 0;
1008
1009         trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
1010                                  node->rcventry, node->npages, node->dma_addr);
1011         node->freed = true;
1012
1013         spin_lock(&fdata->invalid_lock);
1014         if (fdata->invalid_tid_idx < uctxt->expected_count) {
1015                 fdata->invalid_tids[fdata->invalid_tid_idx] =
1016                         rcventry2tidinfo(node->rcventry - uctxt->expected_base);
1017                 fdata->invalid_tids[fdata->invalid_tid_idx] |=
1018                         EXP_TID_SET(LEN, node->npages);
1019                 if (!fdata->invalid_tid_idx) {
1020                         unsigned long *ev;
1021
1022                         /*
1023                          * hfi1_set_uevent_bits() sets a user event flag
1024                          * for all processes. Because calling into the
1025                          * driver to process TID cache invalidations is
1026                          * expensive and TID cache invalidations are
1027                          * handled on a per-process basis, we can
1028                          * optimize this to set the flag only for the
1029                          * process in question.
1030                          */
1031                         ev = uctxt->dd->events +
1032                           (((uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) *
1033                             HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
1034                         set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
1035                 }
1036                 fdata->invalid_tid_idx++;
1037         }
1038         spin_unlock(&fdata->invalid_lock);
1039         return 0;
1040 }
1041
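/*
 * Record a node in the file's entry_to_rb lookup table, indexed by
 * RcvArray entry, so unprogram_rcvarray() can find it later.
 */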
1042 static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
1043 {
1044         struct hfi1_filedata *fdata = arg;
1045         struct tid_rb_node *tnode =
1046                 container_of(node, struct tid_rb_node, mmu);
1047         u32 base = fdata->uctxt->expected_base;
1048
1049         fdata->entry_to_rb[tnode->rcventry - base] = tnode;
1050         return 0;
1051 }
1052
1053 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
1054                                     struct tid_rb_node *tnode)
1055 {
1056         u32 base = fdata->uctxt->expected_base;
1057
1058         fdata->entry_to_rb[tnode->rcventry - base] = NULL;
1059         clear_tid_node(fdata, tnode);
1060 }
1061
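/* MMU RB remove callback: drop the lookup entry and free the node. */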
1062 static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
1063 {
1064         struct hfi1_filedata *fdata = arg;
1065         struct tid_rb_node *tnode =
1066                 container_of(node, struct tid_rb_node, mmu);
1067
1068         cacheless_tid_rb_remove(fdata, tnode);
1069 }