]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - drivers/iommu/intel-iommu.c
Merge remote-tracking branch 'dwmw2-iommu/master'
[karo-tx-linux.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/io.h>
38 #include <linux/iova.h>
39 #include <linux/iommu.h>
40 #include <linux/intel-iommu.h>
41 #include <linux/syscore_ops.h>
42 #include <linux/tboot.h>
43 #include <linux/dmi.h>
44 #include <linux/pci-ats.h>
45 #include <linux/memblock.h>
46 #include <linux/dma-contiguous.h>
47 #include <linux/crash_dump.h>
48 #include <asm/irq_remapping.h>
49 #include <asm/cacheflush.h>
50 #include <asm/iommu.h>
51
52 #include "irq_remapping.h"
53
/* Sizes, in bytes, of one root table and one context table (one VT-d page). */
#define ROOT_SIZE               VTD_PAGE_SIZE
#define CONTEXT_SIZE            VTD_PAGE_SIZE

/*
 * PCI device classification helpers.  'pdev' is a pointer to a PCI device.
 * The argument is parenthesized at every use so that any pointer-valued
 * expression (e.g. "&dev" or "devs + 1") expands correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61
/* Bounds of the 0xfee00000-0xfeefffff window (IOAPIC range, per the name). */
#define IOAPIC_RANGE_START              (0xfee00000)
#define IOAPIC_RANGE_END                (0xfeefffff)
/* Lowest IO virtual address (byte address). */
#define IOVA_START_ADDR                 (0x1000)

/* Guest address width requested by iommu_calculate_agaw(). */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH    48

/* Upper bound on the address width, and the same bound expressed in PFN bits. */
#define MAX_AGAW_WIDTH                  64
#define MAX_AGAW_PFN_WIDTH              (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70
/*
 * Highest page frame number / byte address representable with a guest
 * address width of 'gaw' bits.  'gaw' is parenthesized so that expression
 * arguments (e.g. a conditional) are evaluated as a unit.
 */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79
/* First page frame number handed out as an IO virtual address. */
#define IOVA_START_PFN  (1)

/* Convert a byte address to an MM-page frame number, plus two common limits. */
#define IOVA_PFN(addr)  ((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN   IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN   IOVA_PFN(DMA_BIT_MASK(64))

/* Page table handling: each level indexes LEVEL_STRIDE bits of the PFN. */
#define LEVEL_STRIDE    (9)
#define LEVEL_MASK      (((u64)1 << LEVEL_STRIDE) - 1)
90
/*
 * Bitmap advertising to the IOMMU core which page sizes our hardware
 * supports.  The core uses it to split the physically contiguous regions
 * it maps into chunks of a supported size.
 *
 * Traditionally the IOMMU core handed us mappings directly, after making
 * sure the size was an order of a 4KiB page and that the mapping was
 * naturally aligned.
 *
 * To retain that behavior we currently advertise support for every page
 * size that is an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * this could be changed to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
108
/* Map an AGAW value to a page-table level count: agaw 0 yields 2 levels. */
static inline int agaw_to_level(int agaw)
{
        int level = agaw;

        level += 2;
        return level;
}
113
114 static inline int agaw_to_width(int agaw)
115 {
116         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
117 }
118
119 static inline int width_to_agaw(int width)
120 {
121         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
122 }
123
124 static inline unsigned int level_to_offset_bits(int level)
125 {
126         return (level - 1) * LEVEL_STRIDE;
127 }
128
129 static inline int pfn_level_offset(unsigned long pfn, int level)
130 {
131         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
132 }
133
/* PFN mask with all bits below the start of 'level' cleared. */
static inline unsigned long level_mask(int level)
{
        unsigned int shift = level_to_offset_bits(level);

        return ~0UL << shift;
}
138
/* Number of PFNs spanned by a single entry at 'level'. */
static inline unsigned long level_size(int level)
{
        unsigned int shift = level_to_offset_bits(level);

        return 1UL << shift;
}
143
/* Round 'pfn' up to the next boundary of a 'level' entry. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
        unsigned long span = level_size(level);
        unsigned long bumped = pfn + span - 1;

        return bumped & level_mask(level);
}
148
149 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
150 {
151         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
152 }
153
154 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
155    are never going to work. */
156 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
157 {
158         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 }
160
161 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
162 {
163         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
164 }
/* VT-d page frame number of the memory described by 'pg'. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        unsigned long mm_pfn = page_to_pfn(pg);

        return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number of the page backing kernel virtual address 'p'. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
        struct page *backing = virt_to_page(p);

        return page_to_dma_pfn(backing);
}
173
174 /* global iommu list, set NULL for ignored DMAR units */
175 static struct intel_iommu **g_iommus;
176
177 static void __init check_tylersburg_isoch(void);
178 static int rwbf_quirk;
179
180 /*
181  * set to 1 to panic kernel if can't successfully enable VT-d
182  * (used when kernel is launched w/ TXT)
183  */
184 static int force_on = 0;
185
186 /*
187  * 0: Present
188  * 1-11: Reserved
189  * 12-63: Context Ptr (12 - (haw-1))
190  * 64-127: Reserved
191  */
192 struct root_entry {
193         u64     lo;
194         u64     hi;
195 };
196 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
197
198 /*
199  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
200  * if marked present.
201  */
202 static phys_addr_t root_entry_lctp(struct root_entry *re)
203 {
204         if (!(re->lo & 1))
205                 return 0;
206
207         return re->lo & VTD_PAGE_MASK;
208 }
209
210 /*
211  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
212  * if marked present.
213  */
214 static phys_addr_t root_entry_uctp(struct root_entry *re)
215 {
216         if (!(re->hi & 1))
217                 return 0;
218
219         return re->hi & VTD_PAGE_MASK;
220 }
221 /*
222  * low 64 bits:
223  * 0: present
224  * 1: fault processing disable
225  * 2-3: translation type
226  * 12-63: address space root
227  * high 64 bits:
228  * 0-2: address width
229  * 3-6: aval
230  * 8-23: domain id
231  */
232 struct context_entry {
233         u64 lo;
234         u64 hi;
235 };
236
237 static inline void context_clear_pasid_enable(struct context_entry *context)
238 {
239         context->lo &= ~(1ULL << 11);
240 }
241
242 static inline bool context_pasid_enabled(struct context_entry *context)
243 {
244         return !!(context->lo & (1ULL << 11));
245 }
246
247 static inline void context_set_copied(struct context_entry *context)
248 {
249         context->hi |= (1ull << 3);
250 }
251
252 static inline bool context_copied(struct context_entry *context)
253 {
254         return !!(context->hi & (1ULL << 3));
255 }
256
257 static inline bool __context_present(struct context_entry *context)
258 {
259         return (context->lo & 1);
260 }
261
262 static inline bool context_present(struct context_entry *context)
263 {
264         return context_pasid_enabled(context) ?
265              __context_present(context) :
266              __context_present(context) && !context_copied(context);
267 }
268
269 static inline void context_set_present(struct context_entry *context)
270 {
271         context->lo |= 1;
272 }
273
274 static inline void context_set_fault_enable(struct context_entry *context)
275 {
276         context->lo &= (((u64)-1) << 2) | 1;
277 }
278
279 static inline void context_set_translation_type(struct context_entry *context,
280                                                 unsigned long value)
281 {
282         context->lo &= (((u64)-1) << 4) | 3;
283         context->lo |= (value & 3) << 2;
284 }
285
286 static inline void context_set_address_root(struct context_entry *context,
287                                             unsigned long value)
288 {
289         context->lo &= ~VTD_PAGE_MASK;
290         context->lo |= value & VTD_PAGE_MASK;
291 }
292
293 static inline void context_set_address_width(struct context_entry *context,
294                                              unsigned long value)
295 {
296         context->hi |= value & 7;
297 }
298
299 static inline void context_set_domain_id(struct context_entry *context,
300                                          unsigned long value)
301 {
302         context->hi |= (value & ((1 << 16) - 1)) << 8;
303 }
304
305 static inline int context_domain_id(struct context_entry *c)
306 {
307         return((c->hi >> 8) & 0xffff);
308 }
309
310 static inline void context_clear_entry(struct context_entry *context)
311 {
312         context->lo = 0;
313         context->hi = 0;
314 }
315
316 /*
317  * 0: readable
318  * 1: writable
319  * 2-6: reserved
320  * 7: super page
321  * 8-10: available
322  * 11: snoop behavior
323  * 12-63: Host physcial address
324  */
325 struct dma_pte {
326         u64 val;
327 };
328
329 static inline void dma_clear_pte(struct dma_pte *pte)
330 {
331         pte->val = 0;
332 }
333
334 static inline u64 dma_pte_addr(struct dma_pte *pte)
335 {
336 #ifdef CONFIG_64BIT
337         return pte->val & VTD_PAGE_MASK;
338 #else
339         /* Must have a full atomic 64-bit read */
340         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
341 #endif
342 }
343
344 static inline bool dma_pte_present(struct dma_pte *pte)
345 {
346         return (pte->val & 3) != 0;
347 }
348
349 static inline bool dma_pte_superpage(struct dma_pte *pte)
350 {
351         return (pte->val & DMA_PTE_LARGE_PAGE);
352 }
353
354 static inline int first_pte_in_page(struct dma_pte *pte)
355 {
356         return !((unsigned long)pte & ~VTD_PAGE_MASK);
357 }
358
/*
 * This domain is a static identity-mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/*
 * Values for dmar_domain.flags.
 *
 * DOMAIN_FLAG_VIRTUAL_MACHINE: the domain represents a virtual machine;
 * more than one device across iommus may be owned by one domain, e.g. a
 * kvm guest.
 *
 * DOMAIN_FLAG_STATIC_IDENTITY: the domain is si_domain, which contains
 * multiple devices.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
376
/*
 * Iterate 'idx' over every IOMMU index whose reference count in 'domain'
 * is non-zero.  'domain' is a pointer to a dmar_domain.
 *
 * The filter is written as "if (!cond) {} else" so that an "else"
 * following the loop body cannot attach to the macro's hidden "if", and
 * 'domain' is parenthesized so that pointer expressions expand correctly.
 */
#define for_each_domain_iommu(idx, domain)                      \
        for (idx = 0; idx < g_num_of_iommus; idx++)             \
                if (!(domain)->iommu_refcnt[idx]) {} else
380
381 struct dmar_domain {
382         int     nid;                    /* node id */
383
384         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
385                                         /* Refcount of devices per iommu */
386
387
388         u16             iommu_did[DMAR_UNITS_SUPPORTED];
389                                         /* Domain ids per IOMMU. Use u16 since
390                                          * domain ids are 16 bit wide according
391                                          * to VT-d spec, section 9.3 */
392
393         struct list_head devices;       /* all devices' list */
394         struct iova_domain iovad;       /* iova's that belong to this domain */
395
396         struct dma_pte  *pgd;           /* virtual address */
397         int             gaw;            /* max guest address width */
398
399         /* adjusted guest address width, 0 is level 2 30-bit */
400         int             agaw;
401
402         int             flags;          /* flags to find out type of domain */
403
404         int             iommu_coherency;/* indicate coherency of iommu access */
405         int             iommu_snooping; /* indicate snooping control feature*/
406         int             iommu_count;    /* reference count of iommu */
407         int             iommu_superpage;/* Level of superpages supported:
408                                            0 == 4KiB (no superpages), 1 == 2MiB,
409                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
410         u64             max_addr;       /* maximum mapped address */
411
412         struct iommu_domain domain;     /* generic domain data structure for
413                                            iommu core */
414 };
415
416 /* PCI domain-device relationship */
417 struct device_domain_info {
418         struct list_head link;  /* link to domain siblings */
419         struct list_head global; /* link to global list */
420         u8 bus;                 /* PCI bus number */
421         u8 devfn;               /* PCI devfn number */
422         u8 pasid_supported:3;
423         u8 pasid_enabled:1;
424         u8 pri_supported:1;
425         u8 pri_enabled:1;
426         u8 ats_supported:1;
427         u8 ats_enabled:1;
428         u8 ats_qdep;
429         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
430         struct intel_iommu *iommu; /* IOMMU used by this device */
431         struct dmar_domain *domain; /* pointer to domain */
432 };
433
434 struct dmar_rmrr_unit {
435         struct list_head list;          /* list of rmrr units   */
436         struct acpi_dmar_header *hdr;   /* ACPI header          */
437         u64     base_address;           /* reserved base address*/
438         u64     end_address;            /* reserved end address */
439         struct dmar_dev_scope *devices; /* target devices */
440         int     devices_cnt;            /* target device count */
441 };
442
443 struct dmar_atsr_unit {
444         struct list_head list;          /* list of ATSR units */
445         struct acpi_dmar_header *hdr;   /* ACPI header */
446         struct dmar_dev_scope *devices; /* target devices */
447         int devices_cnt;                /* target device count */
448         u8 include_all:1;               /* include all ports */
449 };
450
451 static LIST_HEAD(dmar_atsr_units);
452 static LIST_HEAD(dmar_rmrr_units);
453
454 #define for_each_rmrr_units(rmrr) \
455         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
456
457 static void flush_unmaps_timeout(unsigned long data);
458
459 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
460
/* Capacity of each array in struct deferred_flush_tables. */
#define HIGH_WATER_MARK 250

/*
 * Parallel arrays of queued entries: slot i pairs an iova with its domain
 * and a page freelist.  'next' is presumably the next free slot.
 */
struct deferred_flush_tables {
        int next;
        struct iova *iova[HIGH_WATER_MARK];
        struct dmar_domain *domain[HIGH_WATER_MARK];
        struct page *freelist[HIGH_WATER_MARK];
};
468
static struct deferred_flush_tables *deferred_flush;

/*
 * Count used as the exclusive upper bound when indexing g_iommus and the
 * per-domain iommu_refcnt[] array (see for_each_domain_iommu).
 */
static int g_num_of_iommus;

/* State for the deferred ("async") unmap flushing machinery. */
static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;
479
480 static void domain_exit(struct dmar_domain *domain);
481 static void domain_remove_dev_info(struct dmar_domain *domain);
482 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
483                                      struct device *dev);
484 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
485 static void domain_context_clear(struct intel_iommu *iommu,
486                                  struct device *dev);
487 static int domain_detach_iommu(struct dmar_domain *domain,
488                                struct intel_iommu *iommu);
489
/*
 * Boot default follows CONFIG_INTEL_IOMMU_DEFAULT_ON; the "intel_iommu="
 * options "on" and "off" override it.
 */
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Tunables; several are adjusted by intel_iommu_setup(). */
static int dmar_map_gfx = 1;            /* cleared by "igfx_off" */
static int dmar_forcedac;               /* set by "forcedac" */
static int intel_iommu_strict;          /* set by "strict" */
static int intel_iommu_superpage = 1;   /* cleared by "sp_off" */
static int intel_iommu_ecs = 1;         /* cleared by "ecs_off" */
static int intel_iommu_pasid28;         /* set by "pasid28" */
static int iommu_identity_mapping;      /* mask of IDENTMAP_* bits */

/* Bits for iommu_identity_mapping. */
#define IDENTMAP_ALL            1
#define IDENTMAP_GFX            2
#define IDENTMAP_AZALIA         4
510
/* Broadwell and Skylake have broken ECS support — normal so-called "second
 * level" translation of DMA requests-without-PASID doesn't actually happen
 * unless you also set the NESTE bit in an extended context-entry. Which of
 * course means that SVM doesn't work because it's trying to do nested
 * translation of the physical addresses it finds in the process page tables,
 * through the IOVA->phys mapping found in the "second level" page tables.
 *
 * The VT-d specification was retroactively changed to change the definition
 * of the capability bits and pretend that Broadwell/Skylake never happened...
 * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 * for some reason it was the PASID capability bit which was redefined (from
 * bit 28 on BDW/SKL to bit 40 in future).
 *
 * So our test for ECS needs to eschew those implementations which set the old
 * PASID capability bit 28, since those are the ones on which ECS is broken.
 * Unless we are working around the 'pasid28' limitations, that is, by putting
 * the device into passthrough mode for normal DMA and thus masking the bug.
 *
 * 'iommu' is a pointer to a struct intel_iommu; it is parenthesized at each
 * use so that pointer expressions can be passed safely.
 */
#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs((iommu)->ecap) && \
                            (intel_iommu_pasid28 || !ecap_broken_pasid((iommu)->ecap)))
/* PASID support is thus enabled if ECS is enabled and *either* of the old
 * or new capability bits are set. */
#define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
                              (ecap_pasid((iommu)->ecap) || ecap_broken_pasid((iommu)->ecap)))
535
536 int intel_iommu_gfx_mapped;
537 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
538
539 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
540 static DEFINE_SPINLOCK(device_domain_lock);
541 static LIST_HEAD(device_domain_list);
542
543 static const struct iommu_ops intel_iommu_ops;
544
545 static bool translation_pre_enabled(struct intel_iommu *iommu)
546 {
547         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
548 }
549
550 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
551 {
552         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
553 }
554
555 static void init_translation_status(struct intel_iommu *iommu)
556 {
557         u32 gsts;
558
559         gsts = readl(iommu->reg + DMAR_GSTS_REG);
560         if (gsts & DMA_GSTS_TES)
561                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
562 }
563
564 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
565 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
566 {
567         return container_of(dom, struct dmar_domain, domain);
568 }
569
570 static int __init intel_iommu_setup(char *str)
571 {
572         if (!str)
573                 return -EINVAL;
574         while (*str) {
575                 if (!strncmp(str, "on", 2)) {
576                         dmar_disabled = 0;
577                         pr_info("IOMMU enabled\n");
578                 } else if (!strncmp(str, "off", 3)) {
579                         dmar_disabled = 1;
580                         pr_info("IOMMU disabled\n");
581                 } else if (!strncmp(str, "igfx_off", 8)) {
582                         dmar_map_gfx = 0;
583                         pr_info("Disable GFX device mapping\n");
584                 } else if (!strncmp(str, "forcedac", 8)) {
585                         pr_info("Forcing DAC for PCI devices\n");
586                         dmar_forcedac = 1;
587                 } else if (!strncmp(str, "strict", 6)) {
588                         pr_info("Disable batched IOTLB flush\n");
589                         intel_iommu_strict = 1;
590                 } else if (!strncmp(str, "sp_off", 6)) {
591                         pr_info("Disable supported super page\n");
592                         intel_iommu_superpage = 0;
593                 } else if (!strncmp(str, "ecs_off", 7)) {
594                         printk(KERN_INFO
595                                 "Intel-IOMMU: disable extended context table support\n");
596                         intel_iommu_ecs = 0;
597                 } else if (!strncmp(str, "pasid28", 7)) {
598                         printk(KERN_INFO
599                                 "Intel-IOMMU: enable pre-production PASID support\n");
600                         intel_iommu_pasid28 = 1;
601                         iommu_identity_mapping |= IDENTMAP_GFX;
602                 }
603
604                 str += strcspn(str, ",");
605                 while (*str == ',')
606                         str++;
607         }
608         return 0;
609 }
610 __setup("intel_iommu=", intel_iommu_setup);
611
/* Slab caches behind alloc_domain_mem() and alloc_devinfo_mem(). */
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
614
615 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
616 {
617         struct dmar_domain **domains;
618         int idx = did >> 8;
619
620         domains = iommu->domains[idx];
621         if (!domains)
622                 return NULL;
623
624         return domains[did & 0xff];
625 }
626
627 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
628                              struct dmar_domain *domain)
629 {
630         struct dmar_domain **domains;
631         int idx = did >> 8;
632
633         if (!iommu->domains[idx]) {
634                 size_t size = 256 * sizeof(struct dmar_domain *);
635                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
636         }
637
638         domains = iommu->domains[idx];
639         if (WARN_ON(!domains))
640                 return;
641         else
642                 domains[did & 0xff] = domain;
643 }
644
645 static inline void *alloc_pgtable_page(int node)
646 {
647         struct page *page;
648         void *vaddr = NULL;
649
650         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
651         if (page)
652                 vaddr = page_address(page);
653         return vaddr;
654 }
655
/* Free a single page given by its kernel virtual address. */
static inline void free_pgtable_page(void *vaddr)
{
        unsigned long addr = (unsigned long)vaddr;

        free_page(addr);
}
660
661 static inline void *alloc_domain_mem(void)
662 {
663         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
664 }
665
666 static void free_domain_mem(void *vaddr)
667 {
668         kmem_cache_free(iommu_domain_cache, vaddr);
669 }
670
671 static inline void * alloc_devinfo_mem(void)
672 {
673         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
674 }
675
676 static inline void free_devinfo_mem(void *vaddr)
677 {
678         kmem_cache_free(iommu_devinfo_cache, vaddr);
679 }
680
681 static inline int domain_type_is_vm(struct dmar_domain *domain)
682 {
683         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
684 }
685
686 static inline int domain_type_is_si(struct dmar_domain *domain)
687 {
688         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
689 }
690
691 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
692 {
693         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
694                                 DOMAIN_FLAG_STATIC_IDENTITY);
695 }
696
697 static inline int domain_pfn_supported(struct dmar_domain *domain,
698                                        unsigned long pfn)
699 {
700         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
701
702         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
703 }
704
705 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
706 {
707         unsigned long sagaw;
708         int agaw = -1;
709
710         sagaw = cap_sagaw(iommu->cap);
711         for (agaw = width_to_agaw(max_gaw);
712              agaw >= 0; agaw--) {
713                 if (test_bit(agaw, &sagaw))
714                         break;
715         }
716
717         return agaw;
718 }
719
720 /*
721  * Calculate max SAGAW for each iommu.
722  */
723 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
724 {
725         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
726 }
727
728 /*
729  * calculate agaw for each iommu.
730  * "SAGAW" may be different across iommus, use a default agaw, and
731  * get a supported less agaw for iommus that don't support the default agaw.
732  */
733 int iommu_calculate_agaw(struct intel_iommu *iommu)
734 {
735         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
736 }
737
738 /* This functionin only returns single iommu in a domain */
739 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
740 {
741         int iommu_id;
742
743         /* si_domain and vm domain should not get here. */
744         BUG_ON(domain_type_is_vm_or_si(domain));
745         for_each_domain_iommu(iommu_id, domain)
746                 break;
747
748         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
749                 return NULL;
750
751         return g_iommus[iommu_id];
752 }
753
754 static void domain_update_iommu_coherency(struct dmar_domain *domain)
755 {
756         struct dmar_drhd_unit *drhd;
757         struct intel_iommu *iommu;
758         bool found = false;
759         int i;
760
761         domain->iommu_coherency = 1;
762
763         for_each_domain_iommu(i, domain) {
764                 found = true;
765                 if (!ecap_coherent(g_iommus[i]->ecap)) {
766                         domain->iommu_coherency = 0;
767                         break;
768                 }
769         }
770         if (found)
771                 return;
772
773         /* No hardware attached; use lowest common denominator */
774         rcu_read_lock();
775         for_each_active_iommu(iommu, drhd) {
776                 if (!ecap_coherent(iommu->ecap)) {
777                         domain->iommu_coherency = 0;
778                         break;
779                 }
780         }
781         rcu_read_unlock();
782 }
783
784 static int domain_update_iommu_snooping(struct intel_iommu *skip)
785 {
786         struct dmar_drhd_unit *drhd;
787         struct intel_iommu *iommu;
788         int ret = 1;
789
790         rcu_read_lock();
791         for_each_active_iommu(iommu, drhd) {
792                 if (iommu != skip) {
793                         if (!ecap_sc_support(iommu->ecap)) {
794                                 ret = 0;
795                                 break;
796                         }
797                 }
798         }
799         rcu_read_unlock();
800
801         return ret;
802 }
803
804 static int domain_update_iommu_superpage(struct intel_iommu *skip)
805 {
806         struct dmar_drhd_unit *drhd;
807         struct intel_iommu *iommu;
808         int mask = 0xf;
809
810         if (!intel_iommu_superpage) {
811                 return 0;
812         }
813
814         /* set iommu_superpage to the smallest common denominator */
815         rcu_read_lock();
816         for_each_active_iommu(iommu, drhd) {
817                 if (iommu != skip) {
818                         mask &= cap_super_page_val(iommu->cap);
819                         if (!mask)
820                                 break;
821                 }
822         }
823         rcu_read_unlock();
824
825         return fls(mask);
826 }
827
828 /* Some capabilities may be different across iommus */
829 static void domain_update_iommu_cap(struct dmar_domain *domain)
830 {
831         domain_update_iommu_coherency(domain);
832         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
833         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
834 }
835
836 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
837                                                        u8 bus, u8 devfn, int alloc)
838 {
839         struct root_entry *root = &iommu->root_entry[bus];
840         struct context_entry *context;
841         u64 *entry;
842
843         entry = &root->lo;
844         if (ecs_enabled(iommu)) {
845                 if (devfn >= 0x80) {
846                         devfn -= 0x80;
847                         entry = &root->hi;
848                 }
849                 devfn *= 2;
850         }
851         if (*entry & 1)
852                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
853         else {
854                 unsigned long phy_addr;
855                 if (!alloc)
856                         return NULL;
857
858                 context = alloc_pgtable_page(iommu->node);
859                 if (!context)
860                         return NULL;
861
862                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
863                 phy_addr = virt_to_phys((void *)context);
864                 *entry = phy_addr | 1;
865                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
866         }
867         return &context[devfn];
868 }
869
870 static int iommu_dummy(struct device *dev)
871 {
872         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
873 }
874
/*
 * Find the active IOMMU whose device scope covers @dev and report the
 * bus/devfn pair to use for it.
 *
 * @dev:   device to look up
 * @bus:   out: bus number, written only when an IOMMU is found
 * @devfn: out: device/function number, written only when an IOMMU is found
 *
 * Returns the matching IOMMU, or NULL when @dev carries the dummy marker
 * or no active DRHD unit matches (in which case *bus and *devfn are left
 * untouched).
 */
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
        struct dmar_drhd_unit *drhd = NULL;
        struct intel_iommu *iommu;
        struct device *tmp;
        struct pci_dev *ptmp, *pdev = NULL;
        u16 segment = 0;
        int i;

        if (iommu_dummy(dev))
                return NULL;

        if (dev_is_pci(dev)) {
                pdev = to_pci_dev(dev);
                segment = pci_domain_nr(pdev->bus);
        } else if (has_acpi_companion(dev))
                /* Non-PCI device: match on its ACPI companion's device. */
                dev = &ACPI_COMPANION(dev)->dev;

        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                /* For PCI devices only units on the same segment qualify. */
                if (pdev && segment != drhd->segment)
                        continue;

                for_each_active_dev_scope(drhd->devices,
                                          drhd->devices_cnt, i, tmp) {
                        /* Exact scope hit: take bus/devfn from the scope entry. */
                        if (tmp == dev) {
                                *bus = drhd->devices[i].bus;
                                *devfn = drhd->devices[i].devfn;
                                goto out;
                        }

                        if (!pdev || !dev_is_pci(tmp))
                                continue;

                        /*
                         * The scope entry is a bridge whose subordinate bus
                         * range contains the device's bus.
                         */
                        ptmp = to_pci_dev(tmp);
                        if (ptmp->subordinate &&
                            ptmp->subordinate->number <= pdev->bus->number &&
                            ptmp->subordinate->busn_res.end >= pdev->bus->number)
                                goto got_pdev;
                }

                /*
                 * An include_all unit claims any PCI device on its segment.
                 * The bridge match above also jumps here so that both cases
                 * report the device's own bus/devfn.
                 */
                if (pdev && drhd->include_all) {
                got_pdev:
                        *bus = pdev->bus->number;
                        *devfn = pdev->devfn;
                        goto out;
                }
        }
        /* No unit matched. */
        iommu = NULL;
 out:
        rcu_read_unlock();

        return iommu;
}
929
930 static void domain_flush_cache(struct dmar_domain *domain,
931                                void *addr, int size)
932 {
933         if (!domain->iommu_coherency)
934                 clflush_cache_range(addr, size);
935 }
936
937 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
938 {
939         struct context_entry *context;
940         int ret = 0;
941         unsigned long flags;
942
943         spin_lock_irqsave(&iommu->lock, flags);
944         context = iommu_context_addr(iommu, bus, devfn, 0);
945         if (context)
946                 ret = context_present(context);
947         spin_unlock_irqrestore(&iommu->lock, flags);
948         return ret;
949 }
950
951 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
952 {
953         struct context_entry *context;
954         unsigned long flags;
955
956         spin_lock_irqsave(&iommu->lock, flags);
957         context = iommu_context_addr(iommu, bus, devfn, 0);
958         if (context) {
959                 context_clear_entry(context);
960                 __iommu_flush_cache(iommu, context, sizeof(*context));
961         }
962         spin_unlock_irqrestore(&iommu->lock, flags);
963 }
964
965 static void free_context_table(struct intel_iommu *iommu)
966 {
967         int i;
968         unsigned long flags;
969         struct context_entry *context;
970
971         spin_lock_irqsave(&iommu->lock, flags);
972         if (!iommu->root_entry) {
973                 goto out;
974         }
975         for (i = 0; i < ROOT_ENTRY_NR; i++) {
976                 context = iommu_context_addr(iommu, i, 0, 0);
977                 if (context)
978                         free_pgtable_page(context);
979
980                 if (!ecs_enabled(iommu))
981                         continue;
982
983                 context = iommu_context_addr(iommu, i, 0x80, 0);
984                 if (context)
985                         free_pgtable_page(context);
986
987         }
988         free_pgtable_page(iommu->root_entry);
989         iommu->root_entry = NULL;
990 out:
991         spin_unlock_irqrestore(&iommu->lock, flags);
992 }
993
/*
 * Walk the domain's page table for @pfn, allocating missing intermediate
 * tables on the way down, and return the PTE reached.
 *
 * @domain:       domain whose page table (domain->pgd) is walked
 * @pfn:          page frame number to look up
 * @target_level: in/out.  A non-zero value is the level at which to stop.
 *                Zero means "stop at the first superpage or non-present
 *                entry"; the level actually reached is then written back.
 *
 * Returns the PTE at the level where the walk stopped, or NULL when @pfn
 * is not supported by the domain or a page-table page cannot be allocated.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn, int *target_level)
{
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
        int offset;

        BUG_ON(!domain->pgd);

        if (!domain_pfn_supported(domain, pfn))
                /* Address beyond IOMMU's addressing capabilities. */
                return NULL;

        parent = domain->pgd;

        while (1) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                /* Lookup-only mode: never allocate, stop at the first leaf or hole. */
                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
                        break;
                if (level == *target_level)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page(domain->nid);

                        if (!tmp_page)
                                return NULL;

                        /* Flush the new table before it is linked into the tree. */
                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        /* Install only if the slot is still empty (old value 0). */
                        if (cmpxchg64(&pte->val, 0ULL, pteval))
                                /* Someone else set it while we were thinking; use theirs. */
                                free_pgtable_page(tmp_page);
                        else
                                domain_flush_cache(domain, pte, sizeof(*pte));
                }
                if (level == 1)
                        break;

                /* Descend into the next-level table referenced by this entry. */
                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }

        if (!*target_level)
                *target_level = level;

        return pte;
}
1047
1048
1049 /* return address's pte at specific level */
1050 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1051                                          unsigned long pfn,
1052                                          int level, int *large_page)
1053 {
1054         struct dma_pte *parent, *pte = NULL;
1055         int total = agaw_to_level(domain->agaw);
1056         int offset;
1057
1058         parent = domain->pgd;
1059         while (level <= total) {
1060                 offset = pfn_level_offset(pfn, total);
1061                 pte = &parent[offset];
1062                 if (level == total)
1063                         return pte;
1064
1065                 if (!dma_pte_present(pte)) {
1066                         *large_page = total;
1067                         break;
1068                 }
1069
1070                 if (dma_pte_superpage(pte)) {
1071                         *large_page = total;
1072                         return pte;
1073                 }
1074
1075                 parent = phys_to_virt(dma_pte_addr(pte));
1076                 total--;
1077         }
1078         return NULL;
1079 }
1080
1081 /* clear last level pte, a tlb flush should be followed */
1082 static void dma_pte_clear_range(struct dmar_domain *domain,
1083                                 unsigned long start_pfn,
1084                                 unsigned long last_pfn)
1085 {
1086         unsigned int large_page = 1;
1087         struct dma_pte *first_pte, *pte;
1088
1089         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1090         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1091         BUG_ON(start_pfn > last_pfn);
1092
1093         /* we don't need lock here; nobody else touches the iova range */
1094         do {
1095                 large_page = 1;
1096                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1097                 if (!pte) {
1098                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1099                         continue;
1100                 }
1101                 do {
1102                         dma_clear_pte(pte);
1103                         start_pfn += lvl_to_nr_pages(large_page);
1104                         pte++;
1105                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1106
1107                 domain_flush_cache(domain, first_pte,
1108                                    (void *)pte - (void *)first_pte);
1109
1110         } while (start_pfn && start_pfn <= last_pfn);
1111 }
1112
/*
 * Recursively free the page-table pages below one table for the range
 * [@start_pfn, @last_pfn].
 *
 * @domain:    domain owning the page table
 * @level:     level of the table that @pte points into
 * @pte:       first entry of the table being scanned
 * @pfn:       pfn at which that table starts
 * @start_pfn: first pfn of the range being torn down
 * @last_pfn:  last pfn of the range being torn down
 *
 * Non-present and superpage entries are skipped.  A child table is freed
 * (and its parent entry cleared and flushed) only when the range spans
 * everything from level_pfn to level_pfn + level_size(level) - 1.
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
                               struct dma_pte *pte, unsigned long pfn,
                               unsigned long start_pfn, unsigned long last_pfn)
{
        /* Start scanning at the entry that covers the first pfn of interest. */
        pfn = max(start_pfn, pfn);
        pte = &pte[pfn_level_offset(pfn, level)];

        do {
                unsigned long level_pfn;
                struct dma_pte *level_pte;

                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
                        goto next;

                level_pfn = pfn & level_mask(level - 1);
                level_pte = phys_to_virt(dma_pte_addr(pte));

                /* Only levels above 2 have child tables that need their own pass. */
                if (level > 2)
                        dma_pte_free_level(domain, level - 1, level_pte,
                                           level_pfn, start_pfn, last_pfn);

                /* If range covers entire pagetable, free it */
                if (!(start_pfn > level_pfn ||
                      last_pfn < level_pfn + level_size(level) - 1)) {
                        dma_clear_pte(pte);
                        domain_flush_cache(domain, pte, sizeof(*pte));
                        free_pgtable_page(level_pte);
                }
next:
                pfn += level_size(level);
        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
1145
1146 /* free page table pages. last level pte should already be cleared */
1147 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1148                                    unsigned long start_pfn,
1149                                    unsigned long last_pfn)
1150 {
1151         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1152         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1153         BUG_ON(start_pfn > last_pfn);
1154
1155         dma_pte_clear_range(domain, start_pfn, last_pfn);
1156
1157         /* We don't need lock here; nobody else touches the iova range */
1158         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1159                            domain->pgd, 0, start_pfn, last_pfn);
1160
1161         /* free pgd */
1162         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1163                 free_pgtable_page(domain->pgd);
1164                 domain->pgd = NULL;
1165         }
1166 }
1167
1168 /* When a page at a given level is being unlinked from its parent, we don't
1169    need to *modify* it at all. All we need to do is make a list of all the
1170    pages which can be freed just as soon as we've flushed the IOTLB and we
1171    know the hardware page-walk will no longer touch them.
1172    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1173    be freed. */
1174 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1175                                             int level, struct dma_pte *pte,
1176                                             struct page *freelist)
1177 {
1178         struct page *pg;
1179
1180         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1181         pg->freelist = freelist;
1182         freelist = pg;
1183
1184         if (level == 1)
1185                 return freelist;
1186
1187         pte = page_address(pg);
1188         do {
1189                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1190                         freelist = dma_pte_list_pagetables(domain, level - 1,
1191                                                            pte, freelist);
1192                 pte++;
1193         } while (!first_pte_in_page(pte));
1194
1195         return freelist;
1196 }
1197
/*
 * Clear the entries of one page-table level that fall inside
 * [@start_pfn, @last_pfn] and collect the table pages that become
 * unreachable.
 *
 * @domain:    domain owning the page table
 * @level:     level of the table that @pte points into
 * @pte:       first entry of the table being scanned
 * @pfn:       pfn at which that table starts
 * @start_pfn: first pfn of the range being unmapped
 * @last_pfn:  last pfn of the range being unmapped
 * @freelist:  list of pages collected so far
 *
 * Entries fully covered by the range are cleared, and any table below them
 * is added to the list via dma_pte_list_pagetables().  Partially covered
 * entries above level 1 are handled by recursing one level down.  The
 * cleared span of entries is flushed once at the end.
 *
 * Returns the updated freelist.
 */
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
                                        struct dma_pte *pte, unsigned long pfn,
                                        unsigned long start_pfn,
                                        unsigned long last_pfn,
                                        struct page *freelist)
{
        struct dma_pte *first_pte = NULL, *last_pte = NULL;

        /* Start scanning at the entry that covers the first pfn of interest. */
        pfn = max(start_pfn, pfn);
        pte = &pte[pfn_level_offset(pfn, level)];

        do {
                unsigned long level_pfn;

                if (!dma_pte_present(pte))
                        goto next;

                level_pfn = pfn & level_mask(level);

                /* If range covers entire pagetable, free it */
                if (start_pfn <= level_pfn &&
                    last_pfn >= level_pfn + level_size(level) - 1) {
                        /* These subordinate page tables are going away entirely. Don't
                           bother to clear them; we're just going to *free* them. */
                        if (level > 1 && !dma_pte_superpage(pte))
                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

                        dma_clear_pte(pte);
                        /* Track the first and last cleared entry for the final flush. */
                        if (!first_pte)
                                first_pte = pte;
                        last_pte = pte;
                } else if (level > 1) {
                        /* Recurse down into a level that isn't *entirely* obsolete */
                        freelist = dma_pte_clear_level(domain, level - 1,
                                                       phys_to_virt(dma_pte_addr(pte)),
                                                       level_pfn, start_pfn, last_pfn,
                                                       freelist);
                }
next:
                pfn += level_size(level);
        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);

        /* Flush from the first cleared entry up to and including the last. */
        if (first_pte)
                domain_flush_cache(domain, first_pte,
                                   (void *)++last_pte - (void *)first_pte);

        return freelist;
}
1246
1247 /* We can't just free the pages because the IOMMU may still be walking
1248    the page tables, and may have cached the intermediate levels. The
1249    pages can only be freed after the IOTLB flush has been done. */
1250 static struct page *domain_unmap(struct dmar_domain *domain,
1251                                  unsigned long start_pfn,
1252                                  unsigned long last_pfn)
1253 {
1254         struct page *freelist = NULL;
1255
1256         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1257         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1258         BUG_ON(start_pfn > last_pfn);
1259
1260         /* we don't need lock here; nobody else touches the iova range */
1261         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1262                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1263
1264         /* free pgd */
1265         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1266                 struct page *pgd_page = virt_to_page(domain->pgd);
1267                 pgd_page->freelist = freelist;
1268                 freelist = pgd_page;
1269
1270                 domain->pgd = NULL;
1271         }
1272
1273         return freelist;
1274 }
1275
1276 static void dma_free_pagelist(struct page *freelist)
1277 {
1278         struct page *pg;
1279
1280         while ((pg = freelist)) {
1281                 freelist = pg->freelist;
1282                 free_pgtable_page(page_address(pg));
1283         }
1284 }
1285
1286 /* iommu handling */
1287 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1288 {
1289         struct root_entry *root;
1290         unsigned long flags;
1291
1292         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1293         if (!root) {
1294                 pr_err("Allocating root entry for %s failed\n",
1295                         iommu->name);
1296                 return -ENOMEM;
1297         }
1298
1299         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1300
1301         spin_lock_irqsave(&iommu->lock, flags);
1302         iommu->root_entry = root;
1303         spin_unlock_irqrestore(&iommu->lock, flags);
1304
1305         return 0;
1306 }
1307
1308 static void iommu_set_root_entry(struct intel_iommu *iommu)
1309 {
1310         u64 addr;
1311         u32 sts;
1312         unsigned long flag;
1313
1314         addr = virt_to_phys(iommu->root_entry);
1315         if (ecs_enabled(iommu))
1316                 addr |= DMA_RTADDR_RTT;
1317
1318         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1319         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1320
1321         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1322
1323         /* Make sure hardware complete it */
1324         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1325                       readl, (sts & DMA_GSTS_RTPS), sts);
1326
1327         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1328 }
1329
1330 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1331 {
1332         u32 val;
1333         unsigned long flag;
1334
1335         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1336                 return;
1337
1338         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1340
1341         /* Make sure hardware complete it */
1342         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1343                       readl, (!(val & DMA_GSTS_WBFS)), val);
1344
1345         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1346 }
1347
1348 /* return value determine if we need a write buffer flush */
1349 static void __iommu_flush_context(struct intel_iommu *iommu,
1350                                   u16 did, u16 source_id, u8 function_mask,
1351                                   u64 type)
1352 {
1353         u64 val = 0;
1354         unsigned long flag;
1355
1356         switch (type) {
1357         case DMA_CCMD_GLOBAL_INVL:
1358                 val = DMA_CCMD_GLOBAL_INVL;
1359                 break;
1360         case DMA_CCMD_DOMAIN_INVL:
1361                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1362                 break;
1363         case DMA_CCMD_DEVICE_INVL:
1364                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1365                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1366                 break;
1367         default:
1368                 BUG();
1369         }
1370         val |= DMA_CCMD_ICC;
1371
1372         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1373         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1374
1375         /* Make sure hardware complete it */
1376         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1377                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1378
1379         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1380 }
1381
/*
 * Invalidate the IOTLB through the register interface and wait for the
 * hardware to clear DMA_TLB_IVT.
 *
 * @iommu:      IOMMU to operate on
 * @did:        domain id, used by the domain- and page-selective flushes
 * @addr:       address for a page-selective flush (may carry the IH bit)
 * @size_order: size order OR-ed into the address for a page-selective flush
 * @type:       DMA_TLB_GLOBAL_FLUSH, DMA_TLB_DSI_FLUSH or DMA_TLB_PSI_FLUSH;
 *              any other value hits BUG()
 *
 * Nothing is returned.  A granularity of 0 reported back by the hardware is
 * logged as an error, and a granularity that differs from the requested one
 * is logged at debug level.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
                                u64 addr, unsigned int size_order, u64 type)
{
        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
        u64 val = 0, val_iva = 0;
        unsigned long flag;

        switch (type) {
        case DMA_TLB_GLOBAL_FLUSH:
                /* global flush doesn't need set IVA_REG */
                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
                break;
        case DMA_TLB_DSI_FLUSH:
                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                break;
        case DMA_TLB_PSI_FLUSH:
                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                /* IH bit is passed in as part of address */
                val_iva = size_order | addr;
                break;
        default:
                BUG();
        }
        /* Note: set drain read/write */
#if 0
        /*
         * This is probably to be super secure.. Looks like we can
         * ignore it without any impact.
         */
        if (cap_read_drain(iommu->cap))
                val |= DMA_TLB_READ_DRAIN;
#endif
        if (cap_write_drain(iommu->cap))
                val |= DMA_TLB_WRITE_DRAIN;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        /* Note: Only uses first TLB reg currently */
        /* The address register is written only when val_iva is non-zero. */
        if (val_iva)
                dmar_writeq(iommu->reg + tlb_offset, val_iva);
        dmar_writeq(iommu->reg + tlb_offset + 8, val);

        /* Make sure hardware complete it */
        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
                dmar_readq, (!(val & DMA_TLB_IVT)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

        /* check IOTLB invalidation granularity */
        if (DMA_TLB_IAIG(val) == 0)
                pr_err("Flush IOTLB failed\n");
        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
                pr_debug("TLB flush request %Lx, actual %Lx\n",
                        (unsigned long long)DMA_TLB_IIRG(type),
                        (unsigned long long)DMA_TLB_IAIG(val));
}
1438
1439 static struct device_domain_info *
1440 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1441                          u8 bus, u8 devfn)
1442 {
1443         struct device_domain_info *info;
1444
1445         assert_spin_locked(&device_domain_lock);
1446
1447         if (!iommu->qi)
1448                 return NULL;
1449
1450         list_for_each_entry(info, &domain->devices, link)
1451                 if (info->iommu == iommu && info->bus == bus &&
1452                     info->devfn == devfn) {
1453                         if (info->ats_supported && info->dev)
1454                                 return info;
1455                         break;
1456                 }
1457
1458         return NULL;
1459 }
1460
1461 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1462 {
1463         struct pci_dev *pdev;
1464
1465         if (!info || !dev_is_pci(info->dev))
1466                 return;
1467
1468         pdev = to_pci_dev(info->dev);
1469
1470 #ifdef CONFIG_INTEL_IOMMU_SVM
1471         /* The PCIe spec, in its wisdom, declares that the behaviour of
1472            the device if you enable PASID support after ATS support is
1473            undefined. So always enable PASID support on devices which
1474            have it, even if we can't yet know if we're ever going to
1475            use it. */
1476         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1477                 info->pasid_enabled = 1;
1478
1479         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1480                 info->pri_enabled = 1;
1481 #endif
1482         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1483                 info->ats_enabled = 1;
1484                 info->ats_qdep = pci_ats_queue_depth(pdev);
1485         }
1486 }
1487
1488 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1489 {
1490         struct pci_dev *pdev;
1491
1492         if (dev_is_pci(info->dev))
1493                 return;
1494
1495         pdev = to_pci_dev(info->dev);
1496
1497         if (info->ats_enabled) {
1498                 pci_disable_ats(pdev);
1499                 info->ats_enabled = 0;
1500         }
1501 #ifdef CONFIG_INTEL_IOMMU_SVM
1502         if (info->pri_enabled) {
1503                 pci_disable_pri(pdev);
1504                 info->pri_enabled = 0;
1505         }
1506         if (info->pasid_enabled) {
1507                 pci_disable_pasid(pdev);
1508                 info->pasid_enabled = 0;
1509         }
1510 #endif
1511 }
1512
1513 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1514                                   u64 addr, unsigned mask)
1515 {
1516         u16 sid, qdep;
1517         unsigned long flags;
1518         struct device_domain_info *info;
1519
1520         spin_lock_irqsave(&device_domain_lock, flags);
1521         list_for_each_entry(info, &domain->devices, link) {
1522                 if (!info->ats_enabled)
1523                         continue;
1524
1525                 sid = info->bus << 8 | info->devfn;
1526                 qdep = info->ats_qdep;
1527                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1528         }
1529         spin_unlock_irqrestore(&device_domain_lock, flags);
1530 }
1531
1532 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1533                                   struct dmar_domain *domain,
1534                                   unsigned long pfn, unsigned int pages,
1535                                   int ih, int map)
1536 {
1537         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1538         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1539         u16 did = domain->iommu_did[iommu->seq_id];
1540
1541         BUG_ON(pages == 0);
1542
1543         if (ih)
1544                 ih = 1 << 6;
1545         /*
1546          * Fallback to domain selective flush if no PSI support or the size is
1547          * too big.
1548          * PSI requires page size to be 2 ^ x, and the base address is naturally
1549          * aligned to the size
1550          */
1551         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1552                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1553                                                 DMA_TLB_DSI_FLUSH);
1554         else
1555                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1556                                                 DMA_TLB_PSI_FLUSH);
1557
1558         /*
1559          * In caching mode, changes of pages from non-present to present require
1560          * flush. However, device IOTLB doesn't need to be flushed in this case.
1561          */
1562         if (!cap_caching_mode(iommu->cap) || !map)
1563                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1564                                       addr, mask);
1565 }
1566
1567 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1568 {
1569         u32 pmen;
1570         unsigned long flags;
1571
1572         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1573         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1574         pmen &= ~DMA_PMEN_EPM;
1575         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1576
1577         /* wait for the protected region status bit to clear */
1578         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1579                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1580
1581         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1582 }
1583
1584 static void iommu_enable_translation(struct intel_iommu *iommu)
1585 {
1586         u32 sts;
1587         unsigned long flags;
1588
1589         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1590         iommu->gcmd |= DMA_GCMD_TE;
1591         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1592
1593         /* Make sure hardware complete it */
1594         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1595                       readl, (sts & DMA_GSTS_TES), sts);
1596
1597         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1598 }
1599
1600 static void iommu_disable_translation(struct intel_iommu *iommu)
1601 {
1602         u32 sts;
1603         unsigned long flag;
1604
1605         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1606         iommu->gcmd &= ~DMA_GCMD_TE;
1607         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1608
1609         /* Make sure hardware complete it */
1610         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1611                       readl, (!(sts & DMA_GSTS_TES)), sts);
1612
1613         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1614 }
1615
1616
1617 static int iommu_init_domains(struct intel_iommu *iommu)
1618 {
1619         u32 ndomains, nlongs;
1620         size_t size;
1621
1622         ndomains = cap_ndoms(iommu->cap);
1623         pr_debug("%s: Number of Domains supported <%d>\n",
1624                  iommu->name, ndomains);
1625         nlongs = BITS_TO_LONGS(ndomains);
1626
1627         spin_lock_init(&iommu->lock);
1628
1629         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1630         if (!iommu->domain_ids) {
1631                 pr_err("%s: Allocating domain id array failed\n",
1632                        iommu->name);
1633                 return -ENOMEM;
1634         }
1635
1636         size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
1637         iommu->domains = kzalloc(size, GFP_KERNEL);
1638
1639         if (iommu->domains) {
1640                 size = 256 * sizeof(struct dmar_domain *);
1641                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1642         }
1643
1644         if (!iommu->domains || !iommu->domains[0]) {
1645                 pr_err("%s: Allocating domain array failed\n",
1646                        iommu->name);
1647                 kfree(iommu->domain_ids);
1648                 kfree(iommu->domains);
1649                 iommu->domain_ids = NULL;
1650                 iommu->domains    = NULL;
1651                 return -ENOMEM;
1652         }
1653
1654
1655
1656         /*
1657          * If Caching mode is set, then invalid translations are tagged
1658          * with domain-id 0, hence we need to pre-allocate it. We also
1659          * use domain-id 0 as a marker for non-allocated domain-id, so
1660          * make sure it is not used for a real domain.
1661          */
1662         set_bit(0, iommu->domain_ids);
1663
1664         return 0;
1665 }
1666
/*
 * Detach every device that sits behind @iommu from its domain and, if
 * translation is currently enabled (DMA_GCMD_TE set in iommu->gcmd),
 * disable it.  Does nothing when the IOMMU's domain bookkeeping was never
 * set up.
 *
 * NOTE(review): dmar_remove_one_dev_info() and domain_exit() are called
 * here with device_domain_lock held.  Their definitions are not visible in
 * this part of the file - confirm that neither of them tries to take
 * device_domain_lock itself.
 */
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
        struct device_domain_info *info, *tmp;
        unsigned long flags;

        if (!iommu->domains || !iommu->domain_ids)
                return;

        spin_lock_irqsave(&device_domain_lock, flags);
        /* _safe variant: entries may be unlinked while the list is walked. */
        list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
                struct dmar_domain *domain;

                if (info->iommu != iommu)
                        continue;

                if (!info->dev || !info->domain)
                        continue;

                /* Remember the domain; info is handed to the removal helper. */
                domain = info->domain;

                dmar_remove_one_dev_info(domain, info->dev);

                /* Domains flagged as VM or SI are not torn down here. */
                if (!domain_type_is_vm_or_si(domain))
                        domain_exit(domain);
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);

        if (iommu->gcmd & DMA_GCMD_TE)
                iommu_disable_translation(iommu);
}
1697
1698 static void free_dmar_iommu(struct intel_iommu *iommu)
1699 {
1700         if ((iommu->domains) && (iommu->domain_ids)) {
1701                 int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
1702                 int i;
1703
1704                 for (i = 0; i < elems; i++)
1705                         kfree(iommu->domains[i]);
1706                 kfree(iommu->domains);
1707                 kfree(iommu->domain_ids);
1708                 iommu->domains = NULL;
1709                 iommu->domain_ids = NULL;
1710         }
1711
1712         g_iommus[iommu->seq_id] = NULL;
1713
1714         /* free context mapping */
1715         free_context_table(iommu);
1716
1717 #ifdef CONFIG_INTEL_IOMMU_SVM
1718         if (pasid_enabled(iommu)) {
1719                 if (ecap_prs(iommu->ecap))
1720                         intel_svm_finish_prq(iommu);
1721                 intel_svm_free_pasid_tables(iommu);
1722         }
1723 #endif
1724 }
1725
1726 static struct dmar_domain *alloc_domain(int flags)
1727 {
1728         struct dmar_domain *domain;
1729
1730         domain = alloc_domain_mem();
1731         if (!domain)
1732                 return NULL;
1733
1734         memset(domain, 0, sizeof(*domain));
1735         domain->nid = -1;
1736         domain->flags = flags;
1737         INIT_LIST_HEAD(&domain->devices);
1738
1739         return domain;
1740 }
1741
1742 /* Must be called with iommu->lock */
1743 static int domain_attach_iommu(struct dmar_domain *domain,
1744                                struct intel_iommu *iommu)
1745 {
1746         unsigned long ndomains;
1747         int num;
1748
1749         assert_spin_locked(&device_domain_lock);
1750         assert_spin_locked(&iommu->lock);
1751
1752         domain->iommu_refcnt[iommu->seq_id] += 1;
1753         domain->iommu_count += 1;
1754         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1755                 ndomains = cap_ndoms(iommu->cap);
1756                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1757
1758                 if (num >= ndomains) {
1759                         pr_err("%s: No free domain ids\n", iommu->name);
1760                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1761                         domain->iommu_count -= 1;
1762                         return -ENOSPC;
1763                 }
1764
1765                 set_bit(num, iommu->domain_ids);
1766                 set_iommu_domain(iommu, num, domain);
1767
1768                 domain->iommu_did[iommu->seq_id] = num;
1769                 domain->nid                      = iommu->node;
1770
1771                 domain_update_iommu_cap(domain);
1772         }
1773
1774         return 0;
1775 }
1776
1777 static int domain_detach_iommu(struct dmar_domain *domain,
1778                                struct intel_iommu *iommu)
1779 {
1780         int num, count = INT_MAX;
1781
1782         assert_spin_locked(&device_domain_lock);
1783         assert_spin_locked(&iommu->lock);
1784
1785         domain->iommu_refcnt[iommu->seq_id] -= 1;
1786         count = --domain->iommu_count;
1787         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1788                 num = domain->iommu_did[iommu->seq_id];
1789                 clear_bit(num, iommu->domain_ids);
1790                 set_iommu_domain(iommu, num, NULL);
1791
1792                 domain_update_iommu_cap(domain);
1793                 domain->iommu_did[iommu->seq_id] = 0;
1794         }
1795
1796         return count;
1797 }
1798
1799 static struct iova_domain reserved_iova_list;
1800 static struct lock_class_key reserved_rbtree_key;
1801
1802 static int dmar_init_reserved_ranges(void)
1803 {
1804         struct pci_dev *pdev = NULL;
1805         struct iova *iova;
1806         int i;
1807
1808         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1809                         DMA_32BIT_PFN);
1810
1811         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1812                 &reserved_rbtree_key);
1813
1814         /* IOAPIC ranges shouldn't be accessed by DMA */
1815         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1816                 IOVA_PFN(IOAPIC_RANGE_END));
1817         if (!iova) {
1818                 pr_err("Reserve IOAPIC range failed\n");
1819                 return -ENODEV;
1820         }
1821
1822         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1823         for_each_pci_dev(pdev) {
1824                 struct resource *r;
1825
1826                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1827                         r = &pdev->resource[i];
1828                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1829                                 continue;
1830                         iova = reserve_iova(&reserved_iova_list,
1831                                             IOVA_PFN(r->start),
1832                                             IOVA_PFN(r->end));
1833                         if (!iova) {
1834                                 pr_err("Reserve iova failed\n");
1835                                 return -ENODEV;
1836                         }
1837                 }
1838         }
1839         return 0;
1840 }
1841
1842 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1843 {
1844         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1845 }
1846
/*
 * Round a guest address width up to the next value of the form 12 + 9 * k,
 * capping the result at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1860
1861 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1862                        int guest_width)
1863 {
1864         int adjust_width, agaw;
1865         unsigned long sagaw;
1866
1867         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1868                         DMA_32BIT_PFN);
1869         domain_reserve_special_ranges(domain);
1870
1871         /* calculate AGAW */
1872         if (guest_width > cap_mgaw(iommu->cap))
1873                 guest_width = cap_mgaw(iommu->cap);
1874         domain->gaw = guest_width;
1875         adjust_width = guestwidth_to_adjustwidth(guest_width);
1876         agaw = width_to_agaw(adjust_width);
1877         sagaw = cap_sagaw(iommu->cap);
1878         if (!test_bit(agaw, &sagaw)) {
1879                 /* hardware doesn't support it, choose a bigger one */
1880                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1881                 agaw = find_next_bit(&sagaw, 5, agaw);
1882                 if (agaw >= 5)
1883                         return -ENODEV;
1884         }
1885         domain->agaw = agaw;
1886
1887         if (ecap_coherent(iommu->ecap))
1888                 domain->iommu_coherency = 1;
1889         else
1890                 domain->iommu_coherency = 0;
1891
1892         if (ecap_sc_support(iommu->ecap))
1893                 domain->iommu_snooping = 1;
1894         else
1895                 domain->iommu_snooping = 0;
1896
1897         if (intel_iommu_superpage)
1898                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1899         else
1900                 domain->iommu_superpage = 0;
1901
1902         domain->nid = iommu->node;
1903
1904         /* always allocate the top pgd */
1905         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1906         if (!domain->pgd)
1907                 return -ENOMEM;
1908         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1909         return 0;
1910 }
1911
1912 static void domain_exit(struct dmar_domain *domain)
1913 {
1914         struct page *freelist = NULL;
1915
1916         /* Domain 0 is reserved, so dont process it */
1917         if (!domain)
1918                 return;
1919
1920         /* Flush any lazy unmaps that may reference this domain */
1921         if (!intel_iommu_strict)
1922                 flush_unmaps_timeout(0);
1923
1924         /* Remove associated devices and clear attached or cached domains */
1925         rcu_read_lock();
1926         domain_remove_dev_info(domain);
1927         rcu_read_unlock();
1928
1929         /* destroy iovas */
1930         put_iova_domain(&domain->iovad);
1931
1932         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1933
1934         dma_free_pagelist(freelist);
1935
1936         free_domain_mem(domain);
1937 }
1938
1939 static int domain_context_mapping_one(struct dmar_domain *domain,
1940                                       struct intel_iommu *iommu,
1941                                       u8 bus, u8 devfn)
1942 {
1943         u16 did = domain->iommu_did[iommu->seq_id];
1944         int translation = CONTEXT_TT_MULTI_LEVEL;
1945         struct device_domain_info *info = NULL;
1946         struct context_entry *context;
1947         unsigned long flags;
1948         struct dma_pte *pgd;
1949         int ret, agaw;
1950
1951         WARN_ON(did == 0);
1952
1953         if (hw_pass_through && domain_type_is_si(domain))
1954                 translation = CONTEXT_TT_PASS_THROUGH;
1955
1956         pr_debug("Set context mapping for %02x:%02x.%d\n",
1957                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1958
1959         BUG_ON(!domain->pgd);
1960
1961         spin_lock_irqsave(&device_domain_lock, flags);
1962         spin_lock(&iommu->lock);
1963
1964         ret = -ENOMEM;
1965         context = iommu_context_addr(iommu, bus, devfn, 1);
1966         if (!context)
1967                 goto out_unlock;
1968
1969         ret = 0;
1970         if (context_present(context))
1971                 goto out_unlock;
1972
1973         pgd = domain->pgd;
1974
1975         context_clear_entry(context);
1976         context_set_domain_id(context, did);
1977
1978         /*
1979          * Skip top levels of page tables for iommu which has less agaw
1980          * than default.  Unnecessary for PT mode.
1981          */
1982         if (translation != CONTEXT_TT_PASS_THROUGH) {
1983                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1984                         ret = -ENOMEM;
1985                         pgd = phys_to_virt(dma_pte_addr(pgd));
1986                         if (!dma_pte_present(pgd))
1987                                 goto out_unlock;
1988                 }
1989
1990                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1991                 if (info && info->ats_supported)
1992                         translation = CONTEXT_TT_DEV_IOTLB;
1993                 else
1994                         translation = CONTEXT_TT_MULTI_LEVEL;
1995
1996                 context_set_address_root(context, virt_to_phys(pgd));
1997                 context_set_address_width(context, iommu->agaw);
1998         } else {
1999                 /*
2000                  * In pass through mode, AW must be programmed to
2001                  * indicate the largest AGAW value supported by
2002                  * hardware. And ASR is ignored by hardware.
2003                  */
2004                 context_set_address_width(context, iommu->msagaw);
2005         }
2006
2007         context_set_translation_type(context, translation);
2008         context_set_fault_enable(context);
2009         context_set_present(context);
2010         domain_flush_cache(domain, context, sizeof(*context));
2011
2012         /*
2013          * It's a non-present to present mapping. If hardware doesn't cache
2014          * non-present entry we only need to flush the write-buffer. If the
2015          * _does_ cache non-present entries, then it does so in the special
2016          * domain #0, which we have to flush:
2017          */
2018         if (cap_caching_mode(iommu->cap)) {
2019                 iommu->flush.flush_context(iommu, 0,
2020                                            (((u16)bus) << 8) | devfn,
2021                                            DMA_CCMD_MASK_NOBIT,
2022                                            DMA_CCMD_DEVICE_INVL);
2023                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2024         } else {
2025                 iommu_flush_write_buffer(iommu);
2026         }
2027         iommu_enable_dev_iotlb(info);
2028
2029         ret = 0;
2030
2031 out_unlock:
2032         spin_unlock(&iommu->lock);
2033         spin_unlock_irqrestore(&device_domain_lock, flags);
2034
2035         return 0;
2036 }
2037
2038 struct domain_context_mapping_data {
2039         struct dmar_domain *domain;
2040         struct intel_iommu *iommu;
2041 };
2042
2043 static int domain_context_mapping_cb(struct pci_dev *pdev,
2044                                      u16 alias, void *opaque)
2045 {
2046         struct domain_context_mapping_data *data = opaque;
2047
2048         return domain_context_mapping_one(data->domain, data->iommu,
2049                                           PCI_BUS_NUM(alias), alias & 0xff);
2050 }
2051
2052 static int
2053 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2054 {
2055         struct intel_iommu *iommu;
2056         u8 bus, devfn;
2057         struct domain_context_mapping_data data;
2058
2059         iommu = device_to_iommu(dev, &bus, &devfn);
2060         if (!iommu)
2061                 return -ENODEV;
2062
2063         if (!dev_is_pci(dev))
2064                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2065
2066         data.domain = domain;
2067         data.iommu = iommu;
2068
2069         return pci_for_each_dma_alias(to_pci_dev(dev),
2070                                       &domain_context_mapping_cb, &data);
2071 }
2072
2073 static int domain_context_mapped_cb(struct pci_dev *pdev,
2074                                     u16 alias, void *opaque)
2075 {
2076         struct intel_iommu *iommu = opaque;
2077
2078         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2079 }
2080
2081 static int domain_context_mapped(struct device *dev)
2082 {
2083         struct intel_iommu *iommu;
2084         u8 bus, devfn;
2085
2086         iommu = device_to_iommu(dev, &bus, &devfn);
2087         if (!iommu)
2088                 return -ENODEV;
2089
2090         if (!dev_is_pci(dev))
2091                 return device_context_mapped(iommu, bus, devfn);
2092
2093         return !pci_for_each_dma_alias(to_pci_dev(dev),
2094                                        domain_context_mapped_cb, iommu);
2095 }
2096
2097 /* Returns a number of VTD pages, but aligned to MM page size */
2098 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2099                                             size_t size)
2100 {
2101         host_addr &= ~PAGE_MASK;
2102         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2103 }
2104
2105 /* Return largest possible superpage level for a given mapping */
2106 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2107                                           unsigned long iov_pfn,
2108                                           unsigned long phy_pfn,
2109                                           unsigned long pages)
2110 {
2111         int support, level = 1;
2112         unsigned long pfnmerge;
2113
2114         support = domain->iommu_superpage;
2115
2116         /* To use a large page, the virtual *and* physical addresses
2117            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2118            of them will mean we have to use smaller pages. So just
2119            merge them and check both at once. */
2120         pfnmerge = iov_pfn | phy_pfn;
2121
2122         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2123                 pages >>= VTD_STRIDE_SHIFT;
2124                 if (!pages)
2125                         break;
2126                 pfnmerge >>= VTD_STRIDE_SHIFT;
2127                 level++;
2128                 support--;
2129         }
2130         return level;
2131 }
2132
/*
 * Install page-table entries mapping @nr_pages VT-d pages starting at
 * @iov_pfn in @domain.
 *
 * With @sg == NULL the physical range is contiguous and starts at
 * @phys_pfn.  With @sg != NULL the physical addresses are taken from the
 * scatterlist, and each entry's dma_address / dma_length is filled in as
 * it is reached; @phys_pfn is then overwritten from the scatterlist.
 *
 * @prot must contain DMA_PTE_READ and/or DMA_PTE_WRITE; only those two
 * bits and DMA_PTE_SNP are used.
 *
 * Returns 0 on success, -EINVAL when @prot grants neither read nor write,
 * or -ENOMEM when a page-table entry cannot be obtained.  Finding an
 * already populated PTE is logged and warned about but is not treated as
 * a failure.  BUGs if the end of the range is not supported by @domain.
 */
2133 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2134                             struct scatterlist *sg, unsigned long phys_pfn,
2135                             unsigned long nr_pages, int prot)
2136 {
2137         struct dma_pte *first_pte = NULL, *pte = NULL;
2138         phys_addr_t uninitialized_var(pteval);
             /* sg_res: pages still to map from the current sg entry (or in total when !sg) */
2139         unsigned long sg_res = 0;
2140         unsigned int largepage_lvl = 0;
2141         unsigned long lvl_pages = 0;
2142
2143         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2144
2145         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2146                 return -EINVAL;
2147
2148         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2149
             /* Contiguous case: the whole request is one run starting at phys_pfn. */
2150         if (!sg) {
2151                 sg_res = nr_pages;
2152                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2153         }
2154
2155         while (nr_pages > 0) {
2156                 uint64_t tmp;
2157
                     /* Starting a new scatterlist entry: publish its DMA address and reload pteval. */
2158                 if (!sg_res) {
2159                         sg_res = aligned_nrpages(sg->offset, sg->length);
2160                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2161                         sg->dma_length = sg->length;
2162                         pteval = (sg_phys(sg) & PAGE_MASK) | prot;
2163                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2164                 }
2165
                     /* No current PTE pointer: pick the page size and look the PTE up again. */
2166                 if (!pte) {
2167                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2168
2169                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2170                         if (!pte)
2171                                 return -ENOMEM;
2172                         /* It is large page*/
2173                         if (largepage_lvl > 1) {
2174                                 unsigned long nr_superpages, end_pfn;
2175
2176                                 pteval |= DMA_PTE_LARGE_PAGE;
2177                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2178
2179                                 nr_superpages = sg_res / lvl_pages;
2180                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2181
2182                                 /*
2183                                  * Ensure that old small page tables are
2184                                  * removed to make room for superpage(s).
2185                                  */
2186                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
2187                         } else {
2188                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2189                         }
2190
2191                 }
2192                 /* We don't need lock here, nobody else
2193                  * touches the iova range
2194                  */
                     /* Store pteval only if the slot still holds 0; tmp is the previous value. */
2195                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2196                 if (tmp) {
                             /* Limits the full mapping dump to the first five occurrences. */
2197                         static int dumps = 5;
2198                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2199                                 iov_pfn, tmp, (unsigned long long)pteval);
2200                         if (dumps) {
2201                                 dumps--;
2202                                 debug_dma_dump_mappings(NULL);
2203                         }
2204                         WARN_ON(1);
2205                 }
2206
2207                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2208
2209                 BUG_ON(nr_pages < lvl_pages);
2210                 BUG_ON(sg_res < lvl_pages);
2211
                     /* Advance every cursor by the number of VT-d pages this PTE covered. */
2212                 nr_pages -= lvl_pages;
2213                 iov_pfn += lvl_pages;
2214                 phys_pfn += lvl_pages;
2215                 pteval += lvl_pages * VTD_PAGE_SIZE;
2216                 sg_res -= lvl_pages;
2217
2218                 /* If the next PTE would be the first in a new page, then we
2219                    need to flush the cache on the entries we've just written.
2220                    And then we'll need to recalculate 'pte', so clear it and
2221                    let it get set again in the if (!pte) block above.
2222
2223                    If we're done (!nr_pages) we need to flush the cache too.
2224
2225                    Also if we've been setting superpages, we may need to
2226                    recalculate 'pte' and switch back to smaller pages for the
2227                    end of the mapping, if the trailing size is not enough to
2228                    use another superpage (i.e. sg_res < lvl_pages). */
2229                 pte++;
2230                 if (!nr_pages || first_pte_in_page(pte) ||
2231                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2232                         domain_flush_cache(domain, first_pte,
2233                                            (void *)pte - (void *)first_pte);
2234                         pte = NULL;
2235                 }
2236
                     /* Current sg entry exhausted but more to map: move to the next entry. */
2237                 if (!sg_res && nr_pages)
2238                         sg = sg_next(sg);
2239         }
2240         return 0;
2241 }
2242
/*
 * Map @nr_pages pages described by scatterlist @sg at @iov_pfn in @domain.
 * Thin wrapper around __domain_mapping() with no explicit physical pfn.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain,
				    unsigned long iov_pfn,
				    struct scatterlist *sg,
				    unsigned long nr_pages,
				    int prot)
{
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}
2249
2250 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2251                                      unsigned long phys_pfn, unsigned long nr_pages,
2252                                      int prot)
2253 {
2254         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2255 }
2256
2257 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2258 {
2259         if (!iommu)
2260                 return;
2261
2262         clear_context_table(iommu, bus, devfn);
2263         iommu->flush.flush_context(iommu, 0, 0, 0,
2264                                            DMA_CCMD_GLOBAL_INVL);
2265         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2266 }
2267
2268 static inline void unlink_domain_info(struct device_domain_info *info)
2269 {
2270         assert_spin_locked(&device_domain_lock);
2271         list_del(&info->link);
2272         list_del(&info->global);
2273         if (info->dev)
2274                 info->dev->archdata.iommu = NULL;
2275 }
2276
2277 static void domain_remove_dev_info(struct dmar_domain *domain)
2278 {
2279         struct device_domain_info *info, *tmp;
2280         unsigned long flags;
2281
2282         spin_lock_irqsave(&device_domain_lock, flags);
2283         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2284                 __dmar_remove_one_dev_info(info);
2285         spin_unlock_irqrestore(&device_domain_lock, flags);
2286 }
2287
2288 /*
2289  * find_domain
2290  * Note: we use struct device->archdata.iommu stores the info
2291  */
2292 static struct dmar_domain *find_domain(struct device *dev)
2293 {
2294         struct device_domain_info *info;
2295
2296         /* No lock here, assumes no domain exit in normal case */
2297         info = dev->archdata.iommu;
2298         if (info)
2299                 return info->domain;
2300         return NULL;
2301 }
2302
2303 static inline struct device_domain_info *
2304 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2305 {
2306         struct device_domain_info *info;
2307
2308         list_for_each_entry(info, &device_domain_list, global)
2309                 if (info->iommu->segment == segment && info->bus == bus &&
2310                     info->devfn == devfn)
2311                         return info;
2312
2313         return NULL;
2314 }
2315
2316 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2317                                                     int bus, int devfn,
2318                                                     struct device *dev,
2319                                                     struct dmar_domain *domain)
2320 {
2321         struct dmar_domain *found = NULL;
2322         struct device_domain_info *info;
2323         unsigned long flags;
2324         int ret;
2325
2326         info = alloc_devinfo_mem();
2327         if (!info)
2328                 return NULL;
2329
2330         info->bus = bus;
2331         info->devfn = devfn;
2332         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2333         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2334         info->ats_qdep = 0;
2335         info->dev = dev;
2336         info->domain = domain;
2337         info->iommu = iommu;
2338
2339         if (dev && dev_is_pci(dev)) {
2340                 struct pci_dev *pdev = to_pci_dev(info->dev);
2341
2342                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2343                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2344                     dmar_find_matched_atsr_unit(pdev))
2345                         info->ats_supported = 1;
2346
2347                 if (ecs_enabled(iommu)) {
2348                         if (pasid_enabled(iommu)) {
2349                                 int features = pci_pasid_features(pdev);
2350                                 if (features >= 0)
2351                                         info->pasid_supported = features | 1;
2352                         }
2353
2354                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2355                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2356                                 info->pri_supported = 1;
2357                 }
2358         }
2359
2360         spin_lock_irqsave(&device_domain_lock, flags);
2361         if (dev)
2362                 found = find_domain(dev);
2363
2364         if (!found) {
2365                 struct device_domain_info *info2;
2366                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2367                 if (info2) {
2368                         found      = info2->domain;
2369                         info2->dev = dev;
2370                 }
2371         }
2372
2373         if (found) {
2374                 spin_unlock_irqrestore(&device_domain_lock, flags);
2375                 free_devinfo_mem(info);
2376                 /* Caller must free the original domain */
2377                 return found;
2378         }
2379
2380         spin_lock(&iommu->lock);
2381         ret = domain_attach_iommu(domain, iommu);
2382         spin_unlock(&iommu->lock);
2383
2384         if (ret) {
2385                 spin_unlock_irqrestore(&device_domain_lock, flags);
2386                 free_devinfo_mem(info);
2387                 return NULL;
2388         }
2389
2390         list_add(&info->link, &domain->devices);
2391         list_add(&info->global, &device_domain_list);
2392         if (dev)
2393                 dev->archdata.iommu = info;
2394         spin_unlock_irqrestore(&device_domain_lock, flags);
2395
2396         if (dev && domain_context_mapping(domain, dev)) {
2397                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2398                 dmar_remove_one_dev_info(domain, dev);
2399                 return NULL;
2400         }
2401
2402         return domain;
2403 }
2404
2405 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2406 {
2407         *(u16 *)opaque = alias;
2408         return 0;
2409 }
2410
2411 /* domain is initialized */
2412 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2413 {
2414         struct device_domain_info *info = NULL;
2415         struct dmar_domain *domain, *tmp;
2416         struct intel_iommu *iommu;
2417         u16 req_id, dma_alias;
2418         unsigned long flags;
2419         u8 bus, devfn;
2420
2421         domain = find_domain(dev);
2422         if (domain)
2423                 return domain;
2424
2425         iommu = device_to_iommu(dev, &bus, &devfn);
2426         if (!iommu)
2427                 return NULL;
2428
2429         req_id = ((u16)bus << 8) | devfn;
2430
2431         if (dev_is_pci(dev)) {
2432                 struct pci_dev *pdev = to_pci_dev(dev);
2433
2434                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2435
2436                 spin_lock_irqsave(&device_domain_lock, flags);
2437                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2438                                                       PCI_BUS_NUM(dma_alias),
2439                                                       dma_alias & 0xff);
2440                 if (info) {
2441                         iommu = info->iommu;
2442                         domain = info->domain;
2443                 }
2444                 spin_unlock_irqrestore(&device_domain_lock, flags);
2445
2446                 /* DMA alias already has a domain, uses it */
2447                 if (info)
2448                         goto found_domain;
2449         }
2450
2451         /* Allocate and initialize new domain for the device */
2452         domain = alloc_domain(0);
2453         if (!domain)
2454                 return NULL;
2455         if (domain_init(domain, iommu, gaw)) {
2456                 domain_exit(domain);
2457                 return NULL;
2458         }
2459
2460         /* register PCI DMA alias device */
2461         if (req_id != dma_alias && dev_is_pci(dev)) {
2462                 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2463                                                dma_alias & 0xff, NULL, domain);
2464
2465                 if (!tmp || tmp != domain) {
2466                         domain_exit(domain);
2467                         domain = tmp;
2468                 }
2469
2470                 if (!domain)
2471                         return NULL;
2472         }
2473
2474 found_domain:
2475         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2476
2477         if (!tmp || tmp != domain) {
2478                 domain_exit(domain);
2479                 domain = tmp;
2480         }
2481
2482         return domain;
2483 }
2484
2485 static int iommu_domain_identity_map(struct dmar_domain *domain,
2486                                      unsigned long long start,
2487                                      unsigned long long end)
2488 {
2489         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2490         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2491
2492         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2493                           dma_to_mm_pfn(last_vpfn))) {
2494                 pr_err("Reserving iova failed\n");
2495                 return -ENOMEM;
2496         }
2497
2498         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2499         /*
2500          * RMRR range might have overlap with physical memory range,
2501          * clear it first
2502          */
2503         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2504
2505         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2506                                   last_vpfn - first_vpfn + 1,
2507                                   DMA_PTE_READ|DMA_PTE_WRITE);
2508 }
2509
2510 static int domain_prepare_identity_map(struct device *dev,
2511                                        struct dmar_domain *domain,
2512                                        unsigned long long start,
2513                                        unsigned long long end)
2514 {
2515         /* For _hardware_ passthrough, don't bother. But for software
2516            passthrough, we do it anyway -- it may indicate a memory
2517            range which is reserved in E820, so which didn't get set
2518            up to start with in si_domain */
2519         if (domain == si_domain && hw_pass_through) {
2520                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2521                         dev_name(dev), start, end);
2522                 return 0;
2523         }
2524
2525         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2526                 dev_name(dev), start, end);
2527
2528         if (end < start) {
2529                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2530                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2531                         dmi_get_system_info(DMI_BIOS_VENDOR),
2532                         dmi_get_system_info(DMI_BIOS_VERSION),
2533                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2534                 return -EIO;
2535         }
2536
2537         if (end >> agaw_to_width(domain->agaw)) {
2538                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2539                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2540                      agaw_to_width(domain->agaw),
2541                      dmi_get_system_info(DMI_BIOS_VENDOR),
2542                      dmi_get_system_info(DMI_BIOS_VERSION),
2543                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2544                 return -EIO;
2545         }
2546
2547         return iommu_domain_identity_map(domain, start, end);
2548 }
2549
2550 static int iommu_prepare_identity_map(struct device *dev,
2551                                       unsigned long long start,
2552                                       unsigned long long end)
2553 {
2554         struct dmar_domain *domain;
2555         int ret;
2556
2557         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2558         if (!domain)
2559                 return -ENOMEM;
2560
2561         ret = domain_prepare_identity_map(dev, domain, start, end);
2562         if (ret)
2563                 domain_exit(domain);
2564
2565         return ret;
2566 }
2567
2568 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2569                                          struct device *dev)
2570 {
2571         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2572                 return 0;
2573         return iommu_prepare_identity_map(dev, rmrr->base_address,
2574                                           rmrr->end_address);
2575 }
2576
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Floppy workaround: identity-map the first 16MiB for the first ISA
 * bridge found on the PCI bus.  A failure is only logged.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *isa_bridge;

	isa_bridge = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (isa_bridge == NULL)
		return;

	pr_info("Prepare 0-16MiB unity mapping for LPC\n");

	if (iommu_prepare_identity_map(&isa_bridge->dev, 0,
				       16*1024*1024 - 1))
		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");

	/* Drop the reference taken by pci_get_class(). */
	pci_dev_put(isa_bridge);
}
#else
/* Workaround not configured: nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2601
2602 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2603
2604 static int __init si_domain_init(int hw)
2605 {
2606         int nid, ret = 0;
2607
2608         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2609         if (!si_domain)
2610                 return -EFAULT;
2611
2612         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2613                 domain_exit(si_domain);
2614                 return -EFAULT;
2615         }
2616
2617         pr_debug("Identity mapping domain allocated\n");
2618
2619         if (hw)
2620                 return 0;
2621
2622         for_each_online_node(nid) {
2623                 unsigned long start_pfn, end_pfn;
2624                 int i;
2625
2626                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2627                         ret = iommu_domain_identity_map(si_domain,
2628                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2629                         if (ret)
2630                                 return ret;
2631                 }
2632         }
2633
2634         return 0;
2635 }
2636
2637 static int identity_mapping(struct device *dev)
2638 {
2639         struct device_domain_info *info;
2640
2641         if (likely(!iommu_identity_mapping))
2642                 return 0;
2643
2644         info = dev->archdata.iommu;
2645         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2646                 return (info->domain == si_domain);
2647
2648         return 0;
2649 }
2650
2651 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2652 {
2653         struct dmar_domain *ndomain;
2654         struct intel_iommu *iommu;
2655         u8 bus, devfn;
2656
2657         iommu = device_to_iommu(dev, &bus, &devfn);
2658         if (!iommu)
2659                 return -ENODEV;
2660
2661         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2662         if (ndomain != domain)
2663                 return -EBUSY;
2664
2665         return 0;
2666 }
2667
2668 static bool device_has_rmrr(struct device *dev)
2669 {
2670         struct dmar_rmrr_unit *rmrr;
2671         struct device *tmp;
2672         int i;
2673
2674         rcu_read_lock();
2675         for_each_rmrr_units(rmrr) {
2676                 /*
2677                  * Return TRUE if this RMRR contains the device that
2678                  * is passed in.
2679                  */
2680                 for_each_active_dev_scope(rmrr->devices,
2681                                           rmrr->devices_cnt, i, tmp)
2682                         if (tmp == dev) {
2683                                 rcu_read_unlock();
2684                                 return true;
2685                         }
2686         }
2687         rcu_read_unlock();
2688         return false;
2689 }
2690
2691 /*
2692  * There are a couple cases where we need to restrict the functionality of
2693  * devices associated with RMRRs.  The first is when evaluating a device for
2694  * identity mapping because problems exist when devices are moved in and out
2695  * of domains and their respective RMRR information is lost.  This means that
2696  * a device with associated RMRRs will never be in a "passthrough" domain.
2697  * The second is use of the device through the IOMMU API.  This interface
2698  * expects to have full control of the IOVA space for the device.  We cannot
2699  * satisfy both the requirement that RMRR access is maintained and have an
2700  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2701  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2702  * We therefore prevent devices associated with an RMRR from participating in
2703  * the IOMMU API, which eliminates them from device assignment.
2704  *
2705  * In both cases we assume that PCI USB devices with RMRRs have them largely
2706  * for historical reasons and that the RMRR space is not actively used post
2707  * boot.  This exclusion may change if vendors begin to abuse it.
2708  *
2709  * The same exception is made for graphics devices, with the requirement that
2710  * any use of the RMRR regions will be torn down before assigning the device
2711  * to a guest.
2712  */
2713 static bool device_is_rmrr_locked(struct device *dev)
2714 {
2715         if (!device_has_rmrr(dev))
2716                 return false;
2717
2718         if (dev_is_pci(dev)) {
2719                 struct pci_dev *pdev = to_pci_dev(dev);
2720
2721                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2722                         return false;
2723         }
2724
2725         return true;
2726 }
2727
2728 static int iommu_should_identity_map(struct device *dev, int startup)
2729 {
2730
2731         if (dev_is_pci(dev)) {
2732                 struct pci_dev *pdev = to_pci_dev(dev);
2733
2734                 if (device_is_rmrr_locked(dev))
2735                         return 0;
2736
2737                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2738                         return 1;
2739
2740                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2741                         return 1;
2742
2743                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2744                         return 0;
2745
2746                 /*
2747                  * We want to start off with all devices in the 1:1 domain, and
2748                  * take them out later if we find they can't access all of memory.
2749                  *
2750                  * However, we can't do this for PCI devices behind bridges,
2751                  * because all PCI devices behind the same bridge will end up
2752                  * with the same source-id on their transactions.
2753                  *
2754                  * Practically speaking, we can't change things around for these
2755                  * devices at run-time, because we can't be sure there'll be no
2756                  * DMA transactions in flight for any of their siblings.
2757                  *
2758                  * So PCI devices (unless they're on the root bus) as well as
2759                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2760                  * the 1:1 domain, just in _case_ one of their siblings turns out
2761                  * not to be able to map all of memory.
2762                  */
2763                 if (!pci_is_pcie(pdev)) {
2764                         if (!pci_is_root_bus(pdev->bus))
2765                                 return 0;
2766                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2767                                 return 0;
2768                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2769                         return 0;
2770         } else {
2771                 if (device_has_rmrr(dev))
2772                         return 0;
2773         }
2774
2775         /*
2776          * At boot time, we don't yet know if devices will be 64-bit capable.
2777          * Assume that they will — if they turn out not to be, then we can
2778          * take them out of the 1:1 domain later.
2779          */
2780         if (!startup) {
2781                 /*
2782                  * If the device's dma_mask is less than the system's memory
2783                  * size then this is not a candidate for identity mapping.
2784                  */
2785                 u64 dma_mask = *dev->dma_mask;
2786
2787                 if (dev->coherent_dma_mask &&
2788                     dev->coherent_dma_mask < dma_mask)
2789                         dma_mask = dev->coherent_dma_mask;
2790
2791                 return dma_mask >= dma_get_required_mask(dev);
2792         }
2793
2794         return 1;
2795 }
2796
2797 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2798 {
2799         int ret;
2800
2801         if (!iommu_should_identity_map(dev, 1))
2802                 return 0;
2803
2804         ret = domain_add_dev_info(si_domain, dev);
2805         if (!ret)
2806                 pr_info("%s identity mapping for device %s\n",
2807                         hw ? "Hardware" : "Software", dev_name(dev));
2808         else if (ret == -ENODEV)
2809                 /* device not associated with an iommu */
2810                 ret = 0;
2811
2812         return ret;
2813 }
2814
2815
2816 static int __init iommu_prepare_static_identity_mapping(int hw)
2817 {
2818         struct pci_dev *pdev = NULL;
2819         struct dmar_drhd_unit *drhd;
2820         struct intel_iommu *iommu;
2821         struct device *dev;
2822         int i;
2823         int ret = 0;
2824
2825         for_each_pci_dev(pdev) {
2826                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2827                 if (ret)
2828                         return ret;
2829         }
2830
2831         for_each_active_iommu(iommu, drhd)
2832                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2833                         struct acpi_device_physical_node *pn;
2834                         struct acpi_device *adev;
2835
2836                         if (dev->bus != &acpi_bus_type)
2837                                 continue;
2838
2839                         adev= to_acpi_device(dev);
2840                         mutex_lock(&adev->physical_node_lock);
2841                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2842                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2843                                 if (ret)
2844                                         break;
2845                         }
2846                         mutex_unlock(&adev->physical_node_lock);
2847                         if (ret)
2848                                 return ret;
2849                 }
2850
2851         return 0;
2852 }
2853
2854 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2855 {
2856         /*
2857          * Start from the sane iommu hardware state.
2858          * If the queued invalidation is already initialized by us
2859          * (for example, while enabling interrupt-remapping) then
2860          * we got the things already rolling from a sane state.
2861          */
2862         if (!iommu->qi) {
2863                 /*
2864                  * Clear any previous faults.
2865                  */
2866                 dmar_fault(-1, iommu);
2867                 /*
2868                  * Disable queued invalidation if supported and already enabled
2869                  * before OS handover.
2870                  */
2871                 dmar_disable_qi(iommu);
2872         }
2873
2874         if (dmar_enable_qi(iommu)) {
2875                 /*
2876                  * Queued Invalidate not enabled, use Register Based Invalidate
2877                  */
2878                 iommu->flush.flush_context = __iommu_flush_context;
2879                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2880                 pr_info("%s: Using Register based invalidation\n",
2881                         iommu->name);
2882         } else {
2883                 iommu->flush.flush_context = qi_flush_context;
2884                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2885                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2886         }
2887 }
2888
2889 static int copy_context_table(struct intel_iommu *iommu,
2890                               struct root_entry *old_re,
2891                               struct context_entry **tbl,
2892                               int bus, bool ext)
2893 {
2894         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2895         struct context_entry *new_ce = NULL, ce;
2896         struct context_entry *old_ce = NULL;
2897         struct root_entry re;
2898         phys_addr_t old_ce_phys;
2899
2900         tbl_idx = ext ? bus * 2 : bus;
2901         memcpy(&re, old_re, sizeof(re));
2902
2903         for (devfn = 0; devfn < 256; devfn++) {
2904                 /* First calculate the correct index */
2905                 idx = (ext ? devfn * 2 : devfn) % 256;
2906
2907                 if (idx == 0) {
2908                         /* First save what we may have and clean up */
2909                         if (new_ce) {
2910                                 tbl[tbl_idx] = new_ce;
2911                                 __iommu_flush_cache(iommu, new_ce,
2912                                                     VTD_PAGE_SIZE);
2913                                 pos = 1;
2914                         }
2915
2916                         if (old_ce)
2917                                 iounmap(old_ce);
2918
2919                         ret = 0;
2920                         if (devfn < 0x80)
2921                                 old_ce_phys = root_entry_lctp(&re);
2922                         else
2923                                 old_ce_phys = root_entry_uctp(&re);
2924
2925                         if (!old_ce_phys) {
2926                                 if (ext && devfn == 0) {
2927                                         /* No LCTP, try UCTP */
2928                                         devfn = 0x7f;
2929                                         continue;
2930                                 } else {
2931                                         goto out;
2932                                 }
2933                         }
2934
2935                         ret = -ENOMEM;
2936                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2937                                         MEMREMAP_WB);
2938                         if (!old_ce)
2939                                 goto out;
2940
2941                         new_ce = alloc_pgtable_page(iommu->node);
2942                         if (!new_ce)
2943                                 goto out_unmap;
2944
2945                         ret = 0;
2946                 }
2947
2948                 /* Now copy the context entry */
2949                 memcpy(&ce, old_ce + idx, sizeof(ce));
2950
2951                 if (!__context_present(&ce))
2952                         continue;
2953
2954                 did = context_domain_id(&ce);
2955                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2956                         set_bit(did, iommu->domain_ids);
2957
2958                 /*
2959                  * We need a marker for copied context entries. This
2960                  * marker needs to work for the old format as well as
2961                  * for extended context entries.
2962                  *
2963                  * Bit 67 of the context entry is used. In the old
2964                  * format this bit is available to software, in the
2965                  * extended format it is the PGE bit, but PGE is ignored
2966                  * by HW if PASIDs are disabled (and thus still
2967                  * available).
2968                  *
2969                  * So disable PASIDs first and then mark the entry
2970                  * copied. This means that we don't copy PASID
2971                  * translations from the old kernel, but this is fine as
2972                  * faults there are not fatal.
2973                  */
2974                 context_clear_pasid_enable(&ce);
2975                 context_set_copied(&ce);
2976
2977                 new_ce[idx] = ce;
2978         }
2979
2980         tbl[tbl_idx + pos] = new_ce;
2981
2982         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2983
2984 out_unmap:
2985         memunmap(old_ce);
2986
2987 out:
2988         return ret;
2989 }
2990
2991 static int copy_translation_tables(struct intel_iommu *iommu)
2992 {
2993         struct context_entry **ctxt_tbls;
2994         struct root_entry *old_rt;
2995         phys_addr_t old_rt_phys;
2996         int ctxt_table_entries;
2997         unsigned long flags;
2998         u64 rtaddr_reg;
2999         int bus, ret;
3000         bool new_ext, ext;
3001
3002         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3003         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3004         new_ext    = !!ecap_ecs(iommu->ecap);
3005
3006         /*
3007          * The RTT bit can only be changed when translation is disabled,
3008          * but disabling translation means to open a window for data
3009          * corruption. So bail out and don't copy anything if we would
3010          * have to change the bit.
3011          */
3012         if (new_ext != ext)
3013                 return -EINVAL;
3014
3015         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3016         if (!old_rt_phys)
3017                 return -EINVAL;
3018
3019         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3020         if (!old_rt)
3021                 return -ENOMEM;
3022
3023         /* This is too big for the stack - allocate it from slab */
3024         ctxt_table_entries = ext ? 512 : 256;
3025         ret = -ENOMEM;
3026         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3027         if (!ctxt_tbls)
3028                 goto out_unmap;
3029
3030         for (bus = 0; bus < 256; bus++) {
3031                 ret = copy_context_table(iommu, &old_rt[bus],
3032                                          ctxt_tbls, bus, ext);
3033                 if (ret) {
3034                         pr_err("%s: Failed to copy context table for bus %d\n",
3035                                 iommu->name, bus);
3036                         continue;
3037                 }
3038         }
3039
3040         spin_lock_irqsave(&iommu->lock, flags);
3041
3042         /* Context tables are copied, now write them to the root_entry table */
3043         for (bus = 0; bus < 256; bus++) {
3044                 int idx = ext ? bus * 2 : bus;
3045                 u64 val;
3046
3047                 if (ctxt_tbls[idx]) {
3048                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3049                         iommu->root_entry[bus].lo = val;
3050                 }
3051
3052                 if (!ext || !ctxt_tbls[idx + 1])
3053                         continue;
3054
3055                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3056                 iommu->root_entry[bus].hi = val;
3057         }
3058
3059         spin_unlock_irqrestore(&iommu->lock, flags);
3060
3061         kfree(ctxt_tbls);
3062
3063         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3064
3065         ret = 0;
3066
3067 out_unmap:
3068         memunmap(old_rt);
3069
3070         return ret;
3071 }
3072
3073 static int __init init_dmars(void)
3074 {
3075         struct dmar_drhd_unit *drhd;
3076         struct dmar_rmrr_unit *rmrr;
3077         bool copied_tables = false;
3078         struct device *dev;
3079         struct intel_iommu *iommu;
3080         int i, ret;
3081
3082         /*
3083          * for each drhd
3084          *    allocate root
3085          *    initialize and program root entry to not present
3086          * endfor
3087          */
3088         for_each_drhd_unit(drhd) {
3089                 /*
3090                  * lock not needed as this is only incremented in the single
3091                  * threaded kernel __init code path all other access are read
3092                  * only
3093                  */
3094                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3095                         g_num_of_iommus++;
3096                         continue;
3097                 }
3098                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3099         }
3100
3101         /* Preallocate enough resources for IOMMU hot-addition */
3102         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3103                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3104
3105         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3106                         GFP_KERNEL);
3107         if (!g_iommus) {
3108                 pr_err("Allocating global iommu array failed\n");
3109                 ret = -ENOMEM;
3110                 goto error;
3111         }
3112
3113         deferred_flush = kzalloc(g_num_of_iommus *
3114                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
3115         if (!deferred_flush) {
3116                 ret = -ENOMEM;
3117                 goto free_g_iommus;
3118         }
3119
3120         for_each_active_iommu(iommu, drhd) {
3121                 g_iommus[iommu->seq_id] = iommu;
3122
3123                 intel_iommu_init_qi(iommu);
3124
3125                 ret = iommu_init_domains(iommu);
3126                 if (ret)
3127                         goto free_iommu;
3128
3129                 init_translation_status(iommu);
3130
3131                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3132                         iommu_disable_translation(iommu);
3133                         clear_translation_pre_enabled(iommu);
3134                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3135                                 iommu->name);
3136                 }
3137
3138                 /*
3139                  * TBD:
3140                  * we could share the same root & context tables
3141                  * among all IOMMU's. Need to Split it later.
3142                  */
3143                 ret = iommu_alloc_root_entry(iommu);
3144                 if (ret)
3145                         goto free_iommu;
3146
3147                 if (translation_pre_enabled(iommu)) {
3148                         pr_info("Translation already enabled - trying to copy translation structures\n");
3149
3150                         ret = copy_translation_tables(iommu);
3151                         if (ret) {
3152                                 /*
3153                                  * We found the IOMMU with translation
3154                                  * enabled - but failed to copy over the
3155                                  * old root-entry table. Try to proceed
3156                                  * by disabling translation now and
3157                                  * allocating a clean root-entry table.
3158                                  * This might cause DMAR faults, but
3159                                  * probably the dump will still succeed.
3160                                  */
3161                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3162                                        iommu->name);
3163                                 iommu_disable_translation(iommu);
3164                                 clear_translation_pre_enabled(iommu);
3165                         } else {
3166                                 pr_info("Copied translation tables from previous kernel for %s\n",
3167                                         iommu->name);
3168                                 copied_tables = true;
3169                         }
3170                 }
3171
3172                 iommu_flush_write_buffer(iommu);
3173                 iommu_set_root_entry(iommu);
3174                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3175                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3176
3177                 if (!ecap_pass_through(iommu->ecap))
3178                         hw_pass_through = 0;
3179 #ifdef CONFIG_INTEL_IOMMU_SVM
3180                 if (pasid_enabled(iommu))
3181                         intel_svm_alloc_pasid_tables(iommu);
3182 #endif
3183         }
3184
3185         if (iommu_pass_through)
3186                 iommu_identity_mapping |= IDENTMAP_ALL;
3187
3188 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3189         iommu_identity_mapping |= IDENTMAP_GFX;
3190 #endif
3191
3192         if (iommu_identity_mapping) {
3193                 ret = si_domain_init(hw_pass_through);
3194                 if (ret)
3195                         goto free_iommu;
3196         }
3197
3198         check_tylersburg_isoch();
3199
3200         /*
3201          * If we copied translations from a previous kernel in the kdump
3202          * case, we can not assign the devices to domains now, as that
3203          * would eliminate the old mappings. So skip this part and defer
3204          * the assignment to device driver initialization time.
3205          */
3206         if (copied_tables)
3207                 goto domains_done;
3208
3209         /*
3210          * If pass through is not set or not enabled, setup context entries for
3211          * identity mappings for rmrr, gfx, and isa and may fall back to static
3212          * identity mapping if iommu_identity_mapping is set.
3213          */
3214         if (iommu_identity_mapping) {
3215                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3216                 if (ret) {
3217                         pr_crit("Failed to setup IOMMU pass-through\n");
3218                         goto free_iommu;
3219                 }
3220         }
3221         /*
3222          * For each rmrr
3223          *   for each dev attached to rmrr
3224          *   do
3225          *     locate drhd for dev, alloc domain for dev
3226          *     allocate free domain
3227          *     allocate page table entries for rmrr
3228          *     if context not allocated for bus
3229          *           allocate and init context
3230          *           set present in root table for this bus
3231          *     init context with domain, translation etc
3232          *    endfor
3233          * endfor
3234          */
3235         pr_info("Setting RMRR:\n");
3236         for_each_rmrr_units(rmrr) {
3237                 /* some BIOS lists non-exist devices in DMAR table. */
3238                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3239                                           i, dev) {
3240                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3241                         if (ret)
3242                                 pr_err("Mapping reserved region failed\n");
3243                 }
3244         }
3245
3246         iommu_prepare_isa();
3247
3248 domains_done:
3249
3250         /*
3251          * for each drhd
3252          *   enable fault log
3253          *   global invalidate context cache
3254          *   global invalidate iotlb
3255          *   enable translation
3256          */
3257         for_each_iommu(iommu, drhd) {
3258                 if (drhd->ignored) {
3259                         /*
3260                          * we always have to disable PMRs or DMA may fail on
3261                          * this device
3262                          */
3263                         if (force_on)
3264                                 iommu_disable_protect_mem_regions(iommu);
3265                         continue;
3266                 }
3267
3268                 iommu_flush_write_buffer(iommu);
3269
3270 #ifdef CONFIG_INTEL_IOMMU_SVM
3271                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3272                         ret = intel_svm_enable_prq(iommu);
3273                         if (ret)
3274                                 goto free_iommu;
3275                 }
3276 #endif
3277                 ret = dmar_set_interrupt(iommu);
3278                 if (ret)
3279                         goto free_iommu;
3280
3281                 if (!translation_pre_enabled(iommu))
3282                         iommu_enable_translation(iommu);
3283
3284                 iommu_disable_protect_mem_regions(iommu);
3285         }
3286
3287         return 0;
3288
3289 free_iommu:
3290         for_each_active_iommu(iommu, drhd) {
3291                 disable_dmar_iommu(iommu);
3292                 free_dmar_iommu(iommu);
3293         }
3294         kfree(deferred_flush);
3295 free_g_iommus:
3296         kfree(g_iommus);
3297 error:
3298         return ret;
3299 }
3300
3301 /* This takes a number of _MM_ pages, not VTD pages */
3302 static struct iova *intel_alloc_iova(struct device *dev,
3303                                      struct dmar_domain *domain,
3304                                      unsigned long nrpages, uint64_t dma_mask)
3305 {
3306         struct iova *iova = NULL;
3307
3308         /* Restrict dma_mask to the width that the iommu can handle */
3309         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3310         /* Ensure we reserve the whole size-aligned region */
3311         nrpages = __roundup_pow_of_two(nrpages);
3312
3313         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3314                 /*
3315                  * First try to allocate an io virtual address in
3316                  * DMA_BIT_MASK(32) and if that fails then try allocating
3317                  * from higher range
3318                  */
3319                 iova = alloc_iova(&domain->iovad, nrpages,
3320                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
3321                 if (iova)
3322                         return iova;
3323         }
3324         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3325         if (unlikely(!iova)) {
3326                 pr_err("Allocating %ld-page iova for %s failed",
3327                        nrpages, dev_name(dev));
3328                 return NULL;
3329         }
3330
3331         return iova;
3332 }
3333
3334 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3335 {
3336         struct dmar_rmrr_unit *rmrr;
3337         struct dmar_domain *domain;
3338         struct device *i_dev;
3339         int i, ret;
3340
3341         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3342         if (!domain) {
3343                 pr_err("Allocating domain for %s failed\n",
3344                        dev_name(dev));
3345                 return NULL;
3346         }
3347
3348         /* We have a new domain - setup possible RMRRs for the device */
3349         rcu_read_lock();
3350         for_each_rmrr_units(rmrr) {
3351                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3352                                           i, i_dev) {
3353                         if (i_dev != dev)
3354                                 continue;
3355
3356                         ret = domain_prepare_identity_map(dev, domain,
3357                                                           rmrr->base_address,
3358                                                           rmrr->end_address);
3359                         if (ret)
3360                                 dev_err(dev, "Mapping reserved region failed\n");
3361                 }
3362         }
3363         rcu_read_unlock();
3364
3365         return domain;
3366 }
3367
3368 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3369 {
3370         struct device_domain_info *info;
3371
3372         /* No lock here, assumes no domain exit in normal case */
3373         info = dev->archdata.iommu;
3374         if (likely(info))
3375                 return info->domain;
3376
3377         return __get_valid_domain_for_dev(dev);
3378 }
3379
3380 /* Check if the dev needs to go through non-identity map and unmap process.*/
3381 static int iommu_no_mapping(struct device *dev)
3382 {
3383         int found;
3384
3385         if (iommu_dummy(dev))
3386                 return 1;
3387
3388         if (!iommu_identity_mapping)
3389                 return 0;
3390
3391         found = identity_mapping(dev);
3392         if (found) {
3393                 if (iommu_should_identity_map(dev, 0))
3394                         return 1;
3395                 else {
3396                         /*
3397                          * 32 bit DMA is removed from si_domain and fall back
3398                          * to non-identity mapping.
3399                          */
3400                         dmar_remove_one_dev_info(si_domain, dev);
3401                         pr_info("32bit %s uses non-identity mapping\n",
3402                                 dev_name(dev));
3403                         return 0;
3404                 }
3405         } else {
3406                 /*
3407                  * In case of a detached 64 bit DMA device from vm, the device
3408                  * is put into si_domain for identity mapping.
3409                  */
3410                 if (iommu_should_identity_map(dev, 0)) {
3411                         int ret;
3412                         ret = domain_add_dev_info(si_domain, dev);
3413                         if (!ret) {
3414                                 pr_info("64bit %s uses identity mapping\n",
3415                                         dev_name(dev));
3416                                 return 1;
3417                         }
3418                 }
3419         }
3420
3421         return 0;
3422 }
3423
3424 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3425                                      size_t size, int dir, u64 dma_mask)
3426 {
3427         struct dmar_domain *domain;
3428         phys_addr_t start_paddr;
3429         struct iova *iova;
3430         int prot = 0;
3431         int ret;
3432         struct intel_iommu *iommu;
3433         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3434
3435         BUG_ON(dir == DMA_NONE);
3436
3437         if (iommu_no_mapping(dev))
3438                 return paddr;
3439
3440         domain = get_valid_domain_for_dev(dev);
3441         if (!domain)
3442                 return 0;
3443
3444         iommu = domain_get_iommu(domain);
3445         size = aligned_nrpages(paddr, size);
3446
3447         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3448         if (!iova)
3449                 goto error;
3450
3451         /*
3452          * Check if DMAR supports zero-length reads on write only
3453          * mappings..
3454          */
3455         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3456                         !cap_zlr(iommu->cap))
3457                 prot |= DMA_PTE_READ;
3458         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3459                 prot |= DMA_PTE_WRITE;
3460         /*
3461          * paddr - (paddr + size) might be partial page, we should map the whole
3462          * page.  Note: if two part of one page are separately mapped, we
3463          * might have two guest_addr mapping to the same host paddr, but this
3464          * is not a big problem
3465          */
3466         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3467                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3468         if (ret)
3469                 goto error;
3470
3471         /* it's a non-present to present mapping. Only flush if caching mode */
3472         if (cap_caching_mode(iommu->cap))
3473                 iommu_flush_iotlb_psi(iommu, domain,
3474                                       mm_to_dma_pfn(iova->pfn_lo),
3475                                       size, 0, 1);
3476         else
3477                 iommu_flush_write_buffer(iommu);
3478
3479         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3480         start_paddr += paddr & ~PAGE_MASK;
3481         return start_paddr;
3482
3483 error:
3484         if (iova)
3485                 __free_iova(&domain->iovad, iova);
3486         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3487                 dev_name(dev), size, (unsigned long long)paddr, dir);
3488         return 0;
3489 }
3490
3491 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3492                                  unsigned long offset, size_t size,
3493                                  enum dma_data_direction dir,
3494                                  struct dma_attrs *attrs)
3495 {
3496         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3497                                   dir, *dev->dma_mask);
3498 }
3499
3500 static void flush_unmaps(void)
3501 {
3502         int i, j;
3503
3504         timer_on = 0;
3505
3506         /* just flush them all */
3507         for (i = 0; i < g_num_of_iommus; i++) {
3508                 struct intel_iommu *iommu = g_iommus[i];
3509                 if (!iommu)
3510                         continue;
3511
3512                 if (!deferred_flush[i].next)
3513                         continue;
3514
3515                 /* In caching mode, global flushes turn emulation expensive */
3516                 if (!cap_caching_mode(iommu->cap))
3517                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3518                                          DMA_TLB_GLOBAL_FLUSH);
3519                 for (j = 0; j < deferred_flush[i].next; j++) {
3520                         unsigned long mask;
3521                         struct iova *iova = deferred_flush[i].iova[j];
3522                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3523
3524                         /* On real hardware multiple invalidations are expensive */
3525                         if (cap_caching_mode(iommu->cap))
3526                                 iommu_flush_iotlb_psi(iommu, domain,
3527                                         iova->pfn_lo, iova_size(iova),
3528                                         !deferred_flush[i].freelist[j], 0);
3529                         else {
3530                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3531                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3532                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3533                         }
3534                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3535                         if (deferred_flush[i].freelist[j])
3536                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3537                 }
3538                 deferred_flush[i].next = 0;
3539         }
3540
3541         list_size = 0;
3542 }
3543
3544 static void flush_unmaps_timeout(unsigned long data)
3545 {
3546         unsigned long flags;
3547
3548         spin_lock_irqsave(&async_umap_flush_lock, flags);
3549         flush_unmaps();
3550         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3551 }
3552
3553 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3554 {
3555         unsigned long flags;
3556         int next, iommu_id;
3557         struct intel_iommu *iommu;
3558
3559         spin_lock_irqsave(&async_umap_flush_lock, flags);
3560         if (list_size == HIGH_WATER_MARK)
3561                 flush_unmaps();
3562
3563         iommu = domain_get_iommu(dom);
3564         iommu_id = iommu->seq_id;
3565
3566         next = deferred_flush[iommu_id].next;
3567         deferred_flush[iommu_id].domain[next] = dom;
3568         deferred_flush[iommu_id].iova[next] = iova;
3569         deferred_flush[iommu_id].freelist[next] = freelist;
3570         deferred_flush[iommu_id].next++;
3571
3572         if (!timer_on) {
3573                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3574                 timer_on = 1;
3575         }
3576         list_size++;
3577         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3578 }
3579
3580 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3581 {
3582         struct dmar_domain *domain;
3583         unsigned long start_pfn, last_pfn;
3584         struct iova *iova;
3585         struct intel_iommu *iommu;
3586         struct page *freelist;
3587
3588         if (iommu_no_mapping(dev))
3589                 return;
3590
3591         domain = find_domain(dev);
3592         BUG_ON(!domain);
3593
3594         iommu = domain_get_iommu(domain);
3595
3596         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3597         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3598                       (unsigned long long)dev_addr))
3599                 return;
3600
3601         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3602         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3603
3604         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3605                  dev_name(dev), start_pfn, last_pfn);
3606
3607         freelist = domain_unmap(domain, start_pfn, last_pfn);
3608
3609         if (intel_iommu_strict) {
3610                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3611                                       last_pfn - start_pfn + 1, !freelist, 0);
3612                 /* free iova */
3613                 __free_iova(&domain->iovad, iova);
3614                 dma_free_pagelist(freelist);
3615         } else {
3616                 add_unmap(domain, iova, freelist);
3617                 /*
3618                  * queue up the release of the unmap to save the 1/6th of the
3619                  * cpu used up by the iotlb flush operation...
3620                  */
3621         }
3622 }
3623
3624 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3625                              size_t size, enum dma_data_direction dir,
3626                              struct dma_attrs *attrs)
3627 {
3628         intel_unmap(dev, dev_addr);
3629 }
3630
3631 static void *intel_alloc_coherent(struct device *dev, size_t size,
3632                                   dma_addr_t *dma_handle, gfp_t flags,
3633                                   struct dma_attrs *attrs)
3634 {
3635         struct page *page = NULL;
3636         int order;
3637
3638         size = PAGE_ALIGN(size);
3639         order = get_order(size);
3640
3641         if (!iommu_no_mapping(dev))
3642                 flags &= ~(GFP_DMA | GFP_DMA32);
3643         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3644                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3645                         flags |= GFP_DMA;
3646                 else
3647                         flags |= GFP_DMA32;
3648         }
3649
3650         if (flags & __GFP_WAIT) {
3651                 unsigned int count = size >> PAGE_SHIFT;
3652
3653                 page = dma_alloc_from_contiguous(dev, count, order);
3654                 if (page && iommu_no_mapping(dev) &&
3655                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3656                         dma_release_from_contiguous(dev, page, count);
3657                         page = NULL;
3658                 }
3659         }
3660
3661         if (!page)
3662                 page = alloc_pages(flags, order);
3663         if (!page)
3664                 return NULL;
3665         memset(page_address(page), 0, size);
3666
3667         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3668                                          DMA_BIDIRECTIONAL,
3669                                          dev->coherent_dma_mask);
3670         if (*dma_handle)
3671                 return page_address(page);
3672         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3673                 __free_pages(page, order);
3674
3675         return NULL;
3676 }
3677
3678 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3679                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3680 {
3681         int order;
3682         struct page *page = virt_to_page(vaddr);
3683
3684         size = PAGE_ALIGN(size);
3685         order = get_order(size);
3686
3687         intel_unmap(dev, dma_handle);
3688         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3689                 __free_pages(page, order);
3690 }
3691
3692 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3693                            int nelems, enum dma_data_direction dir,
3694                            struct dma_attrs *attrs)
3695 {
3696         intel_unmap(dev, sglist[0].dma_address);
3697 }
3698
3699 static int intel_nontranslate_map_sg(struct device *hddev,
3700         struct scatterlist *sglist, int nelems, int dir)
3701 {
3702         int i;
3703         struct scatterlist *sg;
3704
3705         for_each_sg(sglist, sg, nelems, i) {
3706                 BUG_ON(!sg_page(sg));
3707                 sg->dma_address = sg_phys(sg);
3708                 sg->dma_length = sg->length;
3709         }
3710         return nelems;
3711 }
3712
3713 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3714                         enum dma_data_direction dir, struct dma_attrs *attrs)
3715 {
3716         int i;
3717         struct dmar_domain *domain;
3718         size_t size = 0;
3719         int prot = 0;
3720         struct iova *iova = NULL;
3721         int ret;
3722         struct scatterlist *sg;
3723         unsigned long start_vpfn;
3724         struct intel_iommu *iommu;
3725
3726         BUG_ON(dir == DMA_NONE);
3727         if (iommu_no_mapping(dev))
3728                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3729
3730         domain = get_valid_domain_for_dev(dev);
3731         if (!domain)
3732                 return 0;
3733
3734         iommu = domain_get_iommu(domain);
3735
3736         for_each_sg(sglist, sg, nelems, i)
3737                 size += aligned_nrpages(sg->offset, sg->length);
3738
3739         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3740                                 *dev->dma_mask);
3741         if (!iova) {
3742                 sglist->dma_length = 0;
3743                 return 0;
3744         }
3745
3746         /*
3747          * Check if DMAR supports zero-length reads on write only
3748          * mappings..
3749          */
3750         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3751                         !cap_zlr(iommu->cap))
3752                 prot |= DMA_PTE_READ;
3753         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3754                 prot |= DMA_PTE_WRITE;
3755
3756         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3757
3758         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3759         if (unlikely(ret)) {
3760                 dma_pte_free_pagetable(domain, start_vpfn,
3761                                        start_vpfn + size - 1);
3762                 __free_iova(&domain->iovad, iova);
3763                 return 0;
3764         }
3765
3766         /* it's a non-present to present mapping. Only flush if caching mode */
3767         if (cap_caching_mode(iommu->cap))
3768                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3769         else
3770                 iommu_flush_write_buffer(iommu);
3771
3772         return nelems;
3773 }
3774
3775 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3776 {
3777         return !dma_addr;
3778 }
3779
3780 struct dma_map_ops intel_dma_ops = {
3781         .alloc = intel_alloc_coherent,
3782         .free = intel_free_coherent,
3783         .map_sg = intel_map_sg,
3784         .unmap_sg = intel_unmap_sg,
3785         .map_page = intel_map_page,
3786         .unmap_page = intel_unmap_page,
3787         .mapping_error = intel_mapping_error,
3788 };
3789
3790 static inline int iommu_domain_cache_init(void)
3791 {
3792         int ret = 0;
3793
3794         iommu_domain_cache = kmem_cache_create("iommu_domain",
3795                                          sizeof(struct dmar_domain),
3796                                          0,
3797                                          SLAB_HWCACHE_ALIGN,
3798
3799                                          NULL);
3800         if (!iommu_domain_cache) {
3801                 pr_err("Couldn't create iommu_domain cache\n");
3802                 ret = -ENOMEM;
3803         }
3804
3805         return ret;
3806 }
3807
3808 static inline int iommu_devinfo_cache_init(void)
3809 {
3810         int ret = 0;
3811
3812         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3813                                          sizeof(struct device_domain_info),
3814                                          0,
3815                                          SLAB_HWCACHE_ALIGN,
3816                                          NULL);
3817         if (!iommu_devinfo_cache) {
3818                 pr_err("Couldn't create devinfo cache\n");
3819                 ret = -ENOMEM;
3820         }
3821
3822         return ret;
3823 }
3824
3825 static int __init iommu_init_mempool(void)
3826 {
3827         int ret;
3828         ret = iova_cache_get();
3829         if (ret)
3830                 return ret;
3831
3832         ret = iommu_domain_cache_init();
3833         if (ret)
3834                 goto domain_error;
3835
3836         ret = iommu_devinfo_cache_init();
3837         if (!ret)
3838                 return ret;
3839
3840         kmem_cache_destroy(iommu_domain_cache);
3841 domain_error:
3842         iova_cache_put();
3843
3844         return -ENOMEM;
3845 }
3846
3847 static void __init iommu_exit_mempool(void)
3848 {
3849         kmem_cache_destroy(iommu_devinfo_cache);
3850         kmem_cache_destroy(iommu_domain_cache);
3851         iova_cache_put();
3852 }
3853
3854 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3855 {
3856         struct dmar_drhd_unit *drhd;
3857         u32 vtbar;
3858         int rc;
3859
3860         /* We know that this device on this chipset has its own IOMMU.
3861          * If we find it under a different IOMMU, then the BIOS is lying
3862          * to us. Hope that the IOMMU for this device is actually
3863          * disabled, and it needs no translation...
3864          */
3865         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3866         if (rc) {
3867                 /* "can't" happen */
3868                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3869                 return;
3870         }
3871         vtbar &= 0xffff0000;
3872
3873         /* we know that the this iommu should be at offset 0xa000 from vtbar */
3874         drhd = dmar_find_matched_drhd_unit(pdev);
3875         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3876                             TAINT_FIRMWARE_WORKAROUND,
3877                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3878                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3879 }
3880 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3881
3882 static void __init init_no_remapping_devices(void)
3883 {
3884         struct dmar_drhd_unit *drhd;
3885         struct device *dev;
3886         int i;
3887
3888         for_each_drhd_unit(drhd) {
3889                 if (!drhd->include_all) {
3890                         for_each_active_dev_scope(drhd->devices,
3891                                                   drhd->devices_cnt, i, dev)
3892                                 break;
3893                         /* ignore DMAR unit if no devices exist */
3894                         if (i == drhd->devices_cnt)
3895                                 drhd->ignored = 1;
3896                 }
3897         }
3898
3899         for_each_active_drhd_unit(drhd) {
3900                 if (drhd->include_all)
3901                         continue;
3902
3903                 for_each_active_dev_scope(drhd->devices,
3904                                           drhd->devices_cnt, i, dev)
3905                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3906                                 break;
3907                 if (i < drhd->devices_cnt)
3908                         continue;
3909
3910                 /* This IOMMU has *only* gfx devices. Either bypass it or
3911                    set the gfx_mapped flag, as appropriate */
3912                 if (dmar_map_gfx) {
3913                         intel_iommu_gfx_mapped = 1;
3914                 } else {
3915                         drhd->ignored = 1;
3916                         for_each_active_dev_scope(drhd->devices,
3917                                                   drhd->devices_cnt, i, dev)
3918                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3919                 }
3920         }
3921 }
3922
3923 #ifdef CONFIG_SUSPEND
3924 static int init_iommu_hw(void)
3925 {
3926         struct dmar_drhd_unit *drhd;
3927         struct intel_iommu *iommu = NULL;
3928
3929         for_each_active_iommu(iommu, drhd)
3930                 if (iommu->qi)
3931                         dmar_reenable_qi(iommu);
3932
3933         for_each_iommu(iommu, drhd) {
3934                 if (drhd->ignored) {
3935                         /*
3936                          * we always have to disable PMRs or DMA may fail on
3937                          * this device
3938                          */
3939                         if (force_on)
3940                                 iommu_disable_protect_mem_regions(iommu);
3941                         continue;
3942                 }
3943         
3944                 iommu_flush_write_buffer(iommu);
3945
3946                 iommu_set_root_entry(iommu);
3947
3948                 iommu->flush.flush_context(iommu, 0, 0, 0,
3949                                            DMA_CCMD_GLOBAL_INVL);
3950                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3951                 iommu_enable_translation(iommu);
3952                 iommu_disable_protect_mem_regions(iommu);
3953         }
3954
3955         return 0;
3956 }
3957
3958 static void iommu_flush_all(void)
3959 {
3960         struct dmar_drhd_unit *drhd;
3961         struct intel_iommu *iommu;
3962
3963         for_each_active_iommu(iommu, drhd) {
3964                 iommu->flush.flush_context(iommu, 0, 0, 0,
3965                                            DMA_CCMD_GLOBAL_INVL);
3966                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3967                                          DMA_TLB_GLOBAL_FLUSH);
3968         }
3969 }
3970
3971 static int iommu_suspend(void)
3972 {
3973         struct dmar_drhd_unit *drhd;
3974         struct intel_iommu *iommu = NULL;
3975         unsigned long flag;
3976
3977         for_each_active_iommu(iommu, drhd) {
3978                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3979                                                  GFP_ATOMIC);
3980                 if (!iommu->iommu_state)
3981                         goto nomem;
3982         }
3983
3984         iommu_flush_all();
3985
3986         for_each_active_iommu(iommu, drhd) {
3987                 iommu_disable_translation(iommu);
3988
3989                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3990
3991                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3992                         readl(iommu->reg + DMAR_FECTL_REG);
3993                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3994                         readl(iommu->reg + DMAR_FEDATA_REG);
3995                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3996                         readl(iommu->reg + DMAR_FEADDR_REG);
3997                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3998                         readl(iommu->reg + DMAR_FEUADDR_REG);
3999
4000                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4001         }
4002         return 0;
4003
4004 nomem:
4005         for_each_active_iommu(iommu, drhd)
4006                 kfree(iommu->iommu_state);
4007
4008         return -ENOMEM;
4009 }
4010
4011 static void iommu_resume(void)
4012 {
4013         struct dmar_drhd_unit *drhd;
4014         struct intel_iommu *iommu = NULL;
4015         unsigned long flag;
4016
4017         if (init_iommu_hw()) {
4018                 if (force_on)
4019                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4020                 else
4021                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4022                 return;
4023         }
4024
4025         for_each_active_iommu(iommu, drhd) {
4026
4027                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4028
4029                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4030                         iommu->reg + DMAR_FECTL_REG);
4031                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4032                         iommu->reg + DMAR_FEDATA_REG);
4033                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4034                         iommu->reg + DMAR_FEADDR_REG);
4035                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4036                         iommu->reg + DMAR_FEUADDR_REG);
4037
4038                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4039         }
4040
4041         for_each_active_iommu(iommu, drhd)
4042                 kfree(iommu->iommu_state);
4043 }
4044
4045 static struct syscore_ops iommu_syscore_ops = {
4046         .resume         = iommu_resume,
4047         .suspend        = iommu_suspend,
4048 };
4049
4050 static void __init init_iommu_pm_ops(void)
4051 {
4052         register_syscore_ops(&iommu_syscore_ops);
4053 }
4054
4055 #else
/* Without CONFIG_SUSPEND there are no hooks to register. */
static inline void init_iommu_pm_ops(void)
{
}
4057 #endif  /* CONFIG_PM */
4058
4059
4060 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4061 {
4062         struct acpi_dmar_reserved_memory *rmrr;
4063         struct dmar_rmrr_unit *rmrru;
4064
4065         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4066         if (!rmrru)
4067                 return -ENOMEM;
4068
4069         rmrru->hdr = header;
4070         rmrr = (struct acpi_dmar_reserved_memory *)header;
4071         rmrru->base_address = rmrr->base_address;
4072         rmrru->end_address = rmrr->end_address;
4073         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4074                                 ((void *)rmrr) + rmrr->header.length,
4075                                 &rmrru->devices_cnt);
4076         if (rmrru->devices_cnt && rmrru->devices == NULL) {
4077                 kfree(rmrru);
4078                 return -ENOMEM;
4079         }
4080
4081         list_add(&rmrru->list, &dmar_rmrr_units);
4082
4083         return 0;
4084 }
4085
4086 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4087 {
4088         struct dmar_atsr_unit *atsru;
4089         struct acpi_dmar_atsr *tmp;
4090
4091         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4092                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4093                 if (atsr->segment != tmp->segment)
4094                         continue;
4095                 if (atsr->header.length != tmp->header.length)
4096                         continue;
4097                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4098                         return atsru;
4099         }
4100
4101         return NULL;
4102 }
4103
4104 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4105 {
4106         struct acpi_dmar_atsr *atsr;
4107         struct dmar_atsr_unit *atsru;
4108
4109         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4110                 return 0;
4111
4112         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4113         atsru = dmar_find_atsr(atsr);
4114         if (atsru)
4115                 return 0;
4116
4117         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4118         if (!atsru)
4119                 return -ENOMEM;
4120
4121         /*
4122          * If memory is allocated from slab by ACPI _DSM method, we need to
4123          * copy the memory content because the memory buffer will be freed
4124          * on return.
4125          */
4126         atsru->hdr = (void *)(atsru + 1);
4127         memcpy(atsru->hdr, hdr, hdr->length);
4128         atsru->include_all = atsr->flags & 0x1;
4129         if (!atsru->include_all) {
4130                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4131                                 (void *)atsr + atsr->header.length,
4132                                 &atsru->devices_cnt);
4133                 if (atsru->devices_cnt && atsru->devices == NULL) {
4134                         kfree(atsru);
4135                         return -ENOMEM;
4136                 }
4137         }
4138
4139         list_add_rcu(&atsru->list, &dmar_atsr_units);
4140
4141         return 0;
4142 }
4143
4144 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4145 {
4146         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4147         kfree(atsru);
4148 }
4149
4150 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4151 {
4152         struct acpi_dmar_atsr *atsr;
4153         struct dmar_atsr_unit *atsru;
4154
4155         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4156         atsru = dmar_find_atsr(atsr);
4157         if (atsru) {
4158                 list_del_rcu(&atsru->list);
4159                 synchronize_rcu();
4160                 intel_iommu_free_atsr(atsru);
4161         }
4162
4163         return 0;
4164 }
4165
4166 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4167 {
4168         int i;
4169         struct device *dev;
4170         struct acpi_dmar_atsr *atsr;
4171         struct dmar_atsr_unit *atsru;
4172
4173         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4174         atsru = dmar_find_atsr(atsr);
4175         if (!atsru)
4176                 return 0;
4177
4178         if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
4179                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4180                                           i, dev)
4181                         return -EBUSY;
4182
4183         return 0;
4184 }
4185
/*
 * Bring the IOMMU of @dmaru into service (used by dmar_iommu_hotplug() on
 * insert).
 *
 * Returns 0 on success, or when the unit is already present in g_iommus[].
 * Returns -ENXIO when the unit lacks a capability checked below
 * (pass-through, snooping, super pages); otherwise the error of the step
 * that failed. On the error paths after g_iommus[] has been populated the
 * unit is released again through free_dmar_iommu().
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
        int sp, ret = 0;
        struct intel_iommu *iommu = dmaru->iommu;

        /* Already registered: nothing to do. */
        if (g_iommus[iommu->seq_id])
                return 0;

        /*
         * Capability checks. NOTE(review): the domain_update_iommu_*()
         * helpers presumably report what the other IOMMUs provide, so that a
         * late unit is refused if it cannot match them - confirm against
         * their definitions.
         */
        if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
                pr_warn("%s: Doesn't support hardware pass through.\n",
                        iommu->name);
                return -ENXIO;
        }
        if (!ecap_sc_support(iommu->ecap) &&
            domain_update_iommu_snooping(iommu)) {
                pr_warn("%s: Doesn't support snooping.\n",
                        iommu->name);
                return -ENXIO;
        }
        sp = domain_update_iommu_superpage(iommu) - 1;
        if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
                pr_warn("%s: Doesn't support large page.\n",
                        iommu->name);
                return -ENXIO;
        }

        /*
         * Disable translation if already enabled prior to OS handover.
         */
        if (iommu->gcmd & DMA_GCMD_TE)
                iommu_disable_translation(iommu);

        /* Publish the unit, then allocate its domain tracking and root entry. */
        g_iommus[iommu->seq_id] = iommu;
        ret = iommu_init_domains(iommu);
        if (ret == 0)
                ret = iommu_alloc_root_entry(iommu);
        if (ret)
                goto out;

#ifdef CONFIG_INTEL_IOMMU_SVM
        /* The return value of the PASID table allocation is not checked here. */
        if (pasid_enabled(iommu))
                intel_svm_alloc_pasid_tables(iommu);
#endif

        if (dmaru->ignored) {
                /*
                 * we always have to disable PMRs or DMA may fail on this device
                 */
                if (force_on)
                        iommu_disable_protect_mem_regions(iommu);
                return 0;
        }

        intel_iommu_init_qi(iommu);
        iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
        if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
                ret = intel_svm_enable_prq(iommu);
                if (ret)
                        goto disable_iommu;
        }
#endif
        ret = dmar_set_interrupt(iommu);
        if (ret)
                goto disable_iommu;

        /*
         * Install the root entry and globally invalidate the context cache
         * and IOTLB before switching translation on.
         */
        iommu_set_root_entry(iommu);
        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
        iommu_enable_translation(iommu);

        iommu_disable_protect_mem_regions(iommu);
        return 0;

disable_iommu:
        disable_dmar_iommu(iommu);
out:
        free_dmar_iommu(iommu);
        return ret;
}
4267
4268 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4269 {
4270         int ret = 0;
4271         struct intel_iommu *iommu = dmaru->iommu;
4272
4273         if (!intel_iommu_enabled)
4274                 return 0;
4275         if (iommu == NULL)
4276                 return -EINVAL;
4277
4278         if (insert) {
4279                 ret = intel_iommu_add(dmaru);
4280         } else {
4281                 disable_dmar_iommu(iommu);
4282                 free_dmar_iommu(iommu);
4283         }
4284
4285         return ret;
4286 }
4287
4288 static void intel_iommu_free_dmars(void)
4289 {
4290         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4291         struct dmar_atsr_unit *atsru, *atsr_n;
4292
4293         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4294                 list_del(&rmrru->list);
4295                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4296                 kfree(rmrru);
4297         }
4298
4299         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4300                 list_del(&atsru->list);
4301                 intel_iommu_free_atsr(atsru);
4302         }
4303 }
4304
4305 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4306 {
4307         int i, ret = 1;
4308         struct pci_bus *bus;
4309         struct pci_dev *bridge = NULL;
4310         struct device *tmp;
4311         struct acpi_dmar_atsr *atsr;
4312         struct dmar_atsr_unit *atsru;
4313
4314         dev = pci_physfn(dev);
4315         for (bus = dev->bus; bus; bus = bus->parent) {
4316                 bridge = bus->self;
4317                 /* If it's an integrated device, allow ATS */
4318                 if (!bridge)
4319                         return 1;
4320                 /* Connected via non-PCIe: no ATS */
4321                 if (!pci_is_pcie(bridge) ||
4322                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4323                         return 0;
4324                 /* If we found the root port, look it up in the ATSR */
4325                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4326                         break;
4327         }
4328
4329         rcu_read_lock();
4330         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4331                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4332                 if (atsr->segment != pci_domain_nr(dev->bus))
4333                         continue;
4334
4335                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4336                         if (tmp == &bridge->dev)
4337                                 goto out;
4338
4339                 if (atsru->include_all)
4340                         goto out;
4341         }
4342         ret = 0;
4343 out:
4344         rcu_read_unlock();
4345
4346         return ret;
4347 }
4348
4349 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4350 {
4351         int ret = 0;
4352         struct dmar_rmrr_unit *rmrru;
4353         struct dmar_atsr_unit *atsru;
4354         struct acpi_dmar_atsr *atsr;
4355         struct acpi_dmar_reserved_memory *rmrr;
4356
4357         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4358                 return 0;
4359
4360         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4361                 rmrr = container_of(rmrru->hdr,
4362                                     struct acpi_dmar_reserved_memory, header);
4363                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4364                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4365                                 ((void *)rmrr) + rmrr->header.length,
4366                                 rmrr->segment, rmrru->devices,
4367                                 rmrru->devices_cnt);
4368                         if(ret < 0)
4369                                 return ret;
4370                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4371                         dmar_remove_dev_scope(info, rmrr->segment,
4372                                 rmrru->devices, rmrru->devices_cnt);
4373                 }
4374         }
4375
4376         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4377                 if (atsru->include_all)
4378                         continue;
4379
4380                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4381                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4382                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4383                                         (void *)atsr + atsr->header.length,
4384                                         atsr->segment, atsru->devices,
4385                                         atsru->devices_cnt);
4386                         if (ret > 0)
4387                                 break;
4388                         else if(ret < 0)
4389                                 return ret;
4390                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4391                         if (dmar_remove_dev_scope(info, atsr->segment,
4392                                         atsru->devices, atsru->devices_cnt))
4393                                 break;
4394                 }
4395         }
4396
4397         return 0;
4398 }
4399
4400 /*
4401  * Here we only respond to action of unbound device from driver.
4402  *
4403  * Added device is not attached to its DMAR domain here yet. That will happen
4404  * when mapping the device to iova.
4405  */
4406 static int device_notifier(struct notifier_block *nb,
4407                                   unsigned long action, void *data)
4408 {
4409         struct device *dev = data;
4410         struct dmar_domain *domain;
4411
4412         if (iommu_dummy(dev))
4413                 return 0;
4414
4415         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4416                 return 0;
4417
4418         domain = find_domain(dev);
4419         if (!domain)
4420                 return 0;
4421
4422         dmar_remove_one_dev_info(domain, dev);
4423         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4424                 domain_exit(domain);
4425
4426         return 0;
4427 }
4428
4429 static struct notifier_block device_nb = {
4430         .notifier_call = device_notifier,
4431 };
4432
4433 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4434                                        unsigned long val, void *v)
4435 {
4436         struct memory_notify *mhp = v;
4437         unsigned long long start, end;
4438         unsigned long start_vpfn, last_vpfn;
4439
4440         switch (val) {
4441         case MEM_GOING_ONLINE:
4442                 start = mhp->start_pfn << PAGE_SHIFT;
4443                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4444                 if (iommu_domain_identity_map(si_domain, start, end)) {
4445                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4446                                 start, end);
4447                         return NOTIFY_BAD;
4448                 }
4449                 break;
4450
4451         case MEM_OFFLINE:
4452         case MEM_CANCEL_ONLINE:
4453                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4454                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4455                 while (start_vpfn <= last_vpfn) {
4456                         struct iova *iova;
4457                         struct dmar_drhd_unit *drhd;
4458                         struct intel_iommu *iommu;
4459                         struct page *freelist;
4460
4461                         iova = find_iova(&si_domain->iovad, start_vpfn);
4462                         if (iova == NULL) {
4463                                 pr_debug("Failed get IOVA for PFN %lx\n",
4464                                          start_vpfn);
4465                                 break;
4466                         }
4467
4468                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4469                                                      start_vpfn, last_vpfn);
4470                         if (iova == NULL) {
4471                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4472                                         start_vpfn, last_vpfn);
4473                                 return NOTIFY_BAD;
4474                         }
4475
4476                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4477                                                iova->pfn_hi);
4478
4479                         rcu_read_lock();
4480                         for_each_active_iommu(iommu, drhd)
4481                                 iommu_flush_iotlb_psi(iommu, si_domain,
4482                                         iova->pfn_lo, iova_size(iova),
4483                                         !freelist, 0);
4484                         rcu_read_unlock();
4485                         dma_free_pagelist(freelist);
4486
4487                         start_vpfn = iova->pfn_hi + 1;
4488                         free_iova_mem(iova);
4489                 }
4490                 break;
4491         }
4492
4493         return NOTIFY_OK;
4494 }
4495
4496 static struct notifier_block intel_iommu_memory_nb = {
4497         .notifier_call = intel_iommu_memory_notifier,
4498         .priority = 0
4499 };
4500
4501
4502 static ssize_t intel_iommu_show_version(struct device *dev,
4503                                         struct device_attribute *attr,
4504                                         char *buf)
4505 {
4506         struct intel_iommu *iommu = dev_get_drvdata(dev);
4507         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4508         return sprintf(buf, "%d:%d\n",
4509                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4510 }
4511 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4512
4513 static ssize_t intel_iommu_show_address(struct device *dev,
4514                                         struct device_attribute *attr,
4515                                         char *buf)
4516 {
4517         struct intel_iommu *iommu = dev_get_drvdata(dev);
4518         return sprintf(buf, "%llx\n", iommu->reg_phys);
4519 }
4520 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4521
4522 static ssize_t intel_iommu_show_cap(struct device *dev,
4523                                     struct device_attribute *attr,
4524                                     char *buf)
4525 {
4526         struct intel_iommu *iommu = dev_get_drvdata(dev);
4527         return sprintf(buf, "%llx\n", iommu->cap);
4528 }
4529 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4530
4531 static ssize_t intel_iommu_show_ecap(struct device *dev,
4532                                     struct device_attribute *attr,
4533                                     char *buf)
4534 {
4535         struct intel_iommu *iommu = dev_get_drvdata(dev);
4536         return sprintf(buf, "%llx\n", iommu->ecap);
4537 }
4538 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4539
4540 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4541                                       struct device_attribute *attr,
4542                                       char *buf)
4543 {
4544         struct intel_iommu *iommu = dev_get_drvdata(dev);
4545         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4546 }
4547 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4548
4549 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4550                                            struct device_attribute *attr,
4551                                            char *buf)
4552 {
4553         struct intel_iommu *iommu = dev_get_drvdata(dev);
4554         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4555                                                   cap_ndoms(iommu->cap)));
4556 }
4557 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4558
/* NULL-terminated list of the read-only attributes defined above. */
static struct attribute *intel_iommu_attrs[] = {
        &dev_attr_version.attr,
        &dev_attr_address.attr,
        &dev_attr_cap.attr,
        &dev_attr_ecap.attr,
        &dev_attr_domains_supported.attr,
        &dev_attr_domains_used.attr,
        NULL,
};

/* The attributes are grouped under the name "intel-iommu". */
static struct attribute_group intel_iommu_group = {
        .name = "intel-iommu",
        .attrs = intel_iommu_attrs,
};

/*
 * NULL-terminated group list; intel_iommu_init() passes it to
 * iommu_device_create() for every active IOMMU.
 */
const struct attribute_group *intel_iommu_groups[] = {
        &intel_iommu_group,
        NULL,
};
4578
/*
 * Driver initialisation entry point.
 *
 * Sets up the memory pool, parses the DMAR table and device scopes, reserves
 * IOVA ranges, initialises all DMAR units and finally installs the DMA ops,
 * the sysfs devices, the PCI bus IOMMU ops and the notifiers.
 *
 * When tboot_force_iommu() reports that the IOMMU is mandatory (force_on),
 * every setup failure panics instead of returning.
 *
 * Returns 0 on success, -ENOMEM when the memory pool cannot be set up, the
 * error from init_dmars() when that fails, and -ENODEV for the remaining
 * early exits (including no_iommu / dmar_disabled).
 */
int __init intel_iommu_init(void)
{
        int ret = -ENODEV;
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;

        /* VT-d is required for a TXT/tboot launch, so enforce that */
        force_on = tboot_force_iommu();

        if (iommu_init_mempool()) {
                if (force_on)
                        panic("tboot: Failed to initialize iommu memory\n");
                return -ENOMEM;
        }

        /* dmar_global_lock is held for writing until init_dmars() is done. */
        down_write(&dmar_global_lock);
        if (dmar_table_init()) {
                if (force_on)
                        panic("tboot: Failed to initialize DMAR table\n");
                goto out_free_dmar;
        }

        if (dmar_dev_scope_init() < 0) {
                if (force_on)
                        panic("tboot: Failed to initialize DMAR device scope\n");
                goto out_free_dmar;
        }

        /* Disabled by configuration: undo the parsing and leave with -ENODEV. */
        if (no_iommu || dmar_disabled)
                goto out_free_dmar;

        if (list_empty(&dmar_rmrr_units))
                pr_info("No RMRR found\n");

        if (list_empty(&dmar_atsr_units))
                pr_info("No ATSR found\n");

        if (dmar_init_reserved_ranges()) {
                if (force_on)
                        panic("tboot: Failed to reserve iommu ranges\n");
                goto out_free_reserved_range;
        }

        init_no_remapping_devices();

        ret = init_dmars();
        if (ret) {
                if (force_on)
                        panic("tboot: Failed to initialize DMARs\n");
                pr_err("Initialization failed\n");
                goto out_free_reserved_range;
        }
        up_write(&dmar_global_lock);
        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

        init_timer(&unmap_timer);
#ifdef CONFIG_SWIOTLB
        swiotlb = 0;
#endif
        dma_ops = &intel_dma_ops;

        init_iommu_pm_ops();

        /*
         * Create one sysfs device per active IOMMU. The result is stored
         * without being checked for an error pointer here.
         */
        for_each_active_iommu(iommu, drhd)
                iommu->iommu_dev = iommu_device_create(NULL, iommu,
                                                       intel_iommu_groups,
                                                       "%s", iommu->name);

        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
        bus_register_notifier(&pci_bus_type, &device_nb);
        /*
         * The memory notifier is registered only when si_domain exists and
         * hardware pass-through is not in use.
         */
        if (si_domain && !hw_pass_through)
                register_memory_notifier(&intel_iommu_memory_nb);

        intel_iommu_enabled = 1;

        return 0;

out_free_reserved_range:
        put_iova_domain(&reserved_iova_list);
out_free_dmar:
        intel_iommu_free_dmars();
        up_write(&dmar_global_lock);
        iommu_exit_mempool();
        return ret;
}
4664
4665 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4666 {
4667         struct intel_iommu *iommu = opaque;
4668
4669         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4670         return 0;
4671 }
4672
4673 /*
4674  * NB - intel-iommu lacks any sort of reference counting for the users of
4675  * dependent devices.  If multiple endpoints have intersecting dependent
4676  * devices, unbinding the driver from any one of them will possibly leave
4677  * the others unable to operate.
4678  */
4679 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4680 {
4681         if (!iommu || !dev || !dev_is_pci(dev))
4682                 return;
4683
4684         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4685 }
4686
/*
 * Detach the device described by @info from its domain and free @info.
 *
 * Must be called with device_domain_lock held (asserted below). A NULL @info
 * triggers a WARN and is otherwise ignored.
 */
static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
        struct intel_iommu *iommu;
        unsigned long flags;

        assert_spin_locked(&device_domain_lock);

        if (WARN_ON(!info))
                return;

        iommu = info->iommu;

        /* Hardware state is only touched when a device is attached to @info. */
        if (info->dev) {
                iommu_disable_dev_iotlb(info);
                domain_context_clear(iommu, info->dev);
        }

        unlink_domain_info(info);

        /* Detaching the domain from the IOMMU is done under iommu->lock. */
        spin_lock_irqsave(&iommu->lock, flags);
        domain_detach_iommu(info->domain, iommu);
        spin_unlock_irqrestore(&iommu->lock, flags);

        free_devinfo_mem(info);
}
4712
4713 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4714                                      struct device *dev)
4715 {
4716         struct device_domain_info *info;
4717         unsigned long flags;
4718
4719         spin_lock_irqsave(&device_domain_lock, flags);
4720         info = dev->archdata.iommu;
4721         __dmar_remove_one_dev_info(info);
4722         spin_unlock_irqrestore(&device_domain_lock, flags);
4723 }
4724
4725 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4726 {
4727         int adjust_width;
4728
4729         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4730                         DMA_32BIT_PFN);
4731         domain_reserve_special_ranges(domain);
4732
4733         /* calculate AGAW */
4734         domain->gaw = guest_width;
4735         adjust_width = guestwidth_to_adjustwidth(guest_width);
4736         domain->agaw = width_to_agaw(adjust_width);
4737
4738         domain->iommu_coherency = 0;
4739         domain->iommu_snooping = 0;
4740         domain->iommu_superpage = 0;
4741         domain->max_addr = 0;
4742
4743         /* always allocate the top pgd */
4744         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4745         if (!domain->pgd)
4746                 return -ENOMEM;
4747         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4748         return 0;
4749 }
4750
4751 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4752 {
4753         struct dmar_domain *dmar_domain;
4754         struct iommu_domain *domain;
4755
4756         if (type != IOMMU_DOMAIN_UNMANAGED)
4757                 return NULL;
4758
4759         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4760         if (!dmar_domain) {
4761                 pr_err("Can't allocate dmar_domain\n");
4762                 return NULL;
4763         }
4764         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4765                 pr_err("Domain initialization failed\n");
4766                 domain_exit(dmar_domain);
4767                 return NULL;
4768         }
4769         domain_update_iommu_cap(dmar_domain);
4770
4771         domain = &dmar_domain->domain;
4772         domain->geometry.aperture_start = 0;
4773         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4774         domain->geometry.force_aperture = true;
4775
4776         return domain;
4777 }
4778
/* Tear down the dmar_domain that embeds @domain via domain_exit(). */
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        domain_exit(dmar_domain);
}
4783
4784 static int intel_iommu_attach_device(struct iommu_domain *domain,
4785                                      struct device *dev)
4786 {
4787         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4788         struct intel_iommu *iommu;
4789         int addr_width;
4790         u8 bus, devfn;
4791
4792         if (device_is_rmrr_locked(dev)) {
4793                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4794                 return -EPERM;
4795         }
4796
4797         /* normally dev is not mapped */
4798         if (unlikely(domain_context_mapped(dev))) {
4799                 struct dmar_domain *old_domain;
4800
4801                 old_domain = find_domain(dev);
4802                 if (old_domain) {
4803                         rcu_read_lock();
4804                         dmar_remove_one_dev_info(old_domain, dev);
4805                         rcu_read_unlock();
4806
4807                         if (!domain_type_is_vm_or_si(old_domain) &&
4808                              list_empty(&old_domain->devices))
4809                                 domain_exit(old_domain);
4810                 }
4811         }
4812
4813         iommu = device_to_iommu(dev, &bus, &devfn);
4814         if (!iommu)
4815                 return -ENODEV;
4816
4817         /* check if this iommu agaw is sufficient for max mapped address */
4818         addr_width = agaw_to_width(iommu->agaw);
4819         if (addr_width > cap_mgaw(iommu->cap))
4820                 addr_width = cap_mgaw(iommu->cap);
4821
4822         if (dmar_domain->max_addr > (1LL << addr_width)) {
4823                 pr_err("%s: iommu width (%d) is not "
4824                        "sufficient for the mapped address (%llx)\n",
4825                        __func__, addr_width, dmar_domain->max_addr);
4826                 return -EFAULT;
4827         }
4828         dmar_domain->gaw = addr_width;
4829
4830         /*
4831          * Knock out extra levels of page tables if necessary
4832          */
4833         while (iommu->agaw < dmar_domain->agaw) {
4834                 struct dma_pte *pte;
4835
4836                 pte = dmar_domain->pgd;
4837                 if (dma_pte_present(pte)) {
4838                         dmar_domain->pgd = (struct dma_pte *)
4839                                 phys_to_virt(dma_pte_addr(pte));
4840                         free_pgtable_page(pte);
4841                 }
4842                 dmar_domain->agaw--;
4843         }
4844
4845         return domain_add_dev_info(dmar_domain, dev);
4846 }
4847
/* Detach @dev from @domain by removing its device-domain info. */
static void intel_iommu_detach_device(struct iommu_domain *domain,
                                      struct device *dev)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        dmar_remove_one_dev_info(dmar_domain, dev);
}
4853
4854 static int intel_iommu_map(struct iommu_domain *domain,
4855                            unsigned long iova, phys_addr_t hpa,
4856                            size_t size, int iommu_prot)
4857 {
4858         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4859         u64 max_addr;
4860         int prot = 0;
4861         int ret;
4862
4863         if (iommu_prot & IOMMU_READ)
4864                 prot |= DMA_PTE_READ;
4865         if (iommu_prot & IOMMU_WRITE)
4866                 prot |= DMA_PTE_WRITE;
4867         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4868                 prot |= DMA_PTE_SNP;
4869
4870         max_addr = iova + size;
4871         if (dmar_domain->max_addr < max_addr) {
4872                 u64 end;
4873
4874                 /* check if minimum agaw is sufficient for mapped address */
4875                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4876                 if (end < max_addr) {
4877                         pr_err("%s: iommu width (%d) is not "
4878                                "sufficient for the mapped address (%llx)\n",
4879                                __func__, dmar_domain->gaw, max_addr);
4880                         return -EFAULT;
4881                 }
4882                 dmar_domain->max_addr = max_addr;
4883         }
4884         /* Round up size to next multiple of PAGE_SIZE, if it and
4885            the low bits of hpa would take us onto the next page */
4886         size = aligned_nrpages(hpa, size);
4887         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4888                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4889         return ret;
4890 }
4891
4892 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4893                                 unsigned long iova, size_t size)
4894 {
4895         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4896         struct page *freelist = NULL;
4897         struct intel_iommu *iommu;
4898         unsigned long start_pfn, last_pfn;
4899         unsigned int npages;
4900         int iommu_id, level = 0;
4901
4902         /* Cope with horrid API which requires us to unmap more than the
4903            size argument if it happens to be a large-page mapping. */
4904         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4905
4906         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4907                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4908
4909         start_pfn = iova >> VTD_PAGE_SHIFT;
4910         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4911
4912         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4913
4914         npages = last_pfn - start_pfn + 1;
4915
4916         for_each_domain_iommu(iommu_id, dmar_domain) {
4917                 iommu = g_iommus[iommu_id];
4918
4919                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4920                                       start_pfn, npages, !freelist, 0);
4921         }
4922
4923         dma_free_pagelist(freelist);
4924
4925         if (dmar_domain->max_addr == iova + size)
4926                 dmar_domain->max_addr = iova;
4927
4928         return size;
4929 }
4930
4931 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4932                                             dma_addr_t iova)
4933 {
4934         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4935         struct dma_pte *pte;
4936         int level = 0;
4937         u64 phys = 0;
4938
4939         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4940         if (pte)
4941                 phys = dma_pte_addr(pte);
4942
4943         return phys;
4944 }
4945
4946 static bool intel_iommu_capable(enum iommu_cap cap)
4947 {
4948         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4949                 return domain_update_iommu_snooping(NULL) == 1;
4950         if (cap == IOMMU_CAP_INTR_REMAP)
4951                 return irq_remapping_enabled == 1;
4952
4953         return false;
4954 }
4955
4956 static int intel_iommu_add_device(struct device *dev)
4957 {
4958         struct intel_iommu *iommu;
4959         struct iommu_group *group;
4960         u8 bus, devfn;
4961
4962         iommu = device_to_iommu(dev, &bus, &devfn);
4963         if (!iommu)
4964                 return -ENODEV;
4965
4966         iommu_device_link(iommu->iommu_dev, dev);
4967
4968         group = iommu_group_get_for_dev(dev);
4969
4970         if (IS_ERR(group))
4971                 return PTR_ERR(group);
4972
4973         iommu_group_put(group);
4974         return 0;
4975 }
4976
4977 static void intel_iommu_remove_device(struct device *dev)
4978 {
4979         struct intel_iommu *iommu;
4980         u8 bus, devfn;
4981
4982         iommu = device_to_iommu(dev, &bus, &devfn);
4983         if (!iommu)
4984                 return;
4985
4986         iommu_group_remove_device(dev);
4987
4988         iommu_device_unlink(iommu->iommu_dev, dev);
4989 }
4990
4991 #ifdef CONFIG_INTEL_IOMMU_SVM
/*
 * Enable PASID handling for the device behind @sdev on @iommu.
 *
 * Fills in sdev->did and sdev->sid, sets up the PASID fields of the
 * device's context entry if CONTEXT_PASIDE is not set yet, and records the
 * device-IOTLB parameters in @sdev when ATS is enabled for the device.
 *
 * Runs with device_domain_lock (interrupts disabled) and iommu->lock held.
 *
 * Returns 0 on success and -EINVAL when no domain can be obtained for the
 * device, when the device has no info or does not support PASID, or when
 * its context entry cannot be found.
 */
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
        struct device_domain_info *info;
        struct context_entry *context;
        struct dmar_domain *domain;
        unsigned long flags;
        u64 ctx_lo;
        int ret;

        domain = get_valid_domain_for_dev(sdev->dev);
        if (!domain)
                return -EINVAL;

        spin_lock_irqsave(&device_domain_lock, flags);
        spin_lock(&iommu->lock);

        ret = -EINVAL;
        info = sdev->dev->archdata.iommu;
        if (!info || !info->pasid_supported)
                goto out;

        context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
        if (WARN_ON(!context))
                goto out;

        ctx_lo = context[0].lo;

        sdev->did = domain->iommu_did[iommu->seq_id];
        sdev->sid = PCI_DEVID(info->bus, info->devfn);

        if (!(ctx_lo & CONTEXT_PASIDE)) {
                /*
                 * Fill the second half of the entry first and order those
                 * writes before context[0].lo is updated below.
                 * NOTE(review): pasid_state_table is only tested for NULL
                 * further down, after virt_to_phys() has been applied to it
                 * here - confirm that this is harmless.
                 */
                context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
                context[1].lo = (u64)virt_to_phys(iommu->pasid_table) | ecap_pss(iommu->ecap);
                wmb();
                /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
                 * extended to permit requests-with-PASID if the PASIDE bit
                 * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
                 * however, the PASIDE bit is ignored and requests-with-PASID
                 * are unconditionally blocked. Which makes less sense.
                 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
                 * "guest mode" translation types depending on whether ATS
                 * is available or not. Annoyingly, we can't use the new
                 * modes *unless* PASIDE is set. */
                if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
                        ctx_lo &= ~CONTEXT_TT_MASK;
                        if (info->ats_supported)
                                ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
                        else
                                ctx_lo |= CONTEXT_TT_PT_PASID << 2;
                }
                ctx_lo |= CONTEXT_PASIDE;
                if (iommu->pasid_state_table)
                        ctx_lo |= CONTEXT_DINVE;
                if (info->pri_supported)
                        ctx_lo |= CONTEXT_PRS;
                context[0].lo = ctx_lo;
                wmb();
                /* Invalidate the cached context entry of this device. */
                iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
        }

        /* Enable PASID support in the device, if it wasn't already */
        if (!info->pasid_enabled)
                iommu_enable_dev_iotlb(info);

        if (info->ats_enabled) {
                sdev->dev_iotlb = 1;
                sdev->qdep = info->ats_qdep;
                /* Queue depths at or above the limit are recorded as 0. */
                if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
                        sdev->qdep = 0;
        }
        ret = 0;

 out:
        spin_unlock(&iommu->lock);
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return ret;
}
5072
5073 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5074 {
5075         struct intel_iommu *iommu;
5076         u8 bus, devfn;
5077
5078         if (iommu_dummy(dev)) {
5079                 dev_warn(dev,
5080                          "No IOMMU translation for device; cannot enable SVM\n");
5081                 return NULL;
5082         }
5083
5084         iommu = device_to_iommu(dev, &bus, &devfn);
5085         if ((!iommu)) {
5086                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5087                 return NULL;
5088         }
5089
5090         if (!iommu->pasid_table) {
5091                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5092                 return NULL;
5093         }
5094
5095         return iommu;
5096 }
5097 #endif /* CONFIG_INTEL_IOMMU_SVM */
5098
5099 static const struct iommu_ops intel_iommu_ops = {
5100         .capable        = intel_iommu_capable,
5101         .domain_alloc   = intel_iommu_domain_alloc,
5102         .domain_free    = intel_iommu_domain_free,
5103         .attach_dev     = intel_iommu_attach_device,
5104         .detach_dev     = intel_iommu_detach_device,
5105         .map            = intel_iommu_map,
5106         .unmap          = intel_iommu_unmap,
5107         .map_sg         = default_iommu_map_sg,
5108         .iova_to_phys   = intel_iommu_iova_to_phys,
5109         .add_device     = intel_iommu_add_device,
5110         .remove_device  = intel_iommu_remove_device,
5111         .device_group   = pci_device_group,
5112         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
5113 };
5114
5115 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5116 {
5117         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5118         pr_info("Disabling IOMMU for graphics on this chipset\n");
5119         dmar_map_gfx = 0;
5120 }
5121
5122 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5123 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5124 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5125 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5126 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5127 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5128 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5129
5130 static void quirk_iommu_rwbf(struct pci_dev *dev)
5131 {
5132         /*
5133          * Mobile 4 Series Chipset neglects to set RWBF capability,
5134          * but needs it. Same seems to hold for the desktop versions.
5135          */
5136         pr_info("Forcing write-buffer flush capability\n");
5137         rwbf_quirk = 1;
5138 }
5139
5140 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5141 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5142 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5143 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5144 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5145 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5147
5148 #define GGC 0x52
5149 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5150 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5151 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5152 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5153 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5154 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5155 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5156 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5157
5158 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5159 {
5160         unsigned short ggc;
5161
5162         if (pci_read_config_word(dev, GGC, &ggc))
5163                 return;
5164
5165         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5166                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5167                 dmar_map_gfx = 0;
5168         } else if (dmar_map_gfx) {
5169                 /* we have to ensure the gfx device is idle before we flush */
5170                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5171                 intel_iommu_strict = 1;
5172        }
5173 }
5174 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5175 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5176 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5177 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5178
5179 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5180    ISOCH DMAR unit for the Azalia sound device, but not give it any
5181    TLB entries, which causes it to deadlock. Check for that.  We do
5182    this in a function called from init_dmars(), instead of in a PCI
5183    quirk, because we don't want to print the obnoxious "BIOS broken"
5184    message if VT-d is actually disabled.
5185 */
5186 static void __init check_tylersburg_isoch(void)
5187 {
5188         struct pci_dev *pdev;
5189         uint32_t vtisochctrl;
5190
5191         /* If there's no Azalia in the system anyway, forget it. */
5192         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5193         if (!pdev)
5194                 return;
5195         pci_dev_put(pdev);
5196
5197         /* System Management Registers. Might be hidden, in which case
5198            we can't do the sanity check. But that's OK, because the
5199            known-broken BIOSes _don't_ actually hide it, so far. */
5200         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5201         if (!pdev)
5202                 return;
5203
5204         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5205                 pci_dev_put(pdev);
5206                 return;
5207         }
5208
5209         pci_dev_put(pdev);
5210
5211         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5212         if (vtisochctrl & 1)
5213                 return;
5214
5215         /* Drop all bits other than the number of TLB entries */
5216         vtisochctrl &= 0x1c;
5217
5218         /* If we have the recommended number of TLB entries (16), fine. */
5219         if (vtisochctrl == 0x10)
5220                 return;
5221
5222         /* Zero TLB entries? You get to ride the short bus to school. */
5223         if (!vtisochctrl) {
5224                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5225                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5226                      dmi_get_system_info(DMI_BIOS_VENDOR),
5227                      dmi_get_system_info(DMI_BIOS_VERSION),
5228                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5229                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5230                 return;
5231         }
5232
5233         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5234                vtisochctrl);
5235 }