1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <linux/dma-contiguous.h>
46 #include <linux/crash_dump.h>
47 #include <asm/irq_remapping.h>
48 #include <asm/cacheflush.h>
49 #include <asm/iommu.h>
50
51 #include "irq_remapping.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
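/*
 * Worked example (illustrative, assuming VTD_PAGE_SHIFT == 12): with
 * gaw == DEFAULT_DOMAIN_ADDRESS_WIDTH (48), __DOMAIN_MAX_PFN(48) is
 * 2^36 - 1 and DOMAIN_MAX_ADDR(48) is 2^48 - 4KiB. On a 32-bit kernel
 * DOMAIN_MAX_PFN() clamps the PFN to ULONG_MAX so it still fits in an
 * unsigned long.
 */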
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
84 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
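/*
 * For instance, on x86 with 4KiB pages (PAGE_SHIFT == 12),
 * DMA_32BIT_PFN == 0xfffff and DMA_64BIT_PFN == 0xfffffffffffff:
 * the highest page frame numbers reachable through a 32-bit or
 * 64-bit DMA mask.
 */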
85
86 /* page table handling */
87 #define LEVEL_STRIDE            (9)
88 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
89
90 /*
91  * This bitmap is used to advertise the page sizes our hardware supports
92  * to the IOMMU core, which will then use this information to split
93  * physically contiguous memory regions it is mapping into page sizes
94  * that we support.
95  *
96  * Traditionally the IOMMU core just handed us the mappings directly,
97  * after making sure the size is an order of a 4KiB page and that the
98  * mapping has natural alignment.
99  *
100  * To retain this behavior, we currently advertise that we support
101  * all page sizes that are an order of 4KiB.
102  *
103  * If at some point we'd like to utilize the IOMMU core's new behavior,
104  * we could change this to advertise the real page sizes we support.
105  */
106 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
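/*
 * A set bit N in this bitmap tells the IOMMU core that 2^N bytes is a
 * supported page size, so ~0xFFFUL advertises 4KiB, 8KiB, 16KiB and
 * every larger power of two.
 */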
107
108 static inline int agaw_to_level(int agaw)
109 {
110         return agaw + 2;
111 }
112
113 static inline int agaw_to_width(int agaw)
114 {
115         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 }
117
118 static inline int width_to_agaw(int width)
119 {
120         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 }
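/*
 * Worked example for the three helpers above: width_to_agaw(48) ==
 * DIV_ROUND_UP(48 - 30, 9) == 2, agaw_to_level(2) == 4 (a four-level
 * page table), and agaw_to_width(2) == min(30 + 2 * 9, 64) == 48.
 */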
122
123 static inline unsigned int level_to_offset_bits(int level)
124 {
125         return (level - 1) * LEVEL_STRIDE;
126 }
127
128 static inline int pfn_level_offset(unsigned long pfn, int level)
129 {
130         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 }
132
133 static inline unsigned long level_mask(int level)
134 {
135         return -1UL << level_to_offset_bits(level);
136 }
137
138 static inline unsigned long level_size(int level)
139 {
140         return 1UL << level_to_offset_bits(level);
141 }
142
143 static inline unsigned long align_to_level(unsigned long pfn, int level)
144 {
145         return (pfn + level_size(level) - 1) & level_mask(level);
146 }
147
148 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
149 {
150         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 }
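/*
 * Example values, assuming VTD_PAGE_SHIFT == 12: at level 2,
 * level_to_offset_bits() == 9, level_size() == 512 PFNs and
 * lvl_to_nr_pages(2) == 512 4KiB pages == 2MiB, i.e. one level-2
 * (2MiB superpage) slot; pfn_level_offset(pfn, 2) picks bits 9-17 of
 * the PFN as the index into that level's table.
 */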
152
153 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
154    are never going to work. */
155 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
156 {
157         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 }
159
160 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
161 {
162         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
163 }
164 static inline unsigned long page_to_dma_pfn(struct page *pg)
165 {
166         return mm_to_dma_pfn(page_to_pfn(pg));
167 }
168 static inline unsigned long virt_to_dma_pfn(void *p)
169 {
170         return page_to_dma_pfn(virt_to_page(p));
171 }
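/*
 * On x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so the two
 * conversions above shift by zero and MM and DMA PFNs are identical;
 * the helpers only matter on configurations where MM pages are larger
 * than the 4KiB VT-d page.
 */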
172
173 /* global iommu list, set NULL for ignored DMAR units */
174 static struct intel_iommu **g_iommus;
175
176 static void __init check_tylersburg_isoch(void);
177 static int rwbf_quirk;
178
179 /*
180  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
181  * (used when the kernel is launched with TXT)
182  */
183 static int force_on = 0;
184
185 /*
186  * 0: Present
187  * 1-11: Reserved
188  * 12-63: Context Ptr (12 - (haw-1))
189  * 64-127: Reserved
190  */
191 struct root_entry {
192         u64     lo;
193         u64     hi;
194 };
195 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
196
197 /*
198  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_lctp(struct root_entry *re)
202 {
203         if (!(re->lo & 1))
204                 return 0;
205
206         return re->lo & VTD_PAGE_MASK;
207 }
208
209 /*
210  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
211  * if marked present.
212  */
213 static phys_addr_t root_entry_uctp(struct root_entry *re)
214 {
215         if (!(re->hi & 1))
216                 return 0;
217
218         return re->hi & VTD_PAGE_MASK;
219 }
220 /*
221  * low 64 bits:
222  * 0: present
223  * 1: fault processing disable
224  * 2-3: translation type
225  * 12-63: address space root
226  * high 64 bits:
227  * 0-2: address width
228  * 3-6: aval
229  * 8-23: domain id
230  */
231 struct context_entry {
232         u64 lo;
233         u64 hi;
234 };
235
236 static inline void context_clear_pasid_enable(struct context_entry *context)
237 {
238         context->lo &= ~(1ULL << 11);
239 }
240
241 static inline bool context_pasid_enabled(struct context_entry *context)
242 {
243         return !!(context->lo & (1ULL << 11));
244 }
245
246 static inline void context_set_copied(struct context_entry *context)
247 {
248         context->hi |= (1ull << 3);
249 }
250
251 static inline bool context_copied(struct context_entry *context)
252 {
253         return !!(context->hi & (1ULL << 3));
254 }
255
256 static inline bool __context_present(struct context_entry *context)
257 {
258         return (context->lo & 1);
259 }
260
261 static inline bool context_present(struct context_entry *context)
262 {
263         return context_pasid_enabled(context) ?
264              __context_present(context) :
265              __context_present(context) && !context_copied(context);
266 }
267
268 static inline void context_set_present(struct context_entry *context)
269 {
270         context->lo |= 1;
271 }
272
273 static inline void context_set_fault_enable(struct context_entry *context)
274 {
275         context->lo &= (((u64)-1) << 2) | 1;
276 }
277
278 static inline void context_set_translation_type(struct context_entry *context,
279                                                 unsigned long value)
280 {
281         context->lo &= (((u64)-1) << 4) | 3;
282         context->lo |= (value & 3) << 2;
283 }
284
285 static inline void context_set_address_root(struct context_entry *context,
286                                             unsigned long value)
287 {
288         context->lo &= ~VTD_PAGE_MASK;
289         context->lo |= value & VTD_PAGE_MASK;
290 }
291
292 static inline void context_set_address_width(struct context_entry *context,
293                                              unsigned long value)
294 {
295         context->hi |= value & 7;
296 }
297
298 static inline void context_set_domain_id(struct context_entry *context,
299                                          unsigned long value)
300 {
301         context->hi |= (value & ((1 << 16) - 1)) << 8;
302 }
303
304 static inline int context_domain_id(struct context_entry *c)
305 {
306         return((c->hi >> 8) & 0xffff);
307 }
308
309 static inline void context_clear_entry(struct context_entry *context)
310 {
311         context->lo = 0;
312         context->hi = 0;
313 }
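/*
 * Illustrative bit placement for the helpers above (not an exhaustive
 * description of the context-entry format): context_set_domain_id(c, 42)
 * stores 42 in bits 8-23 of c->hi, context_set_address_width(c, agaw)
 * stores the AGAW in bits 0-2 of c->hi, and context_set_address_root()
 * puts the 4KiB-aligned physical address of the page-table root in
 * bits 12-63 of c->lo.
 */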
314
315 /*
316  * 0: readable
317  * 1: writable
318  * 2-6: reserved
319  * 7: super page
320  * 8-10: available
321  * 11: snoop behavior
322  * 12-63: Host physical address
323  */
324 struct dma_pte {
325         u64 val;
326 };
327
328 static inline void dma_clear_pte(struct dma_pte *pte)
329 {
330         pte->val = 0;
331 }
332
333 static inline u64 dma_pte_addr(struct dma_pte *pte)
334 {
335 #ifdef CONFIG_64BIT
336         return pte->val & VTD_PAGE_MASK;
337 #else
338         /* Must have a full atomic 64-bit read */
339         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
340 #endif
341 }
342
343 static inline bool dma_pte_present(struct dma_pte *pte)
344 {
345         return (pte->val & 3) != 0;
346 }
347
348 static inline bool dma_pte_superpage(struct dma_pte *pte)
349 {
350         return (pte->val & DMA_PTE_LARGE_PAGE);
351 }
352
353 static inline int first_pte_in_page(struct dma_pte *pte)
354 {
355         return !((unsigned long)pte & ~VTD_PAGE_MASK);
356 }
357
358 /*
359  * This domain is a static identity mapping domain.
360  *      1. This domain creates a static 1:1 mapping to all usable memory.
361  *      2. It maps to each iommu if successful.
362  *      3. Each iommu maps to this domain if successful.
363  */
364 static struct dmar_domain *si_domain;
365 static int hw_pass_through = 1;
366
367 /*
368  * Domain represents a virtual machine; more than one device
369  * across iommus may be owned by one domain, e.g. a kvm guest.
370  */
371 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
372
373 /* si_domain contains multiple devices */
374 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
375
376 #define for_each_domain_iommu(idx, domain)                      \
377         for (idx = 0; idx < g_num_of_iommus; idx++)             \
378                 if (domain->iommu_refcnt[idx])
379
380 struct dmar_domain {
381         int     nid;                    /* node id */
382
383         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
384                                         /* Refcount of devices per iommu */
385
386
387         u16             iommu_did[DMAR_UNITS_SUPPORTED];
388                                         /* Domain ids per IOMMU. Use u16 since
389                                          * domain ids are 16 bit wide according
390                                          * to VT-d spec, section 9.3 */
391
392         struct list_head devices;       /* all devices' list */
393         struct iova_domain iovad;       /* iova's that belong to this domain */
394
395         struct dma_pte  *pgd;           /* virtual address */
396         int             gaw;            /* max guest address width */
397
398         /* adjusted guest address width, 0 is level 2 30-bit */
399         int             agaw;
400
401         int             flags;          /* flags to find out type of domain */
402
403         int             iommu_coherency;/* indicate coherency of iommu access */
404         int             iommu_snooping; /* indicate snooping control feature*/
405         int             iommu_count;    /* reference count of iommu */
406         int             iommu_superpage;/* Level of superpages supported:
407                                            0 == 4KiB (no superpages), 1 == 2MiB,
408                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
409         u64             max_addr;       /* maximum mapped address */
410
411         struct iommu_domain domain;     /* generic domain data structure for
412                                            iommu core */
413 };
414
415 /* PCI domain-device relationship */
416 struct device_domain_info {
417         struct list_head link;  /* link to domain siblings */
418         struct list_head global; /* link to global list */
419         u8 bus;                 /* PCI bus number */
420         u8 devfn;               /* PCI devfn number */
421         struct {
422                 u8 enabled:1;
423                 u8 qdep;
424         } ats;                  /* ATS state */
425         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
426         struct intel_iommu *iommu; /* IOMMU used by this device */
427         struct dmar_domain *domain; /* pointer to domain */
428 };
429
430 struct dmar_rmrr_unit {
431         struct list_head list;          /* list of rmrr units   */
432         struct acpi_dmar_header *hdr;   /* ACPI header          */
433         u64     base_address;           /* reserved base address*/
434         u64     end_address;            /* reserved end address */
435         struct dmar_dev_scope *devices; /* target devices */
436         int     devices_cnt;            /* target device count */
437 };
438
439 struct dmar_atsr_unit {
440         struct list_head list;          /* list of ATSR units */
441         struct acpi_dmar_header *hdr;   /* ACPI header */
442         struct dmar_dev_scope *devices; /* target devices */
443         int devices_cnt;                /* target device count */
444         u8 include_all:1;               /* include all ports */
445 };
446
447 static LIST_HEAD(dmar_atsr_units);
448 static LIST_HEAD(dmar_rmrr_units);
449
450 #define for_each_rmrr_units(rmrr) \
451         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
452
453 static void flush_unmaps_timeout(unsigned long data);
454
455 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
456
457 #define HIGH_WATER_MARK 250
458 struct deferred_flush_tables {
459         int next;
460         struct iova *iova[HIGH_WATER_MARK];
461         struct dmar_domain *domain[HIGH_WATER_MARK];
462         struct page *freelist[HIGH_WATER_MARK];
463 };
464
465 static struct deferred_flush_tables *deferred_flush;
466
467 /* bitmap for indexing intel_iommus */
468 static int g_num_of_iommus;
469
470 static DEFINE_SPINLOCK(async_umap_flush_lock);
471 static LIST_HEAD(unmaps_to_do);
472
473 static int timer_on;
474 static long list_size;
475
476 static void domain_exit(struct dmar_domain *domain);
477 static void domain_remove_dev_info(struct dmar_domain *domain);
478 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
479                                      struct device *dev);
480 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
481 static void domain_context_clear(struct intel_iommu *iommu,
482                                  struct device *dev);
483 static int domain_detach_iommu(struct dmar_domain *domain,
484                                struct intel_iommu *iommu);
485
486 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
487 int dmar_disabled = 0;
488 #else
489 int dmar_disabled = 1;
490 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
491
492 int intel_iommu_enabled = 0;
493 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
494
495 static int dmar_map_gfx = 1;
496 static int dmar_forcedac;
497 static int intel_iommu_strict;
498 static int intel_iommu_superpage = 1;
499 static int intel_iommu_ecs = 1;
500
501 /* We only actually use ECS when PASID support (on the new bit 40)
502  * is also advertised. Some early implementations — the ones with
503  * PASID support on bit 28 — have issues even when we *only* use
504  * extended root/context tables. */
505 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
506                             ecap_pasid(iommu->ecap))
507
508 int intel_iommu_gfx_mapped;
509 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
510
511 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
512 static DEFINE_SPINLOCK(device_domain_lock);
513 static LIST_HEAD(device_domain_list);
514
515 static const struct iommu_ops intel_iommu_ops;
516
517 static bool translation_pre_enabled(struct intel_iommu *iommu)
518 {
519         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
520 }
521
522 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
523 {
524         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
525 }
526
527 static void init_translation_status(struct intel_iommu *iommu)
528 {
529         u32 gsts;
530
531         gsts = readl(iommu->reg + DMAR_GSTS_REG);
532         if (gsts & DMA_GSTS_TES)
533                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
534 }
535
536 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
537 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
538 {
539         return container_of(dom, struct dmar_domain, domain);
540 }
541
542 static int __init intel_iommu_setup(char *str)
543 {
544         if (!str)
545                 return -EINVAL;
546         while (*str) {
547                 if (!strncmp(str, "on", 2)) {
548                         dmar_disabled = 0;
549                         pr_info("IOMMU enabled\n");
550                 } else if (!strncmp(str, "off", 3)) {
551                         dmar_disabled = 1;
552                         pr_info("IOMMU disabled\n");
553                 } else if (!strncmp(str, "igfx_off", 8)) {
554                         dmar_map_gfx = 0;
555                         pr_info("Disable GFX device mapping\n");
556                 } else if (!strncmp(str, "forcedac", 8)) {
557                         pr_info("Forcing DAC for PCI devices\n");
558                         dmar_forcedac = 1;
559                 } else if (!strncmp(str, "strict", 6)) {
560                         pr_info("Disable batched IOTLB flush\n");
561                         intel_iommu_strict = 1;
562                 } else if (!strncmp(str, "sp_off", 6)) {
563                         pr_info("Disable supported super page\n");
564                         intel_iommu_superpage = 0;
565                 } else if (!strncmp(str, "ecs_off", 7)) {
566                         printk(KERN_INFO
567                                 "Intel-IOMMU: disable extended context table support\n");
568                         intel_iommu_ecs = 0;
569                 }
570
571                 str += strcspn(str, ",");
572                 while (*str == ',')
573                         str++;
574         }
575         return 0;
576 }
577 __setup("intel_iommu=", intel_iommu_setup);
578
579 static struct kmem_cache *iommu_domain_cache;
580 static struct kmem_cache *iommu_devinfo_cache;
581
582 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
583 {
584         struct dmar_domain **domains;
585         int idx = did >> 8;
586
587         domains = iommu->domains[idx];
588         if (!domains)
589                 return NULL;
590
591         return domains[did & 0xff];
592 }
593
594 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
595                              struct dmar_domain *domain)
596 {
597         struct dmar_domain **domains;
598         int idx = did >> 8;
599
600         if (!iommu->domains[idx]) {
601                 size_t size = 256 * sizeof(struct dmar_domain *);
602                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
603         }
604
605         domains = iommu->domains[idx];
606         if (WARN_ON(!domains))
607                 return;
608
609         domains[did & 0xff] = domain;
610 }
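/*
 * The two helpers above implement a lazily allocated two-level lookup
 * table: iommu->domains[did >> 8] points to a 256-entry page of
 * dmar_domain pointers indexed by did & 0xff. For example, domain id
 * 0x0123 lands in page 0x01, slot 0x23.
 */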
611
612 static inline void *alloc_pgtable_page(int node)
613 {
614         struct page *page;
615         void *vaddr = NULL;
616
617         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
618         if (page)
619                 vaddr = page_address(page);
620         return vaddr;
621 }
622
623 static inline void free_pgtable_page(void *vaddr)
624 {
625         free_page((unsigned long)vaddr);
626 }
627
628 static inline void *alloc_domain_mem(void)
629 {
630         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
631 }
632
633 static void free_domain_mem(void *vaddr)
634 {
635         kmem_cache_free(iommu_domain_cache, vaddr);
636 }
637
638 static inline void * alloc_devinfo_mem(void)
639 {
640         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
641 }
642
643 static inline void free_devinfo_mem(void *vaddr)
644 {
645         kmem_cache_free(iommu_devinfo_cache, vaddr);
646 }
647
648 static inline int domain_type_is_vm(struct dmar_domain *domain)
649 {
650         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
651 }
652
653 static inline int domain_type_is_si(struct dmar_domain *domain)
654 {
655         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
656 }
657
658 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
659 {
660         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
661                                 DOMAIN_FLAG_STATIC_IDENTITY);
662 }
663
664 static inline int domain_pfn_supported(struct dmar_domain *domain,
665                                        unsigned long pfn)
666 {
667         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
668
669         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
670 }
671
672 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
673 {
674         unsigned long sagaw;
675         int agaw = -1;
676
677         sagaw = cap_sagaw(iommu->cap);
678         for (agaw = width_to_agaw(max_gaw);
679              agaw >= 0; agaw--) {
680                 if (test_bit(agaw, &sagaw))
681                         break;
682         }
683
684         return agaw;
685 }
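/*
 * Example (with an assumed capability value): for max_gaw == 48 the
 * search starts at agaw == width_to_agaw(48) == 2 and walks downwards
 * until a bit set in cap_sagaw() is found; if the hardware reports
 * sagaw == 0x4 (only bit 2 set), the result is agaw 2, i.e. a
 * four-level page table covering 48 bits.
 */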
686
687 /*
688  * Calculate max SAGAW for each iommu.
689  */
690 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
691 {
692         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
693 }
694
695 /*
696  * Calculate agaw for each iommu.
697  * "SAGAW" may be different across iommus, so use a default agaw and
698  * fall back to a smaller supported agaw for iommus that don't support it.
699  */
700 int iommu_calculate_agaw(struct intel_iommu *iommu)
701 {
702         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
703 }
704
705 /* This function only returns a single iommu in a domain */
706 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
707 {
708         int iommu_id;
709
710         /* si_domain and vm domain should not get here. */
711         BUG_ON(domain_type_is_vm_or_si(domain));
712         for_each_domain_iommu(iommu_id, domain)
713                 break;
714
715         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
716                 return NULL;
717
718         return g_iommus[iommu_id];
719 }
720
721 static void domain_update_iommu_coherency(struct dmar_domain *domain)
722 {
723         struct dmar_drhd_unit *drhd;
724         struct intel_iommu *iommu;
725         bool found = false;
726         int i;
727
728         domain->iommu_coherency = 1;
729
730         for_each_domain_iommu(i, domain) {
731                 found = true;
732                 if (!ecap_coherent(g_iommus[i]->ecap)) {
733                         domain->iommu_coherency = 0;
734                         break;
735                 }
736         }
737         if (found)
738                 return;
739
740         /* No hardware attached; use lowest common denominator */
741         rcu_read_lock();
742         for_each_active_iommu(iommu, drhd) {
743                 if (!ecap_coherent(iommu->ecap)) {
744                         domain->iommu_coherency = 0;
745                         break;
746                 }
747         }
748         rcu_read_unlock();
749 }
750
751 static int domain_update_iommu_snooping(struct intel_iommu *skip)
752 {
753         struct dmar_drhd_unit *drhd;
754         struct intel_iommu *iommu;
755         int ret = 1;
756
757         rcu_read_lock();
758         for_each_active_iommu(iommu, drhd) {
759                 if (iommu != skip) {
760                         if (!ecap_sc_support(iommu->ecap)) {
761                                 ret = 0;
762                                 break;
763                         }
764                 }
765         }
766         rcu_read_unlock();
767
768         return ret;
769 }
770
771 static int domain_update_iommu_superpage(struct intel_iommu *skip)
772 {
773         struct dmar_drhd_unit *drhd;
774         struct intel_iommu *iommu;
775         int mask = 0xf;
776
777         if (!intel_iommu_superpage) {
778                 return 0;
779         }
780
781         /* set iommu_superpage to the smallest common denominator */
782         rcu_read_lock();
783         for_each_active_iommu(iommu, drhd) {
784                 if (iommu != skip) {
785                         mask &= cap_super_page_val(iommu->cap);
786                         if (!mask)
787                                 break;
788                 }
789         }
790         rcu_read_unlock();
791
792         return fls(mask);
793 }
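/*
 * For example, if one active iommu reports cap_super_page_val() == 0x3
 * (2MiB and 1GiB) and another reports 0x1 (2MiB only), the common mask
 * is 0x1 and fls(0x1) == 1, i.e. the domain is limited to 2MiB
 * superpages (see the iommu_superpage levels in struct dmar_domain).
 */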
794
795 /* Some capabilities may be different across iommus */
796 static void domain_update_iommu_cap(struct dmar_domain *domain)
797 {
798         domain_update_iommu_coherency(domain);
799         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
800         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
801 }
802
803 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
804                                                        u8 bus, u8 devfn, int alloc)
805 {
806         struct root_entry *root = &iommu->root_entry[bus];
807         struct context_entry *context;
808         u64 *entry;
809
810         entry = &root->lo;
811         if (ecs_enabled(iommu)) {
812                 if (devfn >= 0x80) {
813                         devfn -= 0x80;
814                         entry = &root->hi;
815                 }
816                 devfn *= 2;
817         }
818         if (*entry & 1)
819                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
820         else {
821                 unsigned long phy_addr;
822                 if (!alloc)
823                         return NULL;
824
825                 context = alloc_pgtable_page(iommu->node);
826                 if (!context)
827                         return NULL;
828
829                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
830                 phy_addr = virt_to_phys((void *)context);
831                 *entry = phy_addr | 1;
832                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
833         }
834         return &context[devfn];
835 }
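/*
 * With extended context tables (ecs_enabled()), each root entry covers
 * devfns 0x00-0x7f through root->lo and 0x80-0xff through root->hi,
 * and extended context entries are 256 bits wide, occupying two
 * consecutive struct context_entry slots, hence the devfn doubling
 * above. For example, devfn 0x85 selects root->hi and slot index
 * (0x85 - 0x80) * 2 == 0x0a in that context table.
 */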
836
837 static int iommu_dummy(struct device *dev)
838 {
839         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
840 }
841
842 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
843 {
844         struct dmar_drhd_unit *drhd = NULL;
845         struct intel_iommu *iommu;
846         struct device *tmp;
847         struct pci_dev *ptmp, *pdev = NULL;
848         u16 segment = 0;
849         int i;
850
851         if (iommu_dummy(dev))
852                 return NULL;
853
854         if (dev_is_pci(dev)) {
855                 pdev = to_pci_dev(dev);
856                 segment = pci_domain_nr(pdev->bus);
857         } else if (has_acpi_companion(dev))
858                 dev = &ACPI_COMPANION(dev)->dev;
859
860         rcu_read_lock();
861         for_each_active_iommu(iommu, drhd) {
862                 if (pdev && segment != drhd->segment)
863                         continue;
864
865                 for_each_active_dev_scope(drhd->devices,
866                                           drhd->devices_cnt, i, tmp) {
867                         if (tmp == dev) {
868                                 *bus = drhd->devices[i].bus;
869                                 *devfn = drhd->devices[i].devfn;
870                                 goto out;
871                         }
872
873                         if (!pdev || !dev_is_pci(tmp))
874                                 continue;
875
876                         ptmp = to_pci_dev(tmp);
877                         if (ptmp->subordinate &&
878                             ptmp->subordinate->number <= pdev->bus->number &&
879                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
880                                 goto got_pdev;
881                 }
882
883                 if (pdev && drhd->include_all) {
884                 got_pdev:
885                         *bus = pdev->bus->number;
886                         *devfn = pdev->devfn;
887                         goto out;
888                 }
889         }
890         iommu = NULL;
891  out:
892         rcu_read_unlock();
893
894         return iommu;
895 }
896
897 static void domain_flush_cache(struct dmar_domain *domain,
898                                void *addr, int size)
899 {
900         if (!domain->iommu_coherency)
901                 clflush_cache_range(addr, size);
902 }
903
904 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
905 {
906         struct context_entry *context;
907         int ret = 0;
908         unsigned long flags;
909
910         spin_lock_irqsave(&iommu->lock, flags);
911         context = iommu_context_addr(iommu, bus, devfn, 0);
912         if (context)
913                 ret = context_present(context);
914         spin_unlock_irqrestore(&iommu->lock, flags);
915         return ret;
916 }
917
918 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
919 {
920         struct context_entry *context;
921         unsigned long flags;
922
923         spin_lock_irqsave(&iommu->lock, flags);
924         context = iommu_context_addr(iommu, bus, devfn, 0);
925         if (context) {
926                 context_clear_entry(context);
927                 __iommu_flush_cache(iommu, context, sizeof(*context));
928         }
929         spin_unlock_irqrestore(&iommu->lock, flags);
930 }
931
932 static void free_context_table(struct intel_iommu *iommu)
933 {
934         int i;
935         unsigned long flags;
936         struct context_entry *context;
937
938         spin_lock_irqsave(&iommu->lock, flags);
939         if (!iommu->root_entry) {
940                 goto out;
941         }
942         for (i = 0; i < ROOT_ENTRY_NR; i++) {
943                 context = iommu_context_addr(iommu, i, 0, 0);
944                 if (context)
945                         free_pgtable_page(context);
946
947                 if (!ecs_enabled(iommu))
948                         continue;
949
950                 context = iommu_context_addr(iommu, i, 0x80, 0);
951                 if (context)
952                         free_pgtable_page(context);
953
954         }
955         free_pgtable_page(iommu->root_entry);
956         iommu->root_entry = NULL;
957 out:
958         spin_unlock_irqrestore(&iommu->lock, flags);
959 }
960
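/*
 * A note on the *target_level argument of pfn_to_dma_pte() below: a
 * non-zero value asks for the PTE at exactly that level, allocating any
 * missing intermediate tables on the way down; a value of zero means
 * "stop at the first non-present or superpage entry (or the last
 * level)", and the level actually reached is written back through the
 * pointer.
 */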
961 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
962                                       unsigned long pfn, int *target_level)
963 {
964         struct dma_pte *parent, *pte = NULL;
965         int level = agaw_to_level(domain->agaw);
966         int offset;
967
968         BUG_ON(!domain->pgd);
969
970         if (!domain_pfn_supported(domain, pfn))
971                 /* Address beyond IOMMU's addressing capabilities. */
972                 return NULL;
973
974         parent = domain->pgd;
975
976         while (1) {
977                 void *tmp_page;
978
979                 offset = pfn_level_offset(pfn, level);
980                 pte = &parent[offset];
981                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
982                         break;
983                 if (level == *target_level)
984                         break;
985
986                 if (!dma_pte_present(pte)) {
987                         uint64_t pteval;
988
989                         tmp_page = alloc_pgtable_page(domain->nid);
990
991                         if (!tmp_page)
992                                 return NULL;
993
994                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
995                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
996                         if (cmpxchg64(&pte->val, 0ULL, pteval))
997                                 /* Someone else set it while we were thinking; use theirs. */
998                                 free_pgtable_page(tmp_page);
999                         else
1000                                 domain_flush_cache(domain, pte, sizeof(*pte));
1001                 }
1002                 if (level == 1)
1003                         break;
1004
1005                 parent = phys_to_virt(dma_pte_addr(pte));
1006                 level--;
1007         }
1008
1009         if (!*target_level)
1010                 *target_level = level;
1011
1012         return pte;
1013 }
1014
1015
1016 /* return the address's pte at a specific level */
1017 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1018                                          unsigned long pfn,
1019                                          int level, int *large_page)
1020 {
1021         struct dma_pte *parent, *pte = NULL;
1022         int total = agaw_to_level(domain->agaw);
1023         int offset;
1024
1025         parent = domain->pgd;
1026         while (level <= total) {
1027                 offset = pfn_level_offset(pfn, total);
1028                 pte = &parent[offset];
1029                 if (level == total)
1030                         return pte;
1031
1032                 if (!dma_pte_present(pte)) {
1033                         *large_page = total;
1034                         break;
1035                 }
1036
1037                 if (dma_pte_superpage(pte)) {
1038                         *large_page = total;
1039                         return pte;
1040                 }
1041
1042                 parent = phys_to_virt(dma_pte_addr(pte));
1043                 total--;
1044         }
1045         return NULL;
1046 }
1047
1048 /* clear last level pte, a tlb flush should follow */
1049 static void dma_pte_clear_range(struct dmar_domain *domain,
1050                                 unsigned long start_pfn,
1051                                 unsigned long last_pfn)
1052 {
1053         unsigned int large_page = 1;
1054         struct dma_pte *first_pte, *pte;
1055
1056         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1057         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1058         BUG_ON(start_pfn > last_pfn);
1059
1060         /* we don't need a lock here; nobody else touches the iova range */
1061         do {
1062                 large_page = 1;
1063                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1064                 if (!pte) {
1065                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1066                         continue;
1067                 }
1068                 do {
1069                         dma_clear_pte(pte);
1070                         start_pfn += lvl_to_nr_pages(large_page);
1071                         pte++;
1072                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1073
1074                 domain_flush_cache(domain, first_pte,
1075                                    (void *)pte - (void *)first_pte);
1076
1077         } while (start_pfn && start_pfn <= last_pfn);
1078 }
1079
1080 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1081                                struct dma_pte *pte, unsigned long pfn,
1082                                unsigned long start_pfn, unsigned long last_pfn)
1083 {
1084         pfn = max(start_pfn, pfn);
1085         pte = &pte[pfn_level_offset(pfn, level)];
1086
1087         do {
1088                 unsigned long level_pfn;
1089                 struct dma_pte *level_pte;
1090
1091                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1092                         goto next;
1093
1094                 level_pfn = pfn & level_mask(level - 1);
1095                 level_pte = phys_to_virt(dma_pte_addr(pte));
1096
1097                 if (level > 2)
1098                         dma_pte_free_level(domain, level - 1, level_pte,
1099                                            level_pfn, start_pfn, last_pfn);
1100
1101                 /* If range covers entire pagetable, free it */
1102                 if (!(start_pfn > level_pfn ||
1103                       last_pfn < level_pfn + level_size(level) - 1)) {
1104                         dma_clear_pte(pte);
1105                         domain_flush_cache(domain, pte, sizeof(*pte));
1106                         free_pgtable_page(level_pte);
1107                 }
1108 next:
1109                 pfn += level_size(level);
1110         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1111 }
1112
1113 /* free page table pages. last level pte should already be cleared */
1114 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1115                                    unsigned long start_pfn,
1116                                    unsigned long last_pfn)
1117 {
1118         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1119         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1120         BUG_ON(start_pfn > last_pfn);
1121
1122         dma_pte_clear_range(domain, start_pfn, last_pfn);
1123
1124         /* We don't need a lock here; nobody else touches the iova range */
1125         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1126                            domain->pgd, 0, start_pfn, last_pfn);
1127
1128         /* free pgd */
1129         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1130                 free_pgtable_page(domain->pgd);
1131                 domain->pgd = NULL;
1132         }
1133 }
1134
1135 /* When a page at a given level is being unlinked from its parent, we don't
1136    need to *modify* it at all. All we need to do is make a list of all the
1137    pages which can be freed just as soon as we've flushed the IOTLB and we
1138    know the hardware page-walk will no longer touch them.
1139    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1140    be freed. */
1141 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1142                                             int level, struct dma_pte *pte,
1143                                             struct page *freelist)
1144 {
1145         struct page *pg;
1146
1147         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1148         pg->freelist = freelist;
1149         freelist = pg;
1150
1151         if (level == 1)
1152                 return freelist;
1153
1154         pte = page_address(pg);
1155         do {
1156                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1157                         freelist = dma_pte_list_pagetables(domain, level - 1,
1158                                                            pte, freelist);
1159                 pte++;
1160         } while (!first_pte_in_page(pte));
1161
1162         return freelist;
1163 }
1164
1165 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1166                                         struct dma_pte *pte, unsigned long pfn,
1167                                         unsigned long start_pfn,
1168                                         unsigned long last_pfn,
1169                                         struct page *freelist)
1170 {
1171         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1172
1173         pfn = max(start_pfn, pfn);
1174         pte = &pte[pfn_level_offset(pfn, level)];
1175
1176         do {
1177                 unsigned long level_pfn;
1178
1179                 if (!dma_pte_present(pte))
1180                         goto next;
1181
1182                 level_pfn = pfn & level_mask(level);
1183
1184                 /* If range covers entire pagetable, free it */
1185                 if (start_pfn <= level_pfn &&
1186                     last_pfn >= level_pfn + level_size(level) - 1) {
1187                         /* These subordinate page tables are going away entirely. Don't
1188                            bother to clear them; we're just going to *free* them. */
1189                         if (level > 1 && !dma_pte_superpage(pte))
1190                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1191
1192                         dma_clear_pte(pte);
1193                         if (!first_pte)
1194                                 first_pte = pte;
1195                         last_pte = pte;
1196                 } else if (level > 1) {
1197                         /* Recurse down into a level that isn't *entirely* obsolete */
1198                         freelist = dma_pte_clear_level(domain, level - 1,
1199                                                        phys_to_virt(dma_pte_addr(pte)),
1200                                                        level_pfn, start_pfn, last_pfn,
1201                                                        freelist);
1202                 }
1203 next:
1204                 pfn += level_size(level);
1205         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1206
1207         if (first_pte)
1208                 domain_flush_cache(domain, first_pte,
1209                                    (void *)++last_pte - (void *)first_pte);
1210
1211         return freelist;
1212 }
1213
1214 /* We can't just free the pages because the IOMMU may still be walking
1215    the page tables, and may have cached the intermediate levels. The
1216    pages can only be freed after the IOTLB flush has been done. */
1217 static struct page *domain_unmap(struct dmar_domain *domain,
1218                                  unsigned long start_pfn,
1219                                  unsigned long last_pfn)
1220 {
1221         struct page *freelist = NULL;
1222
1223         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1224         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1225         BUG_ON(start_pfn > last_pfn);
1226
1227         /* we don't need a lock here; nobody else touches the iova range */
1228         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1229                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1230
1231         /* free pgd */
1232         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1233                 struct page *pgd_page = virt_to_page(domain->pgd);
1234                 pgd_page->freelist = freelist;
1235                 freelist = pgd_page;
1236
1237                 domain->pgd = NULL;
1238         }
1239
1240         return freelist;
1241 }
1242
1243 static void dma_free_pagelist(struct page *freelist)
1244 {
1245         struct page *pg;
1246
1247         while ((pg = freelist)) {
1248                 freelist = pg->freelist;
1249                 free_pgtable_page(page_address(pg));
1250         }
1251 }
1252
1253 /* iommu handling */
1254 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1255 {
1256         struct root_entry *root;
1257         unsigned long flags;
1258
1259         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1260         if (!root) {
1261                 pr_err("Allocating root entry for %s failed\n",
1262                         iommu->name);
1263                 return -ENOMEM;
1264         }
1265
1266         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1267
1268         spin_lock_irqsave(&iommu->lock, flags);
1269         iommu->root_entry = root;
1270         spin_unlock_irqrestore(&iommu->lock, flags);
1271
1272         return 0;
1273 }
1274
1275 static void iommu_set_root_entry(struct intel_iommu *iommu)
1276 {
1277         u64 addr;
1278         u32 sts;
1279         unsigned long flag;
1280
1281         addr = virt_to_phys(iommu->root_entry);
1282         if (ecs_enabled(iommu))
1283                 addr |= DMA_RTADDR_RTT;
1284
1285         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1286         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1287
1288         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1289
1290         /* Make sure hardware completes it */
1291         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1292                       readl, (sts & DMA_GSTS_RTPS), sts);
1293
1294         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1295 }
1296
1297 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1298 {
1299         u32 val;
1300         unsigned long flag;
1301
1302         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1303                 return;
1304
1305         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1306         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1307
1308         /* Make sure hardware completes it */
1309         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1310                       readl, (!(val & DMA_GSTS_WBFS)), val);
1311
1312         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1313 }
1314
1315 /* return value determines if we need a write buffer flush */
1316 static void __iommu_flush_context(struct intel_iommu *iommu,
1317                                   u16 did, u16 source_id, u8 function_mask,
1318                                   u64 type)
1319 {
1320         u64 val = 0;
1321         unsigned long flag;
1322
1323         switch (type) {
1324         case DMA_CCMD_GLOBAL_INVL:
1325                 val = DMA_CCMD_GLOBAL_INVL;
1326                 break;
1327         case DMA_CCMD_DOMAIN_INVL:
1328                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1329                 break;
1330         case DMA_CCMD_DEVICE_INVL:
1331                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1332                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1333                 break;
1334         default:
1335                 BUG();
1336         }
1337         val |= DMA_CCMD_ICC;
1338
1339         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1341
1342         /* Make sure hardware completes it */
1343         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1344                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1345
1346         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 }
1348
1349 /* return value determines if we need a write buffer flush */
1350 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1351                                 u64 addr, unsigned int size_order, u64 type)
1352 {
1353         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1354         u64 val = 0, val_iva = 0;
1355         unsigned long flag;
1356
1357         switch (type) {
1358         case DMA_TLB_GLOBAL_FLUSH:
1359                 /* global flush doesn't need to set IVA_REG */
1360                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1361                 break;
1362         case DMA_TLB_DSI_FLUSH:
1363                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1364                 break;
1365         case DMA_TLB_PSI_FLUSH:
1366                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1367                 /* IH bit is passed in as part of address */
1368                 val_iva = size_order | addr;
1369                 break;
1370         default:
1371                 BUG();
1372         }
1373         /* Note: set drain read/write */
1374 #if 0
1375         /*
1376          * This is probably just to be extra safe. It looks like we can
1377          * ignore it without any impact.
1378          */
1379         if (cap_read_drain(iommu->cap))
1380                 val |= DMA_TLB_READ_DRAIN;
1381 #endif
1382         if (cap_write_drain(iommu->cap))
1383                 val |= DMA_TLB_WRITE_DRAIN;
1384
1385         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1386         /* Note: Only uses first TLB reg currently */
1387         if (val_iva)
1388                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1389         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1390
1391         /* Make sure hardware completes it */
1392         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1393                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1394
1395         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1396
1397         /* check IOTLB invalidation granularity */
1398         if (DMA_TLB_IAIG(val) == 0)
1399                 pr_err("Flush IOTLB failed\n");
1400         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1401                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1402                         (unsigned long long)DMA_TLB_IIRG(type),
1403                         (unsigned long long)DMA_TLB_IAIG(val));
1404 }
1405
1406 static struct device_domain_info *
1407 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1408                          u8 bus, u8 devfn)
1409 {
1410         bool found = false;
1411         struct device_domain_info *info;
1412         struct pci_dev *pdev;
1413
1414         assert_spin_locked(&device_domain_lock);
1415
1416         if (!ecap_dev_iotlb_support(iommu->ecap))
1417                 return NULL;
1418
1419         if (!iommu->qi)
1420                 return NULL;
1421
1422         list_for_each_entry(info, &domain->devices, link)
1423                 if (info->iommu == iommu && info->bus == bus &&
1424                     info->devfn == devfn) {
1425                         found = true;
1426                         break;
1427                 }
1428
1429         if (!found || !info->dev || !dev_is_pci(info->dev))
1430                 return NULL;
1431
1432         pdev = to_pci_dev(info->dev);
1433
1434         if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1435                 return NULL;
1436
1437         if (!dmar_find_matched_atsr_unit(pdev))
1438                 return NULL;
1439
1440         return info;
1441 }
1442
1443 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1444 {
1445         struct pci_dev *pdev;
1446
1447         if (!info || !dev_is_pci(info->dev))
1448                 return;
1449
1450         pdev = to_pci_dev(info->dev);
1451         if (pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1452                 return;
1453
1454         info->ats.enabled = 1;
1455         info->ats.qdep = pci_ats_queue_depth(pdev);
1456 }
1457
1458 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1459 {
1460         if (!info->ats.enabled)
1461                 return;
1462
1463         pci_disable_ats(to_pci_dev(info->dev));
1464         info->ats.enabled = 0;
1465 }
1466
1467 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1468                                   u64 addr, unsigned mask)
1469 {
1470         u16 sid, qdep;
1471         unsigned long flags;
1472         struct device_domain_info *info;
1473
1474         spin_lock_irqsave(&device_domain_lock, flags);
1475         list_for_each_entry(info, &domain->devices, link) {
1476                 if (!info->ats.enabled)
1477                         continue;
1478
1479                 sid = info->bus << 8 | info->devfn;
1480                 qdep = info->ats.qdep;
1481                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1482         }
1483         spin_unlock_irqrestore(&device_domain_lock, flags);
1484 }
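/*
 * The source-id passed to qi_flush_dev_iotlb() above is the PCI
 * requester id, bus in the high byte and devfn in the low byte; e.g.
 * bus 0x3a, devfn 0x10 (device 2, function 0) gives sid 0x3a10.
 */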
1485
1486 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1487                                   struct dmar_domain *domain,
1488                                   unsigned long pfn, unsigned int pages,
1489                                   int ih, int map)
1490 {
1491         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1492         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1493         u16 did = domain->iommu_did[iommu->seq_id];
1494
1495         BUG_ON(pages == 0);
1496
1497         if (ih)
1498                 ih = 1 << 6;
1499         /*
1500          * Fall back to domain selective flush if there is no PSI support or the
1501          * size is too big.
1502          * PSI requires the page size to be 2 ^ x, and the base address to be
1503          * naturally aligned to the size.
1504          */
1505         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1506                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1507                                                 DMA_TLB_DSI_FLUSH);
1508         else
1509                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1510                                                 DMA_TLB_PSI_FLUSH);
1511
1512         /*
1513          * In caching mode, changes of pages from non-present to present require
1514          * flush. However, device IOTLB doesn't need to be flushed in this case.
1515          */
1516         if (!cap_caching_mode(iommu->cap) || !map)
1517                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1518                                       addr, mask);
1519 }
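/*
 * Example of the mask arithmetic above: flushing 9 pages rounds up to
 * 16, so mask == ilog2(16) == 4 and the PSI covers a naturally aligned
 * 16-page (64KiB with 4KiB pages) region; the IH hint, when set, ends
 * up in bit 6 of the address written to the invalidation register.
 */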
1520
1521 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1522 {
1523         u32 pmen;
1524         unsigned long flags;
1525
1526         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1527         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1528         pmen &= ~DMA_PMEN_EPM;
1529         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1530
1531         /* wait for the protected region status bit to clear */
1532         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1533                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1534
1535         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1536 }
1537
1538 static void iommu_enable_translation(struct intel_iommu *iommu)
1539 {
1540         u32 sts;
1541         unsigned long flags;
1542
1543         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1544         iommu->gcmd |= DMA_GCMD_TE;
1545         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1546
1547         /* Make sure hardware completes it */
1548         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1549                       readl, (sts & DMA_GSTS_TES), sts);
1550
1551         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1552 }
1553
1554 static void iommu_disable_translation(struct intel_iommu *iommu)
1555 {
1556         u32 sts;
1557         unsigned long flag;
1558
1559         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1560         iommu->gcmd &= ~DMA_GCMD_TE;
1561         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1562
1563         /* Make sure hardware completes it */
1564         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1565                       readl, (!(sts & DMA_GSTS_TES)), sts);
1566
1567         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1568 }
1569
1570
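     /*
      * Set up per-IOMMU domain bookkeeping: a domain-id bitmap sized from
      * cap_ndoms(), plus a two-level iommu->domains array in which each
      * second-level page holds 256 dmar_domain pointers (only the first
      * second-level page is allocated here).  Domain-id 0 is reserved,
      * both as the id that tags invalid translations in caching mode and
      * as the "not allocated" marker.
      */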
1571 static int iommu_init_domains(struct intel_iommu *iommu)
1572 {
1573         u32 ndomains, nlongs;
1574         size_t size;
1575
1576         ndomains = cap_ndoms(iommu->cap);
1577         pr_debug("%s: Number of Domains supported <%d>\n",
1578                  iommu->name, ndomains);
1579         nlongs = BITS_TO_LONGS(ndomains);
1580
1581         spin_lock_init(&iommu->lock);
1582
1583         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1584         if (!iommu->domain_ids) {
1585                 pr_err("%s: Allocating domain id array failed\n",
1586                        iommu->name);
1587                 return -ENOMEM;
1588         }
1589
1590         size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
1591         iommu->domains = kzalloc(size, GFP_KERNEL);
1592
1593         if (iommu->domains) {
1594                 size = 256 * sizeof(struct dmar_domain *);
1595                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1596         }
1597
1598         if (!iommu->domains || !iommu->domains[0]) {
1599                 pr_err("%s: Allocating domain array failed\n",
1600                        iommu->name);
1601                 kfree(iommu->domain_ids);
1602                 kfree(iommu->domains);
1603                 iommu->domain_ids = NULL;
1604                 iommu->domains    = NULL;
1605                 return -ENOMEM;
1606         }
1607
1608
1609
1610         /*
1611          * If Caching mode is set, then invalid translations are tagged
1612          * with domain-id 0, hence we need to pre-allocate it. We also
1613          * use domain-id 0 as a marker for non-allocated domain-id, so
1614          * make sure it is not used for a real domain.
1615          */
1616         set_bit(0, iommu->domain_ids);
1617
1618         return 0;
1619 }
1620
1621 static void disable_dmar_iommu(struct intel_iommu *iommu)
1622 {
1623         struct device_domain_info *info, *tmp;
1624         unsigned long flags;
1625
1626         if (!iommu->domains || !iommu->domain_ids)
1627                 return;
1628
1629         spin_lock_irqsave(&device_domain_lock, flags);
1630         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1631                 struct dmar_domain *domain;
1632
1633                 if (info->iommu != iommu)
1634                         continue;
1635
1636                 if (!info->dev || !info->domain)
1637                         continue;
1638
1639                 domain = info->domain;
1640
1641                 dmar_remove_one_dev_info(domain, info->dev);
1642
1643                 if (!domain_type_is_vm_or_si(domain))
1644                         domain_exit(domain);
1645         }
1646         spin_unlock_irqrestore(&device_domain_lock, flags);
1647
1648         if (iommu->gcmd & DMA_GCMD_TE)
1649                 iommu_disable_translation(iommu);
1650 }
1651
1652 static void free_dmar_iommu(struct intel_iommu *iommu)
1653 {
1654         if ((iommu->domains) && (iommu->domain_ids)) {
1655                 int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
1656                 int i;
1657
1658                 for (i = 0; i < elems; i++)
1659                         kfree(iommu->domains[i]);
1660                 kfree(iommu->domains);
1661                 kfree(iommu->domain_ids);
1662                 iommu->domains = NULL;
1663                 iommu->domain_ids = NULL;
1664         }
1665
1666         g_iommus[iommu->seq_id] = NULL;
1667
1668         /* free context mapping */
1669         free_context_table(iommu);
1670 }
1671
1672 static struct dmar_domain *alloc_domain(int flags)
1673 {
1674         struct dmar_domain *domain;
1675
1676         domain = alloc_domain_mem();
1677         if (!domain)
1678                 return NULL;
1679
1680         memset(domain, 0, sizeof(*domain));
1681         domain->nid = -1;
1682         domain->flags = flags;
1683         INIT_LIST_HEAD(&domain->devices);
1684
1685         return domain;
1686 }
1687
1688 /* Must be called with iommu->lock */
1689 static int domain_attach_iommu(struct dmar_domain *domain,
1690                                struct intel_iommu *iommu)
1691 {
1692         unsigned long ndomains;
1693         int num;
1694
1695         assert_spin_locked(&device_domain_lock);
1696         assert_spin_locked(&iommu->lock);
1697
1698         domain->iommu_refcnt[iommu->seq_id] += 1;
1699         domain->iommu_count += 1;
1700         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1701                 ndomains = cap_ndoms(iommu->cap);
1702                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1703
1704                 if (num >= ndomains) {
1705                         pr_err("%s: No free domain ids\n", iommu->name);
1706                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1707                         domain->iommu_count -= 1;
1708                         return -ENOSPC;
1709                 }
1710
1711                 set_bit(num, iommu->domain_ids);
1712                 set_iommu_domain(iommu, num, domain);
1713
1714                 domain->iommu_did[iommu->seq_id] = num;
1715                 domain->nid                      = iommu->node;
1716
1717                 domain_update_iommu_cap(domain);
1718         }
1719
1720         return 0;
1721 }
1722
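     /*
      * Drop one device reference of @domain on @iommu; when the last
      * device on this IOMMU goes, release the domain-id back to the bitmap
      * and clear the iommu->domains slot.  Returns the remaining overall
      * iommu_count.  Like domain_attach_iommu(), this must be called with
      * device_domain_lock and iommu->lock held.
      */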
1723 static int domain_detach_iommu(struct dmar_domain *domain,
1724                                struct intel_iommu *iommu)
1725 {
1726         int num, count = INT_MAX;
1727
1728         assert_spin_locked(&device_domain_lock);
1729         assert_spin_locked(&iommu->lock);
1730
1731         domain->iommu_refcnt[iommu->seq_id] -= 1;
1732         count = --domain->iommu_count;
1733         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1734                 num = domain->iommu_did[iommu->seq_id];
1735                 clear_bit(num, iommu->domain_ids);
1736                 set_iommu_domain(iommu, num, NULL);
1737
1738                 domain_update_iommu_cap(domain);
1739                 domain->iommu_did[iommu->seq_id] = 0;
1740         }
1741
1742         return count;
1743 }
1744
1745 static struct iova_domain reserved_iova_list;
1746 static struct lock_class_key reserved_rbtree_key;
1747
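     /*
      * Build the global list of IOVA ranges that must never be handed out
      * to devices: the IOAPIC MMIO window and every PCI MMIO resource in
      * the system (the latter so DMA cannot be claimed as peer-to-peer
      * traffic).  Each domain later copies these ranges via
      * domain_reserve_special_ranges().
      */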
1748 static int dmar_init_reserved_ranges(void)
1749 {
1750         struct pci_dev *pdev = NULL;
1751         struct iova *iova;
1752         int i;
1753
1754         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1755                         DMA_32BIT_PFN);
1756
1757         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1758                 &reserved_rbtree_key);
1759
1760         /* IOAPIC ranges shouldn't be accessed by DMA */
1761         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1762                 IOVA_PFN(IOAPIC_RANGE_END));
1763         if (!iova) {
1764                 pr_err("Reserve IOAPIC range failed\n");
1765                 return -ENODEV;
1766         }
1767
1768         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1769         for_each_pci_dev(pdev) {
1770                 struct resource *r;
1771
1772                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1773                         r = &pdev->resource[i];
1774                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1775                                 continue;
1776                         iova = reserve_iova(&reserved_iova_list,
1777                                             IOVA_PFN(r->start),
1778                                             IOVA_PFN(r->end));
1779                         if (!iova) {
1780                                 pr_err("Reserve iova failed\n");
1781                                 return -ENODEV;
1782                         }
1783                 }
1784         }
1785         return 0;
1786 }
1787
1788 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1789 {
1790         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1791 }
1792
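     /*
      * Round a guest address width up to the next width the page-table
      * format can represent: 12 bits of page offset plus a whole number of
      * 9-bit levels, capped at 64.  E.g. gaw = 39 or 48 is returned
      * unchanged, while gaw = 40 becomes 48.
      */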
1793 static inline int guestwidth_to_adjustwidth(int gaw)
1794 {
1795         int agaw;
1796         int r = (gaw - 12) % 9;
1797
1798         if (r == 0)
1799                 agaw = gaw;
1800         else
1801                 agaw = gaw + 9 - r;
1802         if (agaw > 64)
1803                 agaw = 64;
1804         return agaw;
1805 }
1806
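     /*
      * First-time setup of a domain for use behind @iommu: initialise the
      * IOVA allocator and reserved ranges, clamp the requested guest width
      * to what the hardware supports and derive the adjusted guest address
      * width (agaw), record coherency/snooping/superpage capabilities and
      * allocate the top-level page directory.
      */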
1807 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1808                        int guest_width)
1809 {
1810         int adjust_width, agaw;
1811         unsigned long sagaw;
1812
1813         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1814                         DMA_32BIT_PFN);
1815         domain_reserve_special_ranges(domain);
1816
1817         /* calculate AGAW */
1818         if (guest_width > cap_mgaw(iommu->cap))
1819                 guest_width = cap_mgaw(iommu->cap);
1820         domain->gaw = guest_width;
1821         adjust_width = guestwidth_to_adjustwidth(guest_width);
1822         agaw = width_to_agaw(adjust_width);
1823         sagaw = cap_sagaw(iommu->cap);
1824         if (!test_bit(agaw, &sagaw)) {
1825                 /* hardware doesn't support it, choose a bigger one */
1826                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1827                 agaw = find_next_bit(&sagaw, 5, agaw);
1828                 if (agaw >= 5)
1829                         return -ENODEV;
1830         }
1831         domain->agaw = agaw;
1832
1833         if (ecap_coherent(iommu->ecap))
1834                 domain->iommu_coherency = 1;
1835         else
1836                 domain->iommu_coherency = 0;
1837
1838         if (ecap_sc_support(iommu->ecap))
1839                 domain->iommu_snooping = 1;
1840         else
1841                 domain->iommu_snooping = 0;
1842
1843         if (intel_iommu_superpage)
1844                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1845         else
1846                 domain->iommu_superpage = 0;
1847
1848         domain->nid = iommu->node;
1849
1850         /* always allocate the top pgd */
1851         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1852         if (!domain->pgd)
1853                 return -ENOMEM;
1854         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1855         return 0;
1856 }
1857
1858 static void domain_exit(struct dmar_domain *domain)
1859 {
1860         struct page *freelist = NULL;
1861
1862         /* Domain 0 is reserved, so don't process it */
1863         if (!domain)
1864                 return;
1865
1866         /* Flush any lazy unmaps that may reference this domain */
1867         if (!intel_iommu_strict)
1868                 flush_unmaps_timeout(0);
1869
1870         /* Remove associated devices and clear attached or cached domains */
1871         rcu_read_lock();
1872         domain_remove_dev_info(domain);
1873         rcu_read_unlock();
1874
1875         /* destroy iovas */
1876         put_iova_domain(&domain->iovad);
1877
1878         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1879
1880         dma_free_pagelist(freelist);
1881
1882         free_domain_mem(domain);
1883 }
1884
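     /*
      * Install the context entry for (@bus, @devfn) on @iommu so that it
      * points at @domain's page tables (or is marked pass-through for the
      * hardware-passthrough identity domain), then do the context-cache/
      * IOTLB or write-buffer flushing that caching mode requires.
      * Already-present entries are left untouched.
      */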
1885 static int domain_context_mapping_one(struct dmar_domain *domain,
1886                                       struct intel_iommu *iommu,
1887                                       u8 bus, u8 devfn)
1888 {
1889         u16 did = domain->iommu_did[iommu->seq_id];
1890         int translation = CONTEXT_TT_MULTI_LEVEL;
1891         struct device_domain_info *info = NULL;
1892         struct context_entry *context;
1893         unsigned long flags;
1894         struct dma_pte *pgd;
1895         int ret, agaw;
1896
1897         WARN_ON(did == 0);
1898
1899         if (hw_pass_through && domain_type_is_si(domain))
1900                 translation = CONTEXT_TT_PASS_THROUGH;
1901
1902         pr_debug("Set context mapping for %02x:%02x.%d\n",
1903                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1904
1905         BUG_ON(!domain->pgd);
1906
1907         spin_lock_irqsave(&device_domain_lock, flags);
1908         spin_lock(&iommu->lock);
1909
1910         ret = -ENOMEM;
1911         context = iommu_context_addr(iommu, bus, devfn, 1);
1912         if (!context)
1913                 goto out_unlock;
1914
1915         ret = 0;
1916         if (context_present(context))
1917                 goto out_unlock;
1918
1919         pgd = domain->pgd;
1920
1921         context_clear_entry(context);
1922         context_set_domain_id(context, did);
1923
1924         /*
1925          * Skip the top levels of the page table for IOMMUs whose agaw is
1926          * smaller than the domain's default.  Unnecessary for PT mode.
1927          */
1928         if (translation != CONTEXT_TT_PASS_THROUGH) {
1929                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1930                         ret = -ENOMEM;
1931                         pgd = phys_to_virt(dma_pte_addr(pgd));
1932                         if (!dma_pte_present(pgd))
1933                                 goto out_unlock;
1934                 }
1935
1936                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1937                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1938                                      CONTEXT_TT_MULTI_LEVEL;
1939
1940                 context_set_address_root(context, virt_to_phys(pgd));
1941                 context_set_address_width(context, iommu->agaw);
1942         } else {
1943                 /*
1944                  * In pass through mode, AW must be programmed to
1945                  * indicate the largest AGAW value supported by
1946                  * hardware. And ASR is ignored by hardware.
1947                  */
1948                 context_set_address_width(context, iommu->msagaw);
1949         }
1950
1951         context_set_translation_type(context, translation);
1952         context_set_fault_enable(context);
1953         context_set_present(context);
1954         domain_flush_cache(domain, context, sizeof(*context));
1955
1956         /*
1957          * It's a non-present to present mapping. If hardware doesn't cache
1958          * non-present entries we only need to flush the write-buffer. If it
1959          * _does_ cache non-present entries, then it does so in the special
1960          * domain #0, which we have to flush:
1961          */
1962         if (cap_caching_mode(iommu->cap)) {
1963                 iommu->flush.flush_context(iommu, 0,
1964                                            (((u16)bus) << 8) | devfn,
1965                                            DMA_CCMD_MASK_NOBIT,
1966                                            DMA_CCMD_DEVICE_INVL);
1967                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1968         } else {
1969                 iommu_flush_write_buffer(iommu);
1970         }
1971         iommu_enable_dev_iotlb(info);
1972
1973         ret = 0;
1974
1975 out_unlock:
1976         spin_unlock(&iommu->lock);
1977         spin_unlock_irqrestore(&device_domain_lock, flags);
1978
1979         return ret;
1980 }
1981
1982 struct domain_context_mapping_data {
1983         struct dmar_domain *domain;
1984         struct intel_iommu *iommu;
1985 };
1986
1987 static int domain_context_mapping_cb(struct pci_dev *pdev,
1988                                      u16 alias, void *opaque)
1989 {
1990         struct domain_context_mapping_data *data = opaque;
1991
1992         return domain_context_mapping_one(data->domain, data->iommu,
1993                                           PCI_BUS_NUM(alias), alias & 0xff);
1994 }
1995
1996 static int
1997 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1998 {
1999         struct intel_iommu *iommu;
2000         u8 bus, devfn;
2001         struct domain_context_mapping_data data;
2002
2003         iommu = device_to_iommu(dev, &bus, &devfn);
2004         if (!iommu)
2005                 return -ENODEV;
2006
2007         if (!dev_is_pci(dev))
2008                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2009
2010         data.domain = domain;
2011         data.iommu = iommu;
2012
2013         return pci_for_each_dma_alias(to_pci_dev(dev),
2014                                       &domain_context_mapping_cb, &data);
2015 }
2016
2017 static int domain_context_mapped_cb(struct pci_dev *pdev,
2018                                     u16 alias, void *opaque)
2019 {
2020         struct intel_iommu *iommu = opaque;
2021
2022         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2023 }
2024
2025 static int domain_context_mapped(struct device *dev)
2026 {
2027         struct intel_iommu *iommu;
2028         u8 bus, devfn;
2029
2030         iommu = device_to_iommu(dev, &bus, &devfn);
2031         if (!iommu)
2032                 return -ENODEV;
2033
2034         if (!dev_is_pci(dev))
2035                 return device_context_mapped(iommu, bus, devfn);
2036
2037         return !pci_for_each_dma_alias(to_pci_dev(dev),
2038                                        domain_context_mapped_cb, iommu);
2039 }
2040
2041 /* Returns a number of VTD pages, but aligned to MM page size */
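     /* E.g. with 4KiB pages, host_addr = 0x1ffc and size = 8 crosses a page boundary, so this returns 2. */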
2042 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2043                                             size_t size)
2044 {
2045         host_addr &= ~PAGE_MASK;
2046         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2047 }
2048
2049 /* Return largest possible superpage level for a given mapping */
2050 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2051                                           unsigned long iov_pfn,
2052                                           unsigned long phy_pfn,
2053                                           unsigned long pages)
2054 {
2055         int support, level = 1;
2056         unsigned long pfnmerge;
2057
2058         support = domain->iommu_superpage;
2059
2060         /* To use a large page, the virtual *and* physical addresses
2061            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2062            of them will mean we have to use smaller pages. So just
2063            merge them and check both at once. */
2064         pfnmerge = iov_pfn | phy_pfn;
2065
2066         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2067                 pages >>= VTD_STRIDE_SHIFT;
2068                 if (!pages)
2069                         break;
2070                 pfnmerge >>= VTD_STRIDE_SHIFT;
2071                 level++;
2072                 support--;
2073         }
2074         return level;
2075 }
2076
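     /*
      * Core mapping routine: populate PTEs for @nr_pages starting at
      * @iov_pfn, taking the physical pages either from @sg (scatterlist
      * mode) or from the contiguous range starting at @phys_pfn.  Large
      * pages are used whenever IOVA/physical alignment and the remaining
      * length allow, and the CPU cache is flushed each time a page-table
      * page worth of PTEs has been written.
      */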
2077 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2078                             struct scatterlist *sg, unsigned long phys_pfn,
2079                             unsigned long nr_pages, int prot)
2080 {
2081         struct dma_pte *first_pte = NULL, *pte = NULL;
2082         phys_addr_t uninitialized_var(pteval);
2083         unsigned long sg_res = 0;
2084         unsigned int largepage_lvl = 0;
2085         unsigned long lvl_pages = 0;
2086
2087         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2088
2089         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2090                 return -EINVAL;
2091
2092         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2093
2094         if (!sg) {
2095                 sg_res = nr_pages;
2096                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2097         }
2098
2099         while (nr_pages > 0) {
2100                 uint64_t tmp;
2101
2102                 if (!sg_res) {
2103                         sg_res = aligned_nrpages(sg->offset, sg->length);
2104                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2105                         sg->dma_length = sg->length;
2106                         pteval = (sg_phys(sg) & PAGE_MASK) | prot;
2107                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2108                 }
2109
2110                 if (!pte) {
2111                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2112
2113                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2114                         if (!pte)
2115                                 return -ENOMEM;
2116                         /* It is a large page */
2117                         if (largepage_lvl > 1) {
2118                                 pteval |= DMA_PTE_LARGE_PAGE;
2119                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2120                                 /*
2121                                  * Ensure that old small page tables are
2122                                  * removed to make room for superpage,
2123                                  * if they exist.
2124                                  */
2125                                 dma_pte_free_pagetable(domain, iov_pfn,
2126                                                        iov_pfn + lvl_pages - 1);
2127                         } else {
2128                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2129                         }
2130
2131                 }
2132                 /* We don't need a lock here; nobody else
2133                  * touches this iova range
2134                  */
2135                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2136                 if (tmp) {
2137                         static int dumps = 5;
2138                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2139                                 iov_pfn, tmp, (unsigned long long)pteval);
2140                         if (dumps) {
2141                                 dumps--;
2142                                 debug_dma_dump_mappings(NULL);
2143                         }
2144                         WARN_ON(1);
2145                 }
2146
2147                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2148
2149                 BUG_ON(nr_pages < lvl_pages);
2150                 BUG_ON(sg_res < lvl_pages);
2151
2152                 nr_pages -= lvl_pages;
2153                 iov_pfn += lvl_pages;
2154                 phys_pfn += lvl_pages;
2155                 pteval += lvl_pages * VTD_PAGE_SIZE;
2156                 sg_res -= lvl_pages;
2157
2158                 /* If the next PTE would be the first in a new page, then we
2159                    need to flush the cache on the entries we've just written.
2160                    And then we'll need to recalculate 'pte', so clear it and
2161                    let it get set again in the if (!pte) block above.
2162
2163                    If we're done (!nr_pages) we need to flush the cache too.
2164
2165                    Also if we've been setting superpages, we may need to
2166                    recalculate 'pte' and switch back to smaller pages for the
2167                    end of the mapping, if the trailing size is not enough to
2168                    use another superpage (i.e. sg_res < lvl_pages). */
2169                 pte++;
2170                 if (!nr_pages || first_pte_in_page(pte) ||
2171                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2172                         domain_flush_cache(domain, first_pte,
2173                                            (void *)pte - (void *)first_pte);
2174                         pte = NULL;
2175                 }
2176
2177                 if (!sg_res && nr_pages)
2178                         sg = sg_next(sg);
2179         }
2180         return 0;
2181 }
2182
2183 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2184                                     struct scatterlist *sg, unsigned long nr_pages,
2185                                     int prot)
2186 {
2187         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2188 }
2189
2190 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2191                                      unsigned long phys_pfn, unsigned long nr_pages,
2192                                      int prot)
2193 {
2194         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2195 }
2196
2197 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2198 {
2199         if (!iommu)
2200                 return;
2201
2202         clear_context_table(iommu, bus, devfn);
2203         iommu->flush.flush_context(iommu, 0, 0, 0,
2204                                            DMA_CCMD_GLOBAL_INVL);
2205         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2206 }
2207
2208 static inline void unlink_domain_info(struct device_domain_info *info)
2209 {
2210         assert_spin_locked(&device_domain_lock);
2211         list_del(&info->link);
2212         list_del(&info->global);
2213         if (info->dev)
2214                 info->dev->archdata.iommu = NULL;
2215 }
2216
2217 static void domain_remove_dev_info(struct dmar_domain *domain)
2218 {
2219         struct device_domain_info *info, *tmp;
2220         unsigned long flags;
2221
2222         spin_lock_irqsave(&device_domain_lock, flags);
2223         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2224                 __dmar_remove_one_dev_info(info);
2225         spin_unlock_irqrestore(&device_domain_lock, flags);
2226 }
2227
2228 /*
2229  * find_domain
2230  * Note: we use struct device->archdata.iommu to store the info
2231  */
2232 static struct dmar_domain *find_domain(struct device *dev)
2233 {
2234         struct device_domain_info *info;
2235
2236         /* No lock here, assumes no domain exit in normal case */
2237         info = dev->archdata.iommu;
2238         if (info)
2239                 return info->domain;
2240         return NULL;
2241 }
2242
2243 static inline struct device_domain_info *
2244 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2245 {
2246         struct device_domain_info *info;
2247
2248         list_for_each_entry(info, &device_domain_list, global)
2249                 if (info->iommu->segment == segment && info->bus == bus &&
2250                     info->devfn == devfn)
2251                         return info;
2252
2253         return NULL;
2254 }
2255
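     /*
      * Bind a device (identified by @bus/@devfn on @iommu) to @domain:
      * allocate and link a device_domain_info, attach the domain to the
      * IOMMU (allocating a domain-id on first use) and set up the context
      * mapping.  If the device or its DMA alias already has a domain, that
      * existing domain is returned and the caller must free the one it
      * passed in.
      */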
2256 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2257                                                     int bus, int devfn,
2258                                                     struct device *dev,
2259                                                     struct dmar_domain *domain)
2260 {
2261         struct dmar_domain *found = NULL;
2262         struct device_domain_info *info;
2263         unsigned long flags;
2264         int ret;
2265
2266         info = alloc_devinfo_mem();
2267         if (!info)
2268                 return NULL;
2269
2270         info->bus = bus;
2271         info->devfn = devfn;
2272         info->ats.enabled = 0;
2273         info->ats.qdep = 0;
2274         info->dev = dev;
2275         info->domain = domain;
2276         info->iommu = iommu;
2277
2278         spin_lock_irqsave(&device_domain_lock, flags);
2279         if (dev)
2280                 found = find_domain(dev);
2281
2282         if (!found) {
2283                 struct device_domain_info *info2;
2284                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2285                 if (info2) {
2286                         found      = info2->domain;
2287                         info2->dev = dev;
2288                 }
2289         }
2290
2291         if (found) {
2292                 spin_unlock_irqrestore(&device_domain_lock, flags);
2293                 free_devinfo_mem(info);
2294                 /* Caller must free the original domain */
2295                 return found;
2296         }
2297
2298         spin_lock(&iommu->lock);
2299         ret = domain_attach_iommu(domain, iommu);
2300         spin_unlock(&iommu->lock);
2301
2302         if (ret) {
2303                 spin_unlock_irqrestore(&device_domain_lock, flags);
2304                 free_devinfo_mem(info);
2305                 return NULL;
2306         }
2307
2308         list_add(&info->link, &domain->devices);
2309         list_add(&info->global, &device_domain_list);
2310         if (dev)
2311                 dev->archdata.iommu = info;
2312         spin_unlock_irqrestore(&device_domain_lock, flags);
2313
2314         if (dev && domain_context_mapping(domain, dev)) {
2315                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2316                 dmar_remove_one_dev_info(domain, dev);
2317                 return NULL;
2318         }
2319
2320         return domain;
2321 }
2322
2323 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2324 {
2325         *(u16 *)opaque = alias;
2326         return 0;
2327 }
2328
2329 /* domain is initialized */
2330 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2331 {
2332         struct device_domain_info *info = NULL;
2333         struct dmar_domain *domain, *tmp;
2334         struct intel_iommu *iommu;
2335         u16 req_id, dma_alias;
2336         unsigned long flags;
2337         u8 bus, devfn;
2338
2339         domain = find_domain(dev);
2340         if (domain)
2341                 return domain;
2342
2343         iommu = device_to_iommu(dev, &bus, &devfn);
2344         if (!iommu)
2345                 return NULL;
2346
2347         req_id = ((u16)bus << 8) | devfn;
2348
2349         if (dev_is_pci(dev)) {
2350                 struct pci_dev *pdev = to_pci_dev(dev);
2351
2352                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2353
2354                 spin_lock_irqsave(&device_domain_lock, flags);
2355                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2356                                                       PCI_BUS_NUM(dma_alias),
2357                                                       dma_alias & 0xff);
2358                 if (info) {
2359                         iommu = info->iommu;
2360                         domain = info->domain;
2361                 }
2362                 spin_unlock_irqrestore(&device_domain_lock, flags);
2363
2364                 /* DMA alias already has a domain, use it */
2365                 if (info)
2366                         goto found_domain;
2367         }
2368
2369         /* Allocate and initialize new domain for the device */
2370         domain = alloc_domain(0);
2371         if (!domain)
2372                 return NULL;
2373         if (domain_init(domain, iommu, gaw)) {
2374                 domain_exit(domain);
2375                 return NULL;
2376         }
2377
2378         /* register PCI DMA alias device */
2379         if (req_id != dma_alias && dev_is_pci(dev)) {
2380                 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2381                                                dma_alias & 0xff, NULL, domain);
2382
2383                 if (!tmp || tmp != domain) {
2384                         domain_exit(domain);
2385                         domain = tmp;
2386                 }
2387
2388                 if (!domain)
2389                         return NULL;
2390         }
2391
2392 found_domain:
2393         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2394
2395         if (!tmp || tmp != domain) {
2396                 domain_exit(domain);
2397                 domain = tmp;
2398         }
2399
2400         return domain;
2401 }
2402
2403 static int iommu_identity_mapping;
2404 #define IDENTMAP_ALL            1
2405 #define IDENTMAP_GFX            2
2406 #define IDENTMAP_AZALIA         4
2407
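     /*
      * Reserve [start, end] in the domain's IOVA allocator and install a
      * 1:1 (identity) mapping for it, clearing any PTEs already covering
      * the range first since RMRRs may overlap ordinary RAM.
      */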
2408 static int iommu_domain_identity_map(struct dmar_domain *domain,
2409                                      unsigned long long start,
2410                                      unsigned long long end)
2411 {
2412         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2413         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2414
2415         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2416                           dma_to_mm_pfn(last_vpfn))) {
2417                 pr_err("Reserving iova failed\n");
2418                 return -ENOMEM;
2419         }
2420
2421         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2422         /*
2423          * RMRR range might have overlap with physical memory range,
2424          * clear it first
2425          */
2426         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2427
2428         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2429                                   last_vpfn - first_vpfn + 1,
2430                                   DMA_PTE_READ|DMA_PTE_WRITE);
2431 }
2432
2433 static int iommu_prepare_identity_map(struct device *dev,
2434                                       unsigned long long start,
2435                                       unsigned long long end)
2436 {
2437         struct dmar_domain *domain;
2438         int ret;
2439
2440         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2441         if (!domain)
2442                 return -ENOMEM;
2443
2444         /* For _hardware_ passthrough, don't bother. But for software
2445            passthrough, we do it anyway -- it may indicate a memory
2446            range which is reserved in E820 and so didn't get set
2447            up to start with in si_domain */
2448         if (domain == si_domain && hw_pass_through) {
2449                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2450                         dev_name(dev), start, end);
2451                 return 0;
2452         }
2453
2454         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2455                 dev_name(dev), start, end);
2456
2457         if (end < start) {
2458                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2459                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2460                         dmi_get_system_info(DMI_BIOS_VENDOR),
2461                         dmi_get_system_info(DMI_BIOS_VERSION),
2462                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2463                 ret = -EIO;
2464                 goto error;
2465         }
2466
2467         if (end >> agaw_to_width(domain->agaw)) {
2468                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2469                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2470                      agaw_to_width(domain->agaw),
2471                      dmi_get_system_info(DMI_BIOS_VENDOR),
2472                      dmi_get_system_info(DMI_BIOS_VERSION),
2473                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2474                 ret = -EIO;
2475                 goto error;
2476         }
2477
2478         ret = iommu_domain_identity_map(domain, start, end);
2479         if (ret)
2480                 goto error;
2481
2482         return 0;
2483
2484  error:
2485         domain_exit(domain);
2486         return ret;
2487 }
2488
2489 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2490                                          struct device *dev)
2491 {
2492         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2493                 return 0;
2494         return iommu_prepare_identity_map(dev, rmrr->base_address,
2495                                           rmrr->end_address);
2496 }
2497
2498 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2499 static inline void iommu_prepare_isa(void)
2500 {
2501         struct pci_dev *pdev;
2502         int ret;
2503
2504         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2505         if (!pdev)
2506                 return;
2507
2508         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2509         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2510
2511         if (ret)
2512                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2513
2514         pci_dev_put(pdev);
2515 }
2516 #else
2517 static inline void iommu_prepare_isa(void)
2518 {
2519         return;
2520 }
2521 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2522
2523 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2524
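     /*
      * Build the static identity (si) domain used for passthrough devices.
      * With hardware passthrough (hw != 0) the page tables are never
      * consulted, so no mappings are installed; otherwise every usable
      * memory range of every online node is identity-mapped.
      */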
2525 static int __init si_domain_init(int hw)
2526 {
2527         int nid, ret = 0;
2528
2529         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2530         if (!si_domain)
2531                 return -EFAULT;
2532
2533         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2534                 domain_exit(si_domain);
2535                 return -EFAULT;
2536         }
2537
2538         pr_debug("Identity mapping domain allocated\n");
2539
2540         if (hw)
2541                 return 0;
2542
2543         for_each_online_node(nid) {
2544                 unsigned long start_pfn, end_pfn;
2545                 int i;
2546
2547                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2548                         ret = iommu_domain_identity_map(si_domain,
2549                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2550                         if (ret)
2551                                 return ret;
2552                 }
2553         }
2554
2555         return 0;
2556 }
2557
2558 static int identity_mapping(struct device *dev)
2559 {
2560         struct device_domain_info *info;
2561
2562         if (likely(!iommu_identity_mapping))
2563                 return 0;
2564
2565         info = dev->archdata.iommu;
2566         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2567                 return (info->domain == si_domain);
2568
2569         return 0;
2570 }
2571
2572 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2573 {
2574         struct dmar_domain *ndomain;
2575         struct intel_iommu *iommu;
2576         u8 bus, devfn;
2577
2578         iommu = device_to_iommu(dev, &bus, &devfn);
2579         if (!iommu)
2580                 return -ENODEV;
2581
2582         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2583         if (ndomain != domain)
2584                 return -EBUSY;
2585
2586         return 0;
2587 }
2588
2589 static bool device_has_rmrr(struct device *dev)
2590 {
2591         struct dmar_rmrr_unit *rmrr;
2592         struct device *tmp;
2593         int i;
2594
2595         rcu_read_lock();
2596         for_each_rmrr_units(rmrr) {
2597                 /*
2598                  * Return TRUE if this RMRR contains the device that
2599                  * is passed in.
2600                  */
2601                 for_each_active_dev_scope(rmrr->devices,
2602                                           rmrr->devices_cnt, i, tmp)
2603                         if (tmp == dev) {
2604                                 rcu_read_unlock();
2605                                 return true;
2606                         }
2607         }
2608         rcu_read_unlock();
2609         return false;
2610 }
2611
2612 /*
2613  * There are a couple cases where we need to restrict the functionality of
2614  * devices associated with RMRRs.  The first is when evaluating a device for
2615  * identity mapping because problems exist when devices are moved in and out
2616  * of domains and their respective RMRR information is lost.  This means that
2617  * a device with associated RMRRs will never be in a "passthrough" domain.
2618  * The second is use of the device through the IOMMU API.  This interface
2619  * expects to have full control of the IOVA space for the device.  We cannot
2620  * satisfy both the requirement that RMRR access is maintained and have an
2621  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2622  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2623  * We therefore prevent devices associated with an RMRR from participating in
2624  * the IOMMU API, which eliminates them from device assignment.
2625  *
2626  * In both cases we assume that PCI USB devices with RMRRs have them largely
2627  * for historical reasons and that the RMRR space is not actively used post
2628  * boot.  This exclusion may change if vendors begin to abuse it.
2629  *
2630  * The same exception is made for graphics devices, with the requirement that
2631  * any use of the RMRR regions will be torn down before assigning the device
2632  * to a guest.
2633  */
2634 static bool device_is_rmrr_locked(struct device *dev)
2635 {
2636         if (!device_has_rmrr(dev))
2637                 return false;
2638
2639         if (dev_is_pci(dev)) {
2640                 struct pci_dev *pdev = to_pci_dev(dev);
2641
2642                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2643                         return false;
2644         }
2645
2646         return true;
2647 }
2648
2649 static int iommu_should_identity_map(struct device *dev, int startup)
2650 {
2651
2652         if (dev_is_pci(dev)) {
2653                 struct pci_dev *pdev = to_pci_dev(dev);
2654
2655                 if (device_is_rmrr_locked(dev))
2656                         return 0;
2657
2658                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2659                         return 1;
2660
2661                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2662                         return 1;
2663
2664                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2665                         return 0;
2666
2667                 /*
2668                  * We want to start off with all devices in the 1:1 domain, and
2669                  * take them out later if we find they can't access all of memory.
2670                  *
2671                  * However, we can't do this for PCI devices behind bridges,
2672                  * because all PCI devices behind the same bridge will end up
2673                  * with the same source-id on their transactions.
2674                  *
2675                  * Practically speaking, we can't change things around for these
2676                  * devices at run-time, because we can't be sure there'll be no
2677                  * DMA transactions in flight for any of their siblings.
2678                  *
2679                  * So PCI devices (unless they're on the root bus) as well as
2680                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2681                  * the 1:1 domain, just in _case_ one of their siblings turns out
2682                  * not to be able to map all of memory.
2683                  */
2684                 if (!pci_is_pcie(pdev)) {
2685                         if (!pci_is_root_bus(pdev->bus))
2686                                 return 0;
2687                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2688                                 return 0;
2689                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2690                         return 0;
2691         } else {
2692                 if (device_has_rmrr(dev))
2693                         return 0;
2694         }
2695
2696         /*
2697          * At boot time, we don't yet know if devices will be 64-bit capable.
2698          * Assume that they will -- if they turn out not to be, then we can
2699          * take them out of the 1:1 domain later.
2700          */
2701         if (!startup) {
2702                 /*
2703                  * If the device's dma_mask is less than the system's memory
2704                  * size then this is not a candidate for identity mapping.
2705                  */
2706                 u64 dma_mask = *dev->dma_mask;
2707
2708                 if (dev->coherent_dma_mask &&
2709                     dev->coherent_dma_mask < dma_mask)
2710                         dma_mask = dev->coherent_dma_mask;
2711
2712                 return dma_mask >= dma_get_required_mask(dev);
2713         }
2714
2715         return 1;
2716 }
2717
2718 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2719 {
2720         int ret;
2721
2722         if (!iommu_should_identity_map(dev, 1))
2723                 return 0;
2724
2725         ret = domain_add_dev_info(si_domain, dev);
2726         if (!ret)
2727                 pr_info("%s identity mapping for device %s\n",
2728                         hw ? "Hardware" : "Software", dev_name(dev));
2729         else if (ret == -ENODEV)
2730                 /* device not associated with an iommu */
2731                 ret = 0;
2732
2733         return ret;
2734 }
2735
2736
2737 static int __init iommu_prepare_static_identity_mapping(int hw)
2738 {
2739         struct pci_dev *pdev = NULL;
2740         struct dmar_drhd_unit *drhd;
2741         struct intel_iommu *iommu;
2742         struct device *dev;
2743         int i;
2744         int ret = 0;
2745
2746         for_each_pci_dev(pdev) {
2747                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2748                 if (ret)
2749                         return ret;
2750         }
2751
2752         for_each_active_iommu(iommu, drhd)
2753                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2754                         struct acpi_device_physical_node *pn;
2755                         struct acpi_device *adev;
2756
2757                         if (dev->bus != &acpi_bus_type)
2758                                 continue;
2759
2760                         adev = to_acpi_device(dev);
2761                         mutex_lock(&adev->physical_node_lock);
2762                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2763                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2764                                 if (ret)
2765                                         break;
2766                         }
2767                         mutex_unlock(&adev->physical_node_lock);
2768                         if (ret)
2769                                 return ret;
2770                 }
2771
2772         return 0;
2773 }
2774
2775 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2776 {
2777         /*
2778          * Start from a sane IOMMU hardware state.
2779          * If queued invalidation was already initialized by us
2780          * (for example, while enabling interrupt remapping) then
2781          * things are already rolling from a sane state.
2782          */
2783         if (!iommu->qi) {
2784                 /*
2785                  * Clear any previous faults.
2786                  */
2787                 dmar_fault(-1, iommu);
2788                 /*
2789                  * Disable queued invalidation if supported and already enabled
2790                  * before OS handover.
2791                  */
2792                 dmar_disable_qi(iommu);
2793         }
2794
2795         if (dmar_enable_qi(iommu)) {
2796                 /*
2797                  * Queued invalidation is not enabled; use register-based invalidation
2798                  */
2799                 iommu->flush.flush_context = __iommu_flush_context;
2800                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2801                 pr_info("%s: Using Register based invalidation\n",
2802                         iommu->name);
2803         } else {
2804                 iommu->flush.flush_context = qi_flush_context;
2805                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2806                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2807         }
2808 }
2809
2810 static int copy_context_table(struct intel_iommu *iommu,
2811                               struct root_entry __iomem *old_re,
2812                               struct context_entry **tbl,
2813                               int bus, bool ext)
2814 {
2815         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2816         struct context_entry __iomem *old_ce = NULL;
2817         struct context_entry *new_ce = NULL, ce;
2818         struct root_entry re;
2819         phys_addr_t old_ce_phys;
2820
2821         tbl_idx = ext ? bus * 2 : bus;
2822         memcpy_fromio(&re, old_re, sizeof(re));
2823
2824         for (devfn = 0; devfn < 256; devfn++) {
2825                 /* First calculate the correct index */
2826                 idx = (ext ? devfn * 2 : devfn) % 256;
2827
2828                 if (idx == 0) {
2829                         /* First save what we may have and clean up */
2830                         if (new_ce) {
2831                                 tbl[tbl_idx] = new_ce;
2832                                 __iommu_flush_cache(iommu, new_ce,
2833                                                     VTD_PAGE_SIZE);
2834                                 pos = 1;
2835                         }
2836
2837                         if (old_ce)
2838                                 iounmap(old_ce);
2839
2840                         ret = 0;
2841                         if (devfn < 0x80)
2842                                 old_ce_phys = root_entry_lctp(&re);
2843                         else
2844                                 old_ce_phys = root_entry_uctp(&re);
2845
2846                         if (!old_ce_phys) {
2847                                 if (ext && devfn == 0) {
2848                                         /* No LCTP, try UCTP */
2849                                         devfn = 0x7f;
2850                                         continue;
2851                                 } else {
2852                                         goto out;
2853                                 }
2854                         }
2855
2856                         ret = -ENOMEM;
2857                         old_ce = ioremap_cache(old_ce_phys, PAGE_SIZE);
2858                         if (!old_ce)
2859                                 goto out;
2860
2861                         new_ce = alloc_pgtable_page(iommu->node);
2862                         if (!new_ce)
2863                                 goto out_unmap;
2864
2865                         ret = 0;
2866                 }
2867
2868                 /* Now copy the context entry */
2869                 memcpy_fromio(&ce, old_ce + idx, sizeof(ce));
2870
2871                 if (!__context_present(&ce))
2872                         continue;
2873
2874                 did = context_domain_id(&ce);
2875                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2876                         set_bit(did, iommu->domain_ids);
2877
2878                 /*
2879                  * We need a marker for copied context entries. This
2880                  * marker needs to work for the old format as well as
2881                  * for extended context entries.
2882                  *
2883                  * Bit 67 of the context entry is used. In the old
2884                  * format this bit is available to software, in the
2885                  * extended format it is the PGE bit, but PGE is ignored
2886                  * by HW if PASIDs are disabled (and thus still
2887                  * available).
2888                  *
2889                  * So disable PASIDs first and then mark the entry
2890                  * copied. This means that we don't copy PASID
2891                  * translations from the old kernel, but this is fine as
2892                  * faults there are not fatal.
2893                  */
2894                 context_clear_pasid_enable(&ce);
2895                 context_set_copied(&ce);
2896
2897                 new_ce[idx] = ce;
2898         }
2899
2900         tbl[tbl_idx + pos] = new_ce;
2901
2902         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2903
2904 out_unmap:
2905         iounmap(old_ce);
2906
2907 out:
2908         return ret;
2909 }
2910
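     /*
      * Kdump support: take over the root/context tables left active by the
      * previous kernel.  Each old context table is read through
      * ioremap_cache(), copied into freshly allocated pages (with the
      * entries marked as copied), and the copies are then hooked into this
      * kernel's root_entry table.  Bail out if the old and new kernels
      * disagree on the extended (ECS) root-table layout, since changing
      * the RTT bit would require disabling translation.
      */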
2911 static int copy_translation_tables(struct intel_iommu *iommu)
2912 {
2913         struct root_entry __iomem *old_rt;
2914         struct context_entry **ctxt_tbls;
2915         phys_addr_t old_rt_phys;
2916         int ctxt_table_entries;
2917         unsigned long flags;
2918         u64 rtaddr_reg;
2919         int bus, ret;
2920         bool new_ext, ext;
2921
2922         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2923         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2924         new_ext    = !!ecap_ecs(iommu->ecap);
2925
2926         /*
2927          * The RTT bit can only be changed when translation is disabled,
2928          * but disabling translation would open a window for data
2929          * corruption. So bail out and don't copy anything if we would
2930          * have to change the bit.
2931          */
2932         if (new_ext != ext)
2933                 return -EINVAL;
2934
2935         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2936         if (!old_rt_phys)
2937                 return -EINVAL;
2938
2939         old_rt = ioremap_cache(old_rt_phys, PAGE_SIZE);
2940         if (!old_rt)
2941                 return -ENOMEM;
2942
2943         /* This is too big for the stack - allocate it from slab */
2944         ctxt_table_entries = ext ? 512 : 256;
2945         ret = -ENOMEM;
2946         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
2947         if (!ctxt_tbls)
2948                 goto out_unmap;
2949
2950         for (bus = 0; bus < 256; bus++) {
2951                 ret = copy_context_table(iommu, &old_rt[bus],
2952                                          ctxt_tbls, bus, ext);
2953                 if (ret) {
2954                         pr_err("%s: Failed to copy context table for bus %d\n",
2955                                 iommu->name, bus);
2956                         continue;
2957                 }
2958         }
2959
2960         spin_lock_irqsave(&iommu->lock, flags);
2961
2962         /* Context tables are copied, now write them to the root_entry table */
2963         for (bus = 0; bus < 256; bus++) {
2964                 int idx = ext ? bus * 2 : bus;
2965                 u64 val;
2966
2967                 if (ctxt_tbls[idx]) {
2968                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2969                         iommu->root_entry[bus].lo = val;
2970                 }
2971
2972                 if (!ext || !ctxt_tbls[idx + 1])
2973                         continue;
2974
2975                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2976                 iommu->root_entry[bus].hi = val;
2977         }
2978
2979         spin_unlock_irqrestore(&iommu->lock, flags);
2980
2981         kfree(ctxt_tbls);
2982
2983         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2984
2985         ret = 0;
2986
2987 out_unmap:
2988         iounmap(old_rt);
2989
2990         return ret;
2991 }
2992
2993 static int __init init_dmars(void)
2994 {
2995         struct dmar_drhd_unit *drhd;
2996         struct dmar_rmrr_unit *rmrr;
2997         bool copied_tables = false;
2998         struct device *dev;
2999         struct intel_iommu *iommu;
3000         int i, ret;
3001
3002         /*
3003          * for each drhd
3004          *    allocate root
3005          *    initialize and program root entry to not present
3006          * endfor
3007          */
3008         for_each_drhd_unit(drhd) {
3009                 /*
3010                  * lock not needed as this is only incremented in the
3011                  * single-threaded kernel __init code path; all other
3012                  * accesses are read-only
3013                  */
3014                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3015                         g_num_of_iommus++;
3016                         continue;
3017                 }
3018                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3019         }
3020
3021         /* Preallocate enough resources for IOMMU hot-addition */
3022         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3023                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3024
3025         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3026                         GFP_KERNEL);
3027         if (!g_iommus) {
3028                 pr_err("Allocating global iommu array failed\n");
3029                 ret = -ENOMEM;
3030                 goto error;
3031         }
3032
3033         deferred_flush = kzalloc(g_num_of_iommus *
3034                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
3035         if (!deferred_flush) {
3036                 ret = -ENOMEM;
3037                 goto free_g_iommus;
3038         }
3039
3040         for_each_active_iommu(iommu, drhd) {
3041                 g_iommus[iommu->seq_id] = iommu;
3042
3043                 intel_iommu_init_qi(iommu);
3044
3045                 ret = iommu_init_domains(iommu);
3046                 if (ret)
3047                         goto free_iommu;
3048
3049                 init_translation_status(iommu);
3050
3051                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3052                         iommu_disable_translation(iommu);
3053                         clear_translation_pre_enabled(iommu);
3054                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3055                                 iommu->name);
3056                 }
3057
3058                 /*
3059                  * TBD:
3060                  * we could share the same root & context tables
3061                  * among all IOMMUs; split this out later.
3062                  */
3063                 ret = iommu_alloc_root_entry(iommu);
3064                 if (ret)
3065                         goto free_iommu;
3066
3067                 if (translation_pre_enabled(iommu)) {
3068                         pr_info("Translation already enabled - trying to copy translation structures\n");
3069
3070                         ret = copy_translation_tables(iommu);
3071                         if (ret) {
3072                                 /*
3073                                  * We found the IOMMU with translation
3074                                  * enabled - but failed to copy over the
3075                                  * old root-entry table. Try to proceed
3076                                  * by disabling translation now and
3077                                  * allocating a clean root-entry table.
3078                                  * This might cause DMAR faults, but
3079                                  * probably the dump will still succeed.
3080                                  */
3081                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3082                                        iommu->name);
3083                                 iommu_disable_translation(iommu);
3084                                 clear_translation_pre_enabled(iommu);
3085                         } else {
3086                                 pr_info("Copied translation tables from previous kernel for %s\n",
3087                                         iommu->name);
3088                                 copied_tables = true;
3089                         }
3090                 }
3091
3092                 iommu_flush_write_buffer(iommu);
3093                 iommu_set_root_entry(iommu);
3094                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3095                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3096
3097                 if (!ecap_pass_through(iommu->ecap))
3098                         hw_pass_through = 0;
3099         }
3100
3101         if (iommu_pass_through)
3102                 iommu_identity_mapping |= IDENTMAP_ALL;
3103
3104 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3105         iommu_identity_mapping |= IDENTMAP_GFX;
3106 #endif
3107
3108         if (iommu_identity_mapping) {
3109                 ret = si_domain_init(hw_pass_through);
3110                 if (ret)
3111                         goto free_iommu;
3112         }
3113
3114         check_tylersburg_isoch();
3115
3116         /*
3117          * If we copied translations from a previous kernel in the kdump
3118          * case, we cannot assign the devices to domains now, as that
3119          * would eliminate the old mappings. So skip this part and defer
3120          * the assignment to device driver initialization time.
3121          */
3122         if (copied_tables)
3123                 goto domains_done;
3124
3125         /*
3126          * If pass-through is not set or not enabled, set up context entries
3127          * for identity mappings of rmrr, gfx and isa devices, and fall back
3128          * to the static identity mapping if iommu_identity_mapping is set.
3129          */
3130         if (iommu_identity_mapping) {
3131                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3132                 if (ret) {
3133                         pr_crit("Failed to setup IOMMU pass-through\n");
3134                         goto free_iommu;
3135                 }
3136         }
3137         /*
3138          * For each rmrr
3139          *   for each dev attached to rmrr
3140          *   do
3141          *     locate drhd for dev, alloc domain for dev
3142          *     allocate free domain
3143          *     allocate page table entries for rmrr
3144          *     if context not allocated for bus
3145          *           allocate and init context
3146          *           set present in root table for this bus
3147          *     init context with domain, translation etc
3148          *    endfor
3149          * endfor
3150          */
3151         pr_info("Setting RMRR:\n");
3152         for_each_rmrr_units(rmrr) {
3153                 /* some BIOSes list non-existent devices in the DMAR table. */
3154                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3155                                           i, dev) {
3156                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3157                         if (ret)
3158                                 pr_err("Mapping reserved region failed\n");
3159                 }
3160         }
3161
3162         iommu_prepare_isa();
3163
3164 domains_done:
3165
3166         /*
3167          * for each drhd
3168          *   enable fault log
3169          *   global invalidate context cache
3170          *   global invalidate iotlb
3171          *   enable translation
3172          */
3173         for_each_iommu(iommu, drhd) {
3174                 if (drhd->ignored) {
3175                         /*
3176                          * we always have to disable PMRs or DMA may fail on
3177                          * this device
3178                          */
3179                         if (force_on)
3180                                 iommu_disable_protect_mem_regions(iommu);
3181                         continue;
3182                 }
3183
3184                 iommu_flush_write_buffer(iommu);
3185
3186                 ret = dmar_set_interrupt(iommu);
3187                 if (ret)
3188                         goto free_iommu;
3189
3190                 if (!translation_pre_enabled(iommu))
3191                         iommu_enable_translation(iommu);
3192
3193                 iommu_disable_protect_mem_regions(iommu);
3194         }
3195
3196         return 0;
3197
3198 free_iommu:
3199         for_each_active_iommu(iommu, drhd) {
3200                 disable_dmar_iommu(iommu);
3201                 free_dmar_iommu(iommu);
3202         }
3203         kfree(deferred_flush);
3204 free_g_iommus:
3205         kfree(g_iommus);
3206 error:
3207         return ret;
3208 }
3209
3210 /* This takes a number of _MM_ pages, not VTD pages */
3211 static struct iova *intel_alloc_iova(struct device *dev,
3212                                      struct dmar_domain *domain,
3213                                      unsigned long nrpages, uint64_t dma_mask)
3214 {
3215         struct iova *iova = NULL;
3216
3217         /* Restrict dma_mask to the width that the iommu can handle */
3218         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3219         /* Ensure we reserve the whole size-aligned region */
3220         nrpages = __roundup_pow_of_two(nrpages);
3221
3222         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3223                 /*
3224                  * First try to allocate an io virtual address in
3225                  * DMA_BIT_MASK(32) and if that fails then try allocating
3226                  * from higher range
3227                  */
3228                 iova = alloc_iova(&domain->iovad, nrpages,
3229                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
3230                 if (iova)
3231                         return iova;
3232         }
3233         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3234         if (unlikely(!iova)) {
3235                 pr_err("Allocating %ld-page iova for %s failed\n",
3236                        nrpages, dev_name(dev));
3237                 return NULL;
3238         }
3239
3240         return iova;
3241 }
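/*
 * Worked example for the allocator above: a 3-page request is rounded up to
 * 4 pages so that the whole size-aligned region is reserved.  Unless
 * dmar_forcedac is set or the device's DMA mask already fits in 32 bits, the
 * IOVA is first sought below 4GiB and only then anywhere under the
 * (domain-clamped) mask:
 *
 *      iova = intel_alloc_iova(dev, domain, 3, DMA_BIT_MASK(64));
 *          rounds nrpages 3 -> 4, tries IOVA_PFN(DMA_BIT_MASK(32)) first,
 *          then IOVA_PFN(dma_mask) as the fallback.
 */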
3242
3243 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3244 {
3245         struct dmar_domain *domain;
3246
3247         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3248         if (!domain) {
3249                 pr_err("Allocating domain for %s failed\n",
3250                        dev_name(dev));
3251                 return NULL;
3252         }
3253
3254         return domain;
3255 }
3256
3257 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3258 {
3259         struct device_domain_info *info;
3260
3261         /* No lock here, assumes no domain exit in normal case */
3262         info = dev->archdata.iommu;
3263         if (likely(info))
3264                 return info->domain;
3265
3266         return __get_valid_domain_for_dev(dev);
3267 }
3268
3269 /* Check if the dev needs to go through non-identity map and unmap process. */
3270 static int iommu_no_mapping(struct device *dev)
3271 {
3272         int found;
3273
3274         if (iommu_dummy(dev))
3275                 return 1;
3276
3277         if (!iommu_identity_mapping)
3278                 return 0;
3279
3280         found = identity_mapping(dev);
3281         if (found) {
3282                 if (iommu_should_identity_map(dev, 0))
3283                         return 1;
3284                 else {
3285                         /*
3286                          * The device is 32 bit DMA only: remove it from si_domain
3287                          * and fall back to non-identity mapping.
3288                          */
3289                         dmar_remove_one_dev_info(si_domain, dev);
3290                         pr_info("32bit %s uses non-identity mapping\n",
3291                                 dev_name(dev));
3292                         return 0;
3293                 }
3294         } else {
3295                 /*
3296                  * A 64 bit DMA device detached from a VM is put back into
3297                  * si_domain for identity mapping.
3298                  */
3299                 if (iommu_should_identity_map(dev, 0)) {
3300                         int ret;
3301                         ret = domain_add_dev_info(si_domain, dev);
3302                         if (!ret) {
3303                                 pr_info("64bit %s uses identity mapping\n",
3304                                         dev_name(dev));
3305                                 return 1;
3306                         }
3307                 }
3308         }
3309
3310         return 0;
3311 }
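/*
 * In short, the helper above returns 1 when DMA for this device bypasses
 * translation (dummy devices, or devices that are and should stay identity
 * mapped) and 0 when it must go through the remapping path.  As a side
 * effect it migrates devices in and out of si_domain whenever the answer
 * from iommu_should_identity_map() no longer matches their current state.
 */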
3312
3313 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3314                                      size_t size, int dir, u64 dma_mask)
3315 {
3316         struct dmar_domain *domain;
3317         phys_addr_t start_paddr;
3318         struct iova *iova;
3319         int prot = 0;
3320         int ret;
3321         struct intel_iommu *iommu;
3322         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3323
3324         BUG_ON(dir == DMA_NONE);
3325
3326         if (iommu_no_mapping(dev))
3327                 return paddr;
3328
3329         domain = get_valid_domain_for_dev(dev);
3330         if (!domain)
3331                 return 0;
3332
3333         iommu = domain_get_iommu(domain);
3334         size = aligned_nrpages(paddr, size);
3335
3336         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3337         if (!iova)
3338                 goto error;
3339
3340         /*
3341          * Check if DMAR supports zero-length reads on write only
3342          * mappings.
3343          */
3344         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3345                         !cap_zlr(iommu->cap))
3346                 prot |= DMA_PTE_READ;
3347         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3348                 prot |= DMA_PTE_WRITE;
3349         /*
3350          * The range paddr .. paddr + size might cover only part of a page;
3351          * map the whole page.  Note: if two parts of one page are mapped
3352          * separately, two guest addresses may map to the same host paddr,
3353          * but this is not a big problem.
3354          */
3355         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3356                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3357         if (ret)
3358                 goto error;
3359
3360         /* it's a non-present to present mapping. Only flush if caching mode */
3361         if (cap_caching_mode(iommu->cap))
3362                 iommu_flush_iotlb_psi(iommu, domain,
3363                                       mm_to_dma_pfn(iova->pfn_lo),
3364                                       size, 0, 1);
3365         else
3366                 iommu_flush_write_buffer(iommu);
3367
3368         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3369         start_paddr += paddr & ~PAGE_MASK;
3370         return start_paddr;
3371
3372 error:
3373         if (iova)
3374                 __free_iova(&domain->iovad, iova);
3375         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3376                 dev_name(dev), size, (unsigned long long)paddr, dir);
3377         return 0;
3378 }
3379
3380 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3381                                  unsigned long offset, size_t size,
3382                                  enum dma_data_direction dir,
3383                                  struct dma_attrs *attrs)
3384 {
3385         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3386                                   dir, *dev->dma_mask);
3387 }
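/*
 * A rough sketch of how a streaming mapping reaches the two functions above
 * through the DMA API (illustrative, not an exhaustive call chain):
 *
 *      dma_map_single(dev, buf, len, DMA_TO_DEVICE)
 *          -> ops->map_page == intel_map_page()
 *          -> __intel_map_single(), which allocates an IOVA range, installs
 *             the page-table entries and returns
 *             ((phys_addr_t)iova->pfn_lo << PAGE_SHIFT) + (paddr & ~PAGE_MASK)
 *
 * so the handle given back to the driver is a bus address inside the
 * device's DMAR domain, not the buffer's physical address.
 */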
3388
3389 static void flush_unmaps(void)
3390 {
3391         int i, j;
3392
3393         timer_on = 0;
3394
3395         /* just flush them all */
3396         for (i = 0; i < g_num_of_iommus; i++) {
3397                 struct intel_iommu *iommu = g_iommus[i];
3398                 if (!iommu)
3399                         continue;
3400
3401                 if (!deferred_flush[i].next)
3402                         continue;
3403
3404                 /* In caching mode, global flushes make emulation expensive */
3405                 if (!cap_caching_mode(iommu->cap))
3406                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3407                                          DMA_TLB_GLOBAL_FLUSH);
3408                 for (j = 0; j < deferred_flush[i].next; j++) {
3409                         unsigned long mask;
3410                         struct iova *iova = deferred_flush[i].iova[j];
3411                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3412
3413                         /* On real hardware multiple invalidations are expensive */
3414                         if (cap_caching_mode(iommu->cap))
3415                                 iommu_flush_iotlb_psi(iommu, domain,
3416                                         iova->pfn_lo, iova_size(iova),
3417                                         !deferred_flush[i].freelist[j], 0);
3418                         else {
3419                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3420                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3421                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3422                         }
3423                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3424                         if (deferred_flush[i].freelist[j])
3425                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3426                 }
3427                 deferred_flush[i].next = 0;
3428         }
3429
3430         list_size = 0;
3431 }
3432
3433 static void flush_unmaps_timeout(unsigned long data)
3434 {
3435         unsigned long flags;
3436
3437         spin_lock_irqsave(&async_umap_flush_lock, flags);
3438         flush_unmaps();
3439         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3440 }
3441
3442 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3443 {
3444         unsigned long flags;
3445         int next, iommu_id;
3446         struct intel_iommu *iommu;
3447
3448         spin_lock_irqsave(&async_umap_flush_lock, flags);
3449         if (list_size == HIGH_WATER_MARK)
3450                 flush_unmaps();
3451
3452         iommu = domain_get_iommu(dom);
3453         iommu_id = iommu->seq_id;
3454
3455         next = deferred_flush[iommu_id].next;
3456         deferred_flush[iommu_id].domain[next] = dom;
3457         deferred_flush[iommu_id].iova[next] = iova;
3458         deferred_flush[iommu_id].freelist[next] = freelist;
3459         deferred_flush[iommu_id].next++;
3460
3461         if (!timer_on) {
3462                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3463                 timer_on = 1;
3464         }
3465         list_size++;
3466         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3467 }
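/*
 * Together, flush_unmaps() and add_unmap() implement the lazy unmap path:
 * freed IOVAs are queued per IOMMU under async_umap_flush_lock, and the
 * queue is drained either when list_size reaches HIGH_WATER_MARK or when
 * the 10ms unmap_timer fires, so many unmaps share one IOTLB invalidation
 * instead of paying for a flush on every unmap.
 */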
3468
3469 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3470 {
3471         struct dmar_domain *domain;
3472         unsigned long start_pfn, last_pfn;
3473         struct iova *iova;
3474         struct intel_iommu *iommu;
3475         struct page *freelist;
3476
3477         if (iommu_no_mapping(dev))
3478                 return;
3479
3480         domain = find_domain(dev);
3481         BUG_ON(!domain);
3482
3483         iommu = domain_get_iommu(domain);
3484
3485         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3486         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3487                       (unsigned long long)dev_addr))
3488                 return;
3489
3490         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3491         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3492
3493         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3494                  dev_name(dev), start_pfn, last_pfn);
3495
3496         freelist = domain_unmap(domain, start_pfn, last_pfn);
3497
3498         if (intel_iommu_strict) {
3499                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3500                                       last_pfn - start_pfn + 1, !freelist, 0);
3501                 /* free iova */
3502                 __free_iova(&domain->iovad, iova);
3503                 dma_free_pagelist(freelist);
3504         } else {
3505                 add_unmap(domain, iova, freelist);
3506                 /*
3507                  * Queue up the release of the unmap to save the roughly 1/6th
3508                  * of CPU time otherwise spent on the iotlb flush operation.
3509                  */
3510         }
3511 }
3512
3513 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3514                              size_t size, enum dma_data_direction dir,
3515                              struct dma_attrs *attrs)
3516 {
3517         intel_unmap(dev, dev_addr);
3518 }
3519
3520 static void *intel_alloc_coherent(struct device *dev, size_t size,
3521                                   dma_addr_t *dma_handle, gfp_t flags,
3522                                   struct dma_attrs *attrs)
3523 {
3524         struct page *page = NULL;
3525         int order;
3526
3527         size = PAGE_ALIGN(size);
3528         order = get_order(size);
3529
3530         if (!iommu_no_mapping(dev))
3531                 flags &= ~(GFP_DMA | GFP_DMA32);
3532         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3533                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3534                         flags |= GFP_DMA;
3535                 else
3536                         flags |= GFP_DMA32;
3537         }
3538
3539         if (flags & __GFP_WAIT) {
3540                 unsigned int count = size >> PAGE_SHIFT;
3541
3542                 page = dma_alloc_from_contiguous(dev, count, order);
3543                 if (page && iommu_no_mapping(dev) &&
3544                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3545                         dma_release_from_contiguous(dev, page, count);
3546                         page = NULL;
3547                 }
3548         }
3549
3550         if (!page)
3551                 page = alloc_pages(flags, order);
3552         if (!page)
3553                 return NULL;
3554         memset(page_address(page), 0, size);
3555
3556         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3557                                          DMA_BIDIRECTIONAL,
3558                                          dev->coherent_dma_mask);
3559         if (*dma_handle)
3560                 return page_address(page);
3561         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3562                 __free_pages(page, order);
3563
3564         return NULL;
3565 }
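/*
 * Allocation strategy above, in brief: for blocking allocations the buffer
 * is taken from CMA (dma_alloc_from_contiguous) when possible, otherwise
 * from alloc_pages(); it is zeroed and mapped bidirectionally through
 * __intel_map_single() against the coherent DMA mask.  GFP_DMA/GFP_DMA32
 * only matter when the IOMMU is bypassed, since with remapping it is the
 * IOVA, not the physical address, that has to satisfy the mask.  A typical
 * caller path (illustrative):
 *
 *      buf = dma_alloc_coherent(&pdev->dev, size, &dma_handle, GFP_KERNEL);
 *          -> ops->alloc == intel_alloc_coherent()
 */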
3566
3567 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3568                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3569 {
3570         int order;
3571         struct page *page = virt_to_page(vaddr);
3572
3573         size = PAGE_ALIGN(size);
3574         order = get_order(size);
3575
3576         intel_unmap(dev, dma_handle);
3577         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3578                 __free_pages(page, order);
3579 }
3580
3581 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3582                            int nelems, enum dma_data_direction dir,
3583                            struct dma_attrs *attrs)
3584 {
3585         intel_unmap(dev, sglist[0].dma_address);
3586 }
3587
3588 static int intel_nontranslate_map_sg(struct device *hddev,
3589         struct scatterlist *sglist, int nelems, int dir)
3590 {
3591         int i;
3592         struct scatterlist *sg;
3593
3594         for_each_sg(sglist, sg, nelems, i) {
3595                 BUG_ON(!sg_page(sg));
3596                 sg->dma_address = sg_phys(sg);
3597                 sg->dma_length = sg->length;
3598         }
3599         return nelems;
3600 }
3601
3602 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3603                         enum dma_data_direction dir, struct dma_attrs *attrs)
3604 {
3605         int i;
3606         struct dmar_domain *domain;
3607         size_t size = 0;
3608         int prot = 0;
3609         struct iova *iova = NULL;
3610         int ret;
3611         struct scatterlist *sg;
3612         unsigned long start_vpfn;
3613         struct intel_iommu *iommu;
3614
3615         BUG_ON(dir == DMA_NONE);
3616         if (iommu_no_mapping(dev))
3617                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3618
3619         domain = get_valid_domain_for_dev(dev);
3620         if (!domain)
3621                 return 0;
3622
3623         iommu = domain_get_iommu(domain);
3624
3625         for_each_sg(sglist, sg, nelems, i)
3626                 size += aligned_nrpages(sg->offset, sg->length);
3627
3628         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3629                                 *dev->dma_mask);
3630         if (!iova) {
3631                 sglist->dma_length = 0;
3632                 return 0;
3633         }
3634
3635         /*
3636          * Check if DMAR supports zero-length reads on write only
3637          * mappings.
3638          */
3639         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3640                         !cap_zlr(iommu->cap))
3641                 prot |= DMA_PTE_READ;
3642         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3643                 prot |= DMA_PTE_WRITE;
3644
3645         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3646
3647         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3648         if (unlikely(ret)) {
3649                 dma_pte_free_pagetable(domain, start_vpfn,
3650                                        start_vpfn + size - 1);
3651                 __free_iova(&domain->iovad, iova);
3652                 return 0;
3653         }
3654
3655         /* it's a non-present to present mapping. Only flush if caching mode */
3656         if (cap_caching_mode(iommu->cap))
3657                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3658         else
3659                 iommu_flush_write_buffer(iommu);
3660
3661         return nelems;
3662 }
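/*
 * Note on the scatterlist path above: the whole list is mapped into one
 * contiguous IOVA range whose length is the sum of aligned_nrpages() over
 * all segments, so a single allocation and (in caching mode) a single PSI
 * flush cover every element.  On mapping failure the partially built page
 * tables are torn down, the IOVA is released and 0 is returned, which the
 * DMA API treats as an error.
 */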
3663
3664 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3665 {
3666         return !dma_addr;
3667 }
3668
3669 struct dma_map_ops intel_dma_ops = {
3670         .alloc = intel_alloc_coherent,
3671         .free = intel_free_coherent,
3672         .map_sg = intel_map_sg,
3673         .unmap_sg = intel_unmap_sg,
3674         .map_page = intel_map_page,
3675         .unmap_page = intel_unmap_page,
3676         .mapping_error = intel_mapping_error,
3677 };
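/*
 * This ops table is installed as the global dma_ops in intel_iommu_init()
 * below, so once the driver is up, ordinary DMA API calls on devices behind
 * VT-d (dma_map_page(), dma_map_sg(), dma_alloc_coherent(), ...) are routed
 * to the intel_* callbacks above.
 */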
3678
3679 static inline int iommu_domain_cache_init(void)
3680 {
3681         int ret = 0;
3682
3683         iommu_domain_cache = kmem_cache_create("iommu_domain",
3684                                          sizeof(struct dmar_domain),
3685                                          0,
3686                                          SLAB_HWCACHE_ALIGN,
3687
3688                                          NULL);
3689         if (!iommu_domain_cache) {
3690                 pr_err("Couldn't create iommu_domain cache\n");
3691                 ret = -ENOMEM;
3692         }
3693
3694         return ret;
3695 }
3696
3697 static inline int iommu_devinfo_cache_init(void)
3698 {
3699         int ret = 0;
3700
3701         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3702                                          sizeof(struct device_domain_info),
3703                                          0,
3704                                          SLAB_HWCACHE_ALIGN,
3705                                          NULL);
3706         if (!iommu_devinfo_cache) {
3707                 pr_err("Couldn't create devinfo cache\n");
3708                 ret = -ENOMEM;
3709         }
3710
3711         return ret;
3712 }
3713
3714 static int __init iommu_init_mempool(void)
3715 {
3716         int ret;
3717         ret = iova_cache_get();
3718         if (ret)
3719                 return ret;
3720
3721         ret = iommu_domain_cache_init();
3722         if (ret)
3723                 goto domain_error;
3724
3725         ret = iommu_devinfo_cache_init();
3726         if (!ret)
3727                 return ret;
3728
3729         kmem_cache_destroy(iommu_domain_cache);
3730 domain_error:
3731         iova_cache_put();
3732
3733         return -ENOMEM;
3734 }
3735
3736 static void __init iommu_exit_mempool(void)
3737 {
3738         kmem_cache_destroy(iommu_devinfo_cache);
3739         kmem_cache_destroy(iommu_domain_cache);
3740         iova_cache_put();
3741 }
3742
3743 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3744 {
3745         struct dmar_drhd_unit *drhd;
3746         u32 vtbar;
3747         int rc;
3748
3749         /* We know that this device on this chipset has its own IOMMU.
3750          * If we find it under a different IOMMU, then the BIOS is lying
3751          * to us. Hope that the IOMMU for this device is actually
3752          * disabled, and it needs no translation...
3753          */
3754         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3755         if (rc) {
3756                 /* "can't" happen */
3757                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3758                 return;
3759         }
3760         vtbar &= 0xffff0000;
3761
3762         /* we know that this iommu should be at offset 0xa000 from vtbar */
3763         drhd = dmar_find_matched_drhd_unit(pdev);
3764         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3765                             TAINT_FIRMWARE_WORKAROUND,
3766                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3767                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3768 }
3769 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3770
3771 static void __init init_no_remapping_devices(void)
3772 {
3773         struct dmar_drhd_unit *drhd;
3774         struct device *dev;
3775         int i;
3776
3777         for_each_drhd_unit(drhd) {
3778                 if (!drhd->include_all) {
3779                         for_each_active_dev_scope(drhd->devices,
3780                                                   drhd->devices_cnt, i, dev)
3781                                 break;
3782                         /* ignore DMAR unit if no devices exist */
3783                         if (i == drhd->devices_cnt)
3784                                 drhd->ignored = 1;
3785                 }
3786         }
3787
3788         for_each_active_drhd_unit(drhd) {
3789                 if (drhd->include_all)
3790                         continue;
3791
3792                 for_each_active_dev_scope(drhd->devices,
3793                                           drhd->devices_cnt, i, dev)
3794                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3795                                 break;
3796                 if (i < drhd->devices_cnt)
3797                         continue;
3798
3799                 /* This IOMMU has *only* gfx devices. Either bypass it or
3800                    set the gfx_mapped flag, as appropriate */
3801                 if (dmar_map_gfx) {
3802                         intel_iommu_gfx_mapped = 1;
3803                 } else {
3804                         drhd->ignored = 1;
3805                         for_each_active_dev_scope(drhd->devices,
3806                                                   drhd->devices_cnt, i, dev)
3807                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3808                 }
3809         }
3810 }
3811
3812 #ifdef CONFIG_SUSPEND
3813 static int init_iommu_hw(void)
3814 {
3815         struct dmar_drhd_unit *drhd;
3816         struct intel_iommu *iommu = NULL;
3817
3818         for_each_active_iommu(iommu, drhd)
3819                 if (iommu->qi)
3820                         dmar_reenable_qi(iommu);
3821
3822         for_each_iommu(iommu, drhd) {
3823                 if (drhd->ignored) {
3824                         /*
3825                          * we always have to disable PMRs or DMA may fail on
3826                          * this device
3827                          */
3828                         if (force_on)
3829                                 iommu_disable_protect_mem_regions(iommu);
3830                         continue;
3831                 }
3832
3833                 iommu_flush_write_buffer(iommu);
3834
3835                 iommu_set_root_entry(iommu);
3836
3837                 iommu->flush.flush_context(iommu, 0, 0, 0,
3838                                            DMA_CCMD_GLOBAL_INVL);
3839                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3840                 iommu_enable_translation(iommu);
3841                 iommu_disable_protect_mem_regions(iommu);
3842         }
3843
3844         return 0;
3845 }
3846
3847 static void iommu_flush_all(void)
3848 {
3849         struct dmar_drhd_unit *drhd;
3850         struct intel_iommu *iommu;
3851
3852         for_each_active_iommu(iommu, drhd) {
3853                 iommu->flush.flush_context(iommu, 0, 0, 0,
3854                                            DMA_CCMD_GLOBAL_INVL);
3855                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3856                                          DMA_TLB_GLOBAL_FLUSH);
3857         }
3858 }
3859
3860 static int iommu_suspend(void)
3861 {
3862         struct dmar_drhd_unit *drhd;
3863         struct intel_iommu *iommu = NULL;
3864         unsigned long flag;
3865
3866         for_each_active_iommu(iommu, drhd) {
3867                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3868                                                  GFP_ATOMIC);
3869                 if (!iommu->iommu_state)
3870                         goto nomem;
3871         }
3872
3873         iommu_flush_all();
3874
3875         for_each_active_iommu(iommu, drhd) {
3876                 iommu_disable_translation(iommu);
3877
3878                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3879
3880                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3881                         readl(iommu->reg + DMAR_FECTL_REG);
3882                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3883                         readl(iommu->reg + DMAR_FEDATA_REG);
3884                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3885                         readl(iommu->reg + DMAR_FEADDR_REG);
3886                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3887                         readl(iommu->reg + DMAR_FEUADDR_REG);
3888
3889                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3890         }
3891         return 0;
3892
3893 nomem:
3894         for_each_active_iommu(iommu, drhd)
3895                 kfree(iommu->iommu_state);
3896
3897         return -ENOMEM;
3898 }
3899
3900 static void iommu_resume(void)
3901 {
3902         struct dmar_drhd_unit *drhd;
3903         struct intel_iommu *iommu = NULL;
3904         unsigned long flag;
3905
3906         if (init_iommu_hw()) {
3907                 if (force_on)
3908                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3909                 else
3910                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3911                 return;
3912         }
3913
3914         for_each_active_iommu(iommu, drhd) {
3915
3916                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3917
3918                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3919                         iommu->reg + DMAR_FECTL_REG);
3920                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3921                         iommu->reg + DMAR_FEDATA_REG);
3922                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3923                         iommu->reg + DMAR_FEADDR_REG);
3924                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3925                         iommu->reg + DMAR_FEUADDR_REG);
3926
3927                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3928         }
3929
3930         for_each_active_iommu(iommu, drhd)
3931                 kfree(iommu->iommu_state);
3932 }
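/*
 * Suspend/resume, in brief: iommu_suspend() flushes all context and IOTLB
 * caches, disables translation and saves the four fault-event programming
 * registers (FECTL, FEDATA, FEADDR, FEUADDR) for each IOMMU; iommu_resume()
 * first re-initializes the hardware via init_iommu_hw() (root entry, global
 * invalidations, translation enable) and then writes the saved fault-event
 * registers back.
 */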
3933
3934 static struct syscore_ops iommu_syscore_ops = {
3935         .resume         = iommu_resume,
3936         .suspend        = iommu_suspend,
3937 };
3938
3939 static void __init init_iommu_pm_ops(void)
3940 {
3941         register_syscore_ops(&iommu_syscore_ops);
3942 }
3943
3944 #else
3945 static inline void init_iommu_pm_ops(void) {}
3946 #endif  /* CONFIG_SUSPEND */
3947
3948
3949 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3950 {
3951         struct acpi_dmar_reserved_memory *rmrr;
3952         struct dmar_rmrr_unit *rmrru;
3953
3954         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3955         if (!rmrru)
3956                 return -ENOMEM;
3957
3958         rmrru->hdr = header;
3959         rmrr = (struct acpi_dmar_reserved_memory *)header;
3960         rmrru->base_address = rmrr->base_address;
3961         rmrru->end_address = rmrr->end_address;
3962         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3963                                 ((void *)rmrr) + rmrr->header.length,
3964                                 &rmrru->devices_cnt);
3965         if (rmrru->devices_cnt && rmrru->devices == NULL) {
3966                 kfree(rmrru);
3967                 return -ENOMEM;
3968         }
3969
3970         list_add(&rmrru->list, &dmar_rmrr_units);
3971
3972         return 0;
3973 }
3974
3975 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3976 {
3977         struct dmar_atsr_unit *atsru;
3978         struct acpi_dmar_atsr *tmp;
3979
3980         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3981                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3982                 if (atsr->segment != tmp->segment)
3983                         continue;
3984                 if (atsr->header.length != tmp->header.length)
3985                         continue;
3986                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3987                         return atsru;
3988         }
3989
3990         return NULL;
3991 }
3992
3993 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3994 {
3995         struct acpi_dmar_atsr *atsr;
3996         struct dmar_atsr_unit *atsru;
3997
3998         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3999                 return 0;
4000
4001         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4002         atsru = dmar_find_atsr(atsr);
4003         if (atsru)
4004                 return 0;
4005
4006         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4007         if (!atsru)
4008                 return -ENOMEM;
4009
4010         /*
4011          * If memory is allocated from slab by ACPI _DSM method, we need to
4012          * copy the memory content because the memory buffer will be freed
4013          * on return.
4014          */
4015         atsru->hdr = (void *)(atsru + 1);
4016         memcpy(atsru->hdr, hdr, hdr->length);
4017         atsru->include_all = atsr->flags & 0x1;
4018         if (!atsru->include_all) {
4019                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4020                                 (void *)atsr + atsr->header.length,
4021                                 &atsru->devices_cnt);
4022                 if (atsru->devices_cnt && atsru->devices == NULL) {
4023                         kfree(atsru);
4024                         return -ENOMEM;
4025                 }
4026         }
4027
4028         list_add_rcu(&atsru->list, &dmar_atsr_units);
4029
4030         return 0;
4031 }
4032
4033 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4034 {
4035         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4036         kfree(atsru);
4037 }
4038
4039 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4040 {
4041         struct acpi_dmar_atsr *atsr;
4042         struct dmar_atsr_unit *atsru;
4043
4044         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4045         atsru = dmar_find_atsr(atsr);
4046         if (atsru) {
4047                 list_del_rcu(&atsru->list);
4048                 synchronize_rcu();
4049                 intel_iommu_free_atsr(atsru);
4050         }
4051
4052         return 0;
4053 }
4054
4055 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4056 {
4057         int i;
4058         struct device *dev;
4059         struct acpi_dmar_atsr *atsr;
4060         struct dmar_atsr_unit *atsru;
4061
4062         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4063         atsru = dmar_find_atsr(atsr);
4064         if (!atsru)
4065                 return 0;
4066
4067         if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
4068                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4069                                           i, dev)
4070                         return -EBUSY;
4071
4072         return 0;
4073 }
4074
4075 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4076 {
4077         int sp, ret = 0;
4078         struct intel_iommu *iommu = dmaru->iommu;
4079
4080         if (g_iommus[iommu->seq_id])
4081                 return 0;
4082
4083         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4084                 pr_warn("%s: Doesn't support hardware pass through.\n",
4085                         iommu->name);
4086                 return -ENXIO;
4087         }
4088         if (!ecap_sc_support(iommu->ecap) &&
4089             domain_update_iommu_snooping(iommu)) {
4090                 pr_warn("%s: Doesn't support snooping.\n",
4091                         iommu->name);
4092                 return -ENXIO;
4093         }
4094         sp = domain_update_iommu_superpage(iommu) - 1;
4095         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4096                 pr_warn("%s: Doesn't support large page.\n",
4097                         iommu->name);
4098                 return -ENXIO;
4099         }
4100
4101         /*
4102          * Disable translation if already enabled prior to OS handover.
4103          */
4104         if (iommu->gcmd & DMA_GCMD_TE)
4105                 iommu_disable_translation(iommu);
4106
4107         g_iommus[iommu->seq_id] = iommu;
4108         ret = iommu_init_domains(iommu);
4109         if (ret == 0)
4110                 ret = iommu_alloc_root_entry(iommu);
4111         if (ret)
4112                 goto out;
4113
4114         if (dmaru->ignored) {
4115                 /*
4116                  * we always have to disable PMRs or DMA may fail on this device
4117                  */
4118                 if (force_on)
4119                         iommu_disable_protect_mem_regions(iommu);
4120                 return 0;
4121         }
4122
4123         intel_iommu_init_qi(iommu);
4124         iommu_flush_write_buffer(iommu);
4125         ret = dmar_set_interrupt(iommu);
4126         if (ret)
4127                 goto disable_iommu;
4128
4129         iommu_set_root_entry(iommu);
4130         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4131         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4132         iommu_enable_translation(iommu);
4133
4134         iommu_disable_protect_mem_regions(iommu);
4135         return 0;
4136
4137 disable_iommu:
4138         disable_dmar_iommu(iommu);
4139 out:
4140         free_dmar_iommu(iommu);
4141         return ret;
4142 }
4143
4144 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4145 {
4146         int ret = 0;
4147         struct intel_iommu *iommu = dmaru->iommu;
4148
4149         if (!intel_iommu_enabled)
4150                 return 0;
4151         if (iommu == NULL)
4152                 return -EINVAL;
4153
4154         if (insert) {
4155                 ret = intel_iommu_add(dmaru);
4156         } else {
4157                 disable_dmar_iommu(iommu);
4158                 free_dmar_iommu(iommu);
4159         }
4160
4161         return ret;
4162 }
4163
4164 static void intel_iommu_free_dmars(void)
4165 {
4166         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4167         struct dmar_atsr_unit *atsru, *atsr_n;
4168
4169         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4170                 list_del(&rmrru->list);
4171                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4172                 kfree(rmrru);
4173         }
4174
4175         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4176                 list_del(&atsru->list);
4177                 intel_iommu_free_atsr(atsru);
4178         }
4179 }
4180
4181 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4182 {
4183         int i, ret = 1;
4184         struct pci_bus *bus;
4185         struct pci_dev *bridge = NULL;
4186         struct device *tmp;
4187         struct acpi_dmar_atsr *atsr;
4188         struct dmar_atsr_unit *atsru;
4189
4190         dev = pci_physfn(dev);
4191         for (bus = dev->bus; bus; bus = bus->parent) {
4192                 bridge = bus->self;
4193                 if (!bridge || !pci_is_pcie(bridge) ||
4194                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4195                         return 0;
4196                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4197                         break;
4198         }
4199         if (!bridge)
4200                 return 0;
4201
4202         rcu_read_lock();
4203         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4204                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4205                 if (atsr->segment != pci_domain_nr(dev->bus))
4206                         continue;
4207
4208                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4209                         if (tmp == &bridge->dev)
4210                                 goto out;
4211
4212                 if (atsru->include_all)
4213                         goto out;
4214         }
4215         ret = 0;
4216 out:
4217         rcu_read_unlock();
4218
4219         return ret;
4220 }
4221
4222 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4223 {
4224         int ret = 0;
4225         struct dmar_rmrr_unit *rmrru;
4226         struct dmar_atsr_unit *atsru;
4227         struct acpi_dmar_atsr *atsr;
4228         struct acpi_dmar_reserved_memory *rmrr;
4229
4230         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4231                 return 0;
4232
4233         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4234                 rmrr = container_of(rmrru->hdr,
4235                                     struct acpi_dmar_reserved_memory, header);
4236                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4237                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4238                                 ((void *)rmrr) + rmrr->header.length,
4239                                 rmrr->segment, rmrru->devices,
4240                                 rmrru->devices_cnt);
4241                         if (ret < 0)
4242                                 return ret;
4243                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4244                         dmar_remove_dev_scope(info, rmrr->segment,
4245                                 rmrru->devices, rmrru->devices_cnt);
4246                 }
4247         }
4248
4249         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4250                 if (atsru->include_all)
4251                         continue;
4252
4253                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4254                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4255                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4256                                         (void *)atsr + atsr->header.length,
4257                                         atsr->segment, atsru->devices,
4258                                         atsru->devices_cnt);
4259                         if (ret > 0)
4260                                 break;
4261                         else if (ret < 0)
4262                                 return ret;
4263                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4264                         if (dmar_remove_dev_scope(info, atsr->segment,
4265                                         atsru->devices, atsru->devices_cnt))
4266                                 break;
4267                 }
4268         }
4269
4270         return 0;
4271 }
4272
4273 /*
4274  * Here we only respond to a device being unbound from its driver.
4275  *
4276  * A newly added device is not attached to its DMAR domain here yet; that
4277  * happens when the device is first mapped to an iova.
4278  */
4279 static int device_notifier(struct notifier_block *nb,
4280                                   unsigned long action, void *data)
4281 {
4282         struct device *dev = data;
4283         struct dmar_domain *domain;
4284
4285         if (iommu_dummy(dev))
4286                 return 0;
4287
4288         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4289                 return 0;
4290
4291         domain = find_domain(dev);
4292         if (!domain)
4293                 return 0;
4294
4295         dmar_remove_one_dev_info(domain, dev);
4296         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4297                 domain_exit(domain);
4298
4299         return 0;
4300 }
4301
4302 static struct notifier_block device_nb = {
4303         .notifier_call = device_notifier,
4304 };
4305
4306 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4307                                        unsigned long val, void *v)
4308 {
4309         struct memory_notify *mhp = v;
4310         unsigned long long start, end;
4311         unsigned long start_vpfn, last_vpfn;
4312
4313         switch (val) {
4314         case MEM_GOING_ONLINE:
4315                 start = mhp->start_pfn << PAGE_SHIFT;
4316                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4317                 if (iommu_domain_identity_map(si_domain, start, end)) {
4318                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4319                                 start, end);
4320                         return NOTIFY_BAD;
4321                 }
4322                 break;
4323
4324         case MEM_OFFLINE:
4325         case MEM_CANCEL_ONLINE:
4326                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4327                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4328                 while (start_vpfn <= last_vpfn) {
4329                         struct iova *iova;
4330                         struct dmar_drhd_unit *drhd;
4331                         struct intel_iommu *iommu;
4332                         struct page *freelist;
4333
4334                         iova = find_iova(&si_domain->iovad, start_vpfn);
4335                         if (iova == NULL) {
4336                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4337                                          start_vpfn);
4338                                 break;
4339                         }
4340
4341                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4342                                                      start_vpfn, last_vpfn);
4343                         if (iova == NULL) {
4344                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4345                                         start_vpfn, last_vpfn);
4346                                 return NOTIFY_BAD;
4347                         }
4348
4349                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4350                                                iova->pfn_hi);
4351
4352                         rcu_read_lock();
4353                         for_each_active_iommu(iommu, drhd)
4354                                 iommu_flush_iotlb_psi(iommu, si_domain,
4355                                         iova->pfn_lo, iova_size(iova),
4356                                         !freelist, 0);
4357                         rcu_read_unlock();
4358                         dma_free_pagelist(freelist);
4359
4360                         start_vpfn = iova->pfn_hi + 1;
4361                         free_iova_mem(iova);
4362                 }
4363                 break;
4364         }
4365
4366         return NOTIFY_OK;
4367 }
4368
4369 static struct notifier_block intel_iommu_memory_nb = {
4370         .notifier_call = intel_iommu_memory_notifier,
4371         .priority = 0
4372 };
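/*
 * The notifier above keeps the static identity map in sync with memory
 * hotplug: MEM_GOING_ONLINE adds a 1:1 mapping for the new range to
 * si_domain, while MEM_OFFLINE / MEM_CANCEL_ONLINE carves the range out of
 * the IOVA tree, unmaps it, flushes the IOTLB on every active IOMMU and
 * frees the page-table pages.
 */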
4373
4374
4375 static ssize_t intel_iommu_show_version(struct device *dev,
4376                                         struct device_attribute *attr,
4377                                         char *buf)
4378 {
4379         struct intel_iommu *iommu = dev_get_drvdata(dev);
4380         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4381         return sprintf(buf, "%d:%d\n",
4382                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4383 }
4384 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4385
4386 static ssize_t intel_iommu_show_address(struct device *dev,
4387                                         struct device_attribute *attr,
4388                                         char *buf)
4389 {
4390         struct intel_iommu *iommu = dev_get_drvdata(dev);
4391         return sprintf(buf, "%llx\n", iommu->reg_phys);
4392 }
4393 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4394
4395 static ssize_t intel_iommu_show_cap(struct device *dev,
4396                                     struct device_attribute *attr,
4397                                     char *buf)
4398 {
4399         struct intel_iommu *iommu = dev_get_drvdata(dev);
4400         return sprintf(buf, "%llx\n", iommu->cap);
4401 }
4402 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4403
4404 static ssize_t intel_iommu_show_ecap(struct device *dev,
4405                                     struct device_attribute *attr,
4406                                     char *buf)
4407 {
4408         struct intel_iommu *iommu = dev_get_drvdata(dev);
4409         return sprintf(buf, "%llx\n", iommu->ecap);
4410 }
4411 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4412
4413 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4414                                       struct device_attribute *attr,
4415                                       char *buf)
4416 {
4417         struct intel_iommu *iommu = dev_get_drvdata(dev);
4418         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4419 }
4420 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4421
4422 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4423                                            struct device_attribute *attr,
4424                                            char *buf)
4425 {
4426         struct intel_iommu *iommu = dev_get_drvdata(dev);
4427         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4428                                                   cap_ndoms(iommu->cap)));
4429 }
4430 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4431
4432 static struct attribute *intel_iommu_attrs[] = {
4433         &dev_attr_version.attr,
4434         &dev_attr_address.attr,
4435         &dev_attr_cap.attr,
4436         &dev_attr_ecap.attr,
4437         &dev_attr_domains_supported.attr,
4438         &dev_attr_domains_used.attr,
4439         NULL,
4440 };
4441
4442 static struct attribute_group intel_iommu_group = {
4443         .name = "intel-iommu",
4444         .attrs = intel_iommu_attrs,
4445 };
4446
4447 const struct attribute_group *intel_iommu_groups[] = {
4448         &intel_iommu_group,
4449         NULL,
4450 };
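/*
 * These attributes are registered per IOMMU by intel_iommu_init() below via
 * iommu_device_create(..., intel_iommu_groups, "%s", iommu->name), so each
 * DMAR unit exposes them read-only in sysfs under its iommu class device,
 * typically /sys/class/iommu/dmar<N>/intel-iommu/.
 */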
4451
4452 int __init intel_iommu_init(void)
4453 {
4454         int ret = -ENODEV;
4455         struct dmar_drhd_unit *drhd;
4456         struct intel_iommu *iommu;
4457
4458         /* VT-d is required for a TXT/tboot launch, so enforce that */
4459         force_on = tboot_force_iommu();
4460
4461         if (iommu_init_mempool()) {
4462                 if (force_on)
4463                         panic("tboot: Failed to initialize iommu memory\n");
4464                 return -ENOMEM;
4465         }
4466
4467         down_write(&dmar_global_lock);
4468         if (dmar_table_init()) {
4469                 if (force_on)
4470                         panic("tboot: Failed to initialize DMAR table\n");
4471                 goto out_free_dmar;
4472         }
4473
4474         if (dmar_dev_scope_init() < 0) {
4475                 if (force_on)
4476                         panic("tboot: Failed to initialize DMAR device scope\n");
4477                 goto out_free_dmar;
4478         }
4479
4480         if (no_iommu || dmar_disabled)
4481                 goto out_free_dmar;
4482
4483         if (list_empty(&dmar_rmrr_units))
4484                 pr_info("No RMRR found\n");
4485
4486         if (list_empty(&dmar_atsr_units))
4487                 pr_info("No ATSR found\n");
4488
4489         if (dmar_init_reserved_ranges()) {
4490                 if (force_on)
4491                         panic("tboot: Failed to reserve iommu ranges\n");
4492                 goto out_free_reserved_range;
4493         }
4494
4495         init_no_remapping_devices();
4496
4497         ret = init_dmars();
4498         if (ret) {
4499                 if (force_on)
4500                         panic("tboot: Failed to initialize DMARs\n");
4501                 pr_err("Initialization failed\n");
4502                 goto out_free_reserved_range;
4503         }
4504         up_write(&dmar_global_lock);
4505         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4506
4507         init_timer(&unmap_timer);
4508 #ifdef CONFIG_SWIOTLB
4509         swiotlb = 0;
4510 #endif
4511         dma_ops = &intel_dma_ops;
4512
4513         init_iommu_pm_ops();
4514
4515         for_each_active_iommu(iommu, drhd)
4516                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4517                                                        intel_iommu_groups,
4518                                                        "%s", iommu->name);
4519
4520         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4521         bus_register_notifier(&pci_bus_type, &device_nb);
4522         if (si_domain && !hw_pass_through)
4523                 register_memory_notifier(&intel_iommu_memory_nb);
4524
4525         intel_iommu_enabled = 1;
4526
4527         return 0;
4528
4529 out_free_reserved_range:
4530         put_iova_domain(&reserved_iova_list);
4531 out_free_dmar:
4532         intel_iommu_free_dmars();
4533         up_write(&dmar_global_lock);
4534         iommu_exit_mempool();
4535         return ret;
4536 }
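
/*
 * The no_iommu and dmar_disabled flags tested above are normally driven from
 * the kernel command line: booting with
 *
 *      intel_iommu=off
 *
 * sets dmar_disabled (parsed by intel_iommu_setup() earlier in this file),
 * while the x86 "iommu=off" parameter sets no_iommu.  Other intel_iommu=
 * options (e.g. "strict", which the Ironlake quirk near the end of this file
 * also forces) are handled by the same parser.
 */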
4537
4538 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4539 {
4540         struct intel_iommu *iommu = opaque;
4541
4542         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4543         return 0;
4544 }
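
/*
 * Illustrative decode of the alias argument: pci_for_each_dma_alias() hands
 * the callback a 16-bit requester ID, e.g. alias = 0x0310 means bus 0x03
 * (PCI_BUS_NUM(alias)) and devfn 0x10 (alias & 0xff), i.e. slot 2 function 0,
 * and that is the context entry cleared above.
 */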
4545
4546 /*
4547  * NB - intel-iommu lacks any sort of reference counting for the users of
4548  * dependent devices.  If multiple endpoints have intersecting dependent
4549  * devices, unbinding the driver from any one of them will possibly leave
4550  * the others unable to operate.
4551  */
4552 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4553 {
4554         if (!iommu || !dev || !dev_is_pci(dev))
4555                 return;
4556
4557         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4558 }
4559
4560 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4561 {
4562         struct intel_iommu *iommu;
4563         unsigned long flags;
4564
4565         assert_spin_locked(&device_domain_lock);
4566
4567         if (WARN_ON(!info))
4568                 return;
4569
4570         iommu = info->iommu;
4571
4572         if (info->dev) {
4573                 iommu_disable_dev_iotlb(info);
4574                 domain_context_clear(iommu, info->dev);
4575         }
4576
4577         unlink_domain_info(info);
4578
4579         spin_lock_irqsave(&iommu->lock, flags);
4580         domain_detach_iommu(info->domain, iommu);
4581         spin_unlock_irqrestore(&iommu->lock, flags);
4582
4583         free_devinfo_mem(info);
4584 }
4585
4586 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4587                                      struct device *dev)
4588 {
4589         struct device_domain_info *info;
4590         unsigned long flags;
4591
4592         spin_lock_irqsave(&device_domain_lock, flags);
4593         info = dev->archdata.iommu;
4594         __dmar_remove_one_dev_info(info);
4595         spin_unlock_irqrestore(&device_domain_lock, flags);
4596 }
4597
4598 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4599 {
4600         int adjust_width;
4601
4602         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4603                         DMA_32BIT_PFN);
4604         domain_reserve_special_ranges(domain);
4605
4606         /* calculate AGAW */
4607         domain->gaw = guest_width;
4608         adjust_width = guestwidth_to_adjustwidth(guest_width);
4609         domain->agaw = width_to_agaw(adjust_width);
4610
4611         domain->iommu_coherency = 0;
4612         domain->iommu_snooping = 0;
4613         domain->iommu_superpage = 0;
4614         domain->max_addr = 0;
4615
4616         /* always allocate the top pgd */
4617         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4618         if (!domain->pgd)
4619                 return -ENOMEM;
4620         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4621         return 0;
4622 }
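
/*
 * Worked example of the AGAW calculation above, assuming the usual VT-d
 * widths of 30 + 9 * agaw bits: for guest_width = 48, (48 - 12) is already a
 * multiple of the 9-bit level stride, so adjust_width stays 48 and
 * width_to_agaw(48) gives agaw 2, i.e. a 4-level page table; a guest_width of
 * 40 would be rounded up to 48 by guestwidth_to_adjustwidth() and end up with
 * the same 4-level layout.
 */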
4623
4624 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4625 {
4626         struct dmar_domain *dmar_domain;
4627         struct iommu_domain *domain;
4628
4629         if (type != IOMMU_DOMAIN_UNMANAGED)
4630                 return NULL;
4631
4632         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4633         if (!dmar_domain) {
4634                 pr_err("Can't allocate dmar_domain\n");
4635                 return NULL;
4636         }
4637         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4638                 pr_err("Domain initialization failed\n");
4639                 domain_exit(dmar_domain);
4640                 return NULL;
4641         }
4642         domain_update_iommu_cap(dmar_domain);
4643
4644         domain = &dmar_domain->domain;
4645         domain->geometry.aperture_start = 0;
4646         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4647         domain->geometry.force_aperture = true;
4648
4649         return domain;
4650 }
4651
4652 static void intel_iommu_domain_free(struct iommu_domain *domain)
4653 {
4654         domain_exit(to_dmar_domain(domain));
4655 }
4656
4657 static int intel_iommu_attach_device(struct iommu_domain *domain,
4658                                      struct device *dev)
4659 {
4660         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4661         struct intel_iommu *iommu;
4662         int addr_width;
4663         u8 bus, devfn;
4664
4665         if (device_is_rmrr_locked(dev)) {
4666                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4667                 return -EPERM;
4668         }
4669
4670         /* normally dev is not mapped */
4671         if (unlikely(domain_context_mapped(dev))) {
4672                 struct dmar_domain *old_domain;
4673
4674                 old_domain = find_domain(dev);
4675                 if (old_domain) {
4676                         rcu_read_lock();
4677                         dmar_remove_one_dev_info(old_domain, dev);
4678                         rcu_read_unlock();
4679
4680                         if (!domain_type_is_vm_or_si(old_domain) &&
4681                              list_empty(&old_domain->devices))
4682                                 domain_exit(old_domain);
4683                 }
4684         }
4685
4686         iommu = device_to_iommu(dev, &bus, &devfn);
4687         if (!iommu)
4688                 return -ENODEV;
4689
4690         /* check if this iommu agaw is sufficient for max mapped address */
4691         addr_width = agaw_to_width(iommu->agaw);
4692         if (addr_width > cap_mgaw(iommu->cap))
4693                 addr_width = cap_mgaw(iommu->cap);
4694
4695         if (dmar_domain->max_addr > (1LL << addr_width)) {
4696                 pr_err("%s: iommu width (%d) is not "
4697                        "sufficient for the mapped address (%llx)\n",
4698                        __func__, addr_width, dmar_domain->max_addr);
4699                 return -EFAULT;
4700         }
4701         dmar_domain->gaw = addr_width;
4702
4703         /*
4704          * Knock out extra levels of page tables if necessary
4705          */
4706         while (iommu->agaw < dmar_domain->agaw) {
4707                 struct dma_pte *pte;
4708
4709                 pte = dmar_domain->pgd;
4710                 if (dma_pte_present(pte)) {
4711                         dmar_domain->pgd = (struct dma_pte *)
4712                                 phys_to_virt(dma_pte_addr(pte));
4713                         free_pgtable_page(pte);
4714                 }
4715                 dmar_domain->agaw--;
4716         }
4717
4718         return domain_add_dev_info(dmar_domain, dev);
4719 }
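
/*
 * Illustrative attach scenario for the width handling above: a domain built
 * for 48 bits (agaw 2, 4-level table) attached to hardware that only does
 * 39-bit/3-level walks (agaw 1, cap_mgaw 39) ends up with addr_width = 39,
 * the smaller of the hardware AGAW width and cap_mgaw.  A domain with
 * max_addr above 1ULL << 39 is rejected with -EFAULT; otherwise gaw drops to
 * 39 and the while loop pops one top-level table so the pgd matches the
 * 3-level hardware walk.
 */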
4720
4721 static void intel_iommu_detach_device(struct iommu_domain *domain,
4722                                       struct device *dev)
4723 {
4724         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
4725 }
4726
4727 static int intel_iommu_map(struct iommu_domain *domain,
4728                            unsigned long iova, phys_addr_t hpa,
4729                            size_t size, int iommu_prot)
4730 {
4731         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4732         u64 max_addr;
4733         int prot = 0;
4734         int ret;
4735
4736         if (iommu_prot & IOMMU_READ)
4737                 prot |= DMA_PTE_READ;
4738         if (iommu_prot & IOMMU_WRITE)
4739                 prot |= DMA_PTE_WRITE;
4740         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4741                 prot |= DMA_PTE_SNP;
4742
4743         max_addr = iova + size;
4744         if (dmar_domain->max_addr < max_addr) {
4745                 u64 end;
4746
4747                 /* check if minimum agaw is sufficient for mapped address */
4748                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4749                 if (end < max_addr) {
4750                         pr_err("%s: iommu width (%d) is not "
4751                                "sufficient for the mapped address (%llx)\n",
4752                                __func__, dmar_domain->gaw, max_addr);
4753                         return -EFAULT;
4754                 }
4755                 dmar_domain->max_addr = max_addr;
4756         }
4757         /* Round up size to next multiple of PAGE_SIZE, if it and
4758            the low bits of hpa would take us onto the next page */
4759         size = aligned_nrpages(hpa, size);
4760         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4761                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4762         return ret;
4763 }
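
/*
 * Worked example of the rounding above: for hpa = 0x12340800 and
 * size = 0x1000, the region touches two 4 KiB frames (0x12340000 and
 * 0x12341000), so aligned_nrpages() yields 2 and domain_pfn_mapping() is
 * asked to map two pages at IOVA pfn (iova >> VTD_PAGE_SHIFT) from host
 * pfn 0x12340.
 */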
4764
4765 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4766                                 unsigned long iova, size_t size)
4767 {
4768         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4769         struct page *freelist = NULL;
4770         struct intel_iommu *iommu;
4771         unsigned long start_pfn, last_pfn;
4772         unsigned int npages;
4773         int iommu_id, level = 0;
4774
4775         /* Cope with horrid API which requires us to unmap more than the
4776            size argument if it happens to be a large-page mapping. */
4777         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4778
4779         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4780                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4781
4782         start_pfn = iova >> VTD_PAGE_SHIFT;
4783         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4784
4785         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4786
4787         npages = last_pfn - start_pfn + 1;
4788
4789         for_each_domain_iommu(iommu_id, dmar_domain) {
4790                 iommu = g_iommus[iommu_id];
4791
4792                 iommu_flush_iotlb_psi(iommu, dmar_domain,
4793                                       start_pfn, npages, !freelist, 0);
4794         }
4795
4796         dma_free_pagelist(freelist);
4797
4798         if (dmar_domain->max_addr == iova + size)
4799                 dmar_domain->max_addr = iova;
4800
4801         return size;
4802 }
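
/*
 * Example of the size widening above, assuming the 9-bit-per-level stride:
 * unmapping 4 KiB at a 2 MiB-aligned IOVA that is covered by a superpage PTE
 * makes pfn_to_dma_pte() report level 2, so size is bumped to
 * VTD_PAGE_SIZE << 9 = 2 MiB, the whole superpage (512 pfns) is torn down and
 * flushed, and 0x200000 is returned to the caller rather than the 4 KiB it
 * asked for.
 */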
4803
4804 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4805                                             dma_addr_t iova)
4806 {
4807         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4808         struct dma_pte *pte;
4809         int level = 0;
4810         u64 phys = 0;
4811
4812         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4813         if (pte)
4814                 phys = dma_pte_addr(pte);
4815
4816         return phys;
4817 }
4818
4819 static bool intel_iommu_capable(enum iommu_cap cap)
4820 {
4821         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4822                 return domain_update_iommu_snooping(NULL) == 1;
4823         if (cap == IOMMU_CAP_INTR_REMAP)
4824                 return irq_remapping_enabled == 1;
4825
4826         return false;
4827 }
4828
4829 static int intel_iommu_add_device(struct device *dev)
4830 {
4831         struct intel_iommu *iommu;
4832         struct iommu_group *group;
4833         u8 bus, devfn;
4834
4835         iommu = device_to_iommu(dev, &bus, &devfn);
4836         if (!iommu)
4837                 return -ENODEV;
4838
4839         iommu_device_link(iommu->iommu_dev, dev);
4840
4841         group = iommu_group_get_for_dev(dev);
4842
4843         if (IS_ERR(group))
4844                 return PTR_ERR(group);
4845
4846         iommu_group_put(group);
4847         return 0;
4848 }
4849
4850 static void intel_iommu_remove_device(struct device *dev)
4851 {
4852         struct intel_iommu *iommu;
4853         u8 bus, devfn;
4854
4855         iommu = device_to_iommu(dev, &bus, &devfn);
4856         if (!iommu)
4857                 return;
4858
4859         iommu_group_remove_device(dev);
4860
4861         iommu_device_unlink(iommu->iommu_dev, dev);
4862 }
4863
4864 static const struct iommu_ops intel_iommu_ops = {
4865         .capable        = intel_iommu_capable,
4866         .domain_alloc   = intel_iommu_domain_alloc,
4867         .domain_free    = intel_iommu_domain_free,
4868         .attach_dev     = intel_iommu_attach_device,
4869         .detach_dev     = intel_iommu_detach_device,
4870         .map            = intel_iommu_map,
4871         .unmap          = intel_iommu_unmap,
4872         .map_sg         = default_iommu_map_sg,
4873         .iova_to_phys   = intel_iommu_iova_to_phys,
4874         .add_device     = intel_iommu_add_device,
4875         .remove_device  = intel_iommu_remove_device,
4876         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4877 };
4878
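/*
 * Minimal sketch (not part of this driver) of how a kernel consumer such as
 * VFIO reaches the callbacks above through the generic linux/iommu.h API.
 * The function name, the 1 MiB IOVA and the page-aligned paddr are made up
 * for the example:
 *
 *      static int example_iommu_api_usage(struct pci_dev *pdev,
 *                                         phys_addr_t paddr)
 *      {
 *              struct iommu_domain *dom;
 *              int ret;
 *
 *              dom = iommu_domain_alloc(&pci_bus_type);
 *              if (!dom)
 *                      return -ENOMEM;
 *
 *              ret = iommu_attach_device(dom, &pdev->dev);
 *              if (ret)
 *                      goto out_free;
 *
 *              ret = iommu_map(dom, 0x100000, paddr, 0x1000,
 *                              IOMMU_READ | IOMMU_WRITE);
 *              if (!ret) {
 *                      WARN_ON(iommu_iova_to_phys(dom, 0x100000) != paddr);
 *                      iommu_unmap(dom, 0x100000, 0x1000);
 *              }
 *
 *              iommu_detach_device(dom, &pdev->dev);
 *      out_free:
 *              iommu_domain_free(dom);
 *              return ret;
 *      }
 */
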
4879 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4880 {
4881         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4882         pr_info("Disabling IOMMU for graphics on this chipset\n");
4883         dmar_map_gfx = 0;
4884 }
4885
4886 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4887 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4893
4894 static void quirk_iommu_rwbf(struct pci_dev *dev)
4895 {
4896         /*
4897          * Mobile 4 Series Chipset neglects to set RWBF capability,
4898          * but needs it. Same seems to hold for the desktop versions.
4899          */
4900         pr_info("Forcing write-buffer flush capability\n");
4901         rwbf_quirk = 1;
4902 }
4903
4904 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4905 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4906 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4907 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4908 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4911
4912 #define GGC 0x52
4913 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4914 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4915 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4916 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4917 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4918 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4919 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4920 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4921
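/*
 * Illustrative GGC decodes for the quirk below (register values made up):
 * ggc = 0x0210 has GGC_MEMORY_VT_ENABLED (bit 11) clear, so no GTT space was
 * set aside for VT-d and the graphics IOMMU gets disabled; ggc = 0x0950
 * decodes to GGC_MEMORY_SIZE_2M_VT, so translation stays on and only strict
 * IOTLB flushing is forced.
 */
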
4922 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4923 {
4924         unsigned short ggc;
4925
4926         if (pci_read_config_word(dev, GGC, &ggc))
4927                 return;
4928
4929         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4930                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4931                 dmar_map_gfx = 0;
4932         } else if (dmar_map_gfx) {
4933                 /* we have to ensure the gfx device is idle before we flush */
4934                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
4935                 intel_iommu_strict = 1;
4936         }
4937 }
4938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4939 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4940 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4942
4943 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4944    ISOCH DMAR unit for the Azalia sound device, but not give it any
4945    TLB entries, which causes it to deadlock. Check for that.  We do
4946    this in a function called from init_dmars(), instead of in a PCI
4947    quirk, because we don't want to print the obnoxious "BIOS broken"
4948    message if VT-d is actually disabled.
4949 */
4950 static void __init check_tylersburg_isoch(void)
4951 {
4952         struct pci_dev *pdev;
4953         uint32_t vtisochctrl;
4954
4955         /* If there's no Azalia in the system anyway, forget it. */
4956         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4957         if (!pdev)
4958                 return;
4959         pci_dev_put(pdev);
4960
4961         /* System Management Registers. Might be hidden, in which case
4962            we can't do the sanity check. But that's OK, because the
4963            known-broken BIOSes _don't_ actually hide it, so far. */
4964         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4965         if (!pdev)
4966                 return;
4967
4968         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4969                 pci_dev_put(pdev);
4970                 return;
4971         }
4972
4973         pci_dev_put(pdev);
4974
4975         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4976         if (vtisochctrl & 1)
4977                 return;
4978
4979         /* Drop all bits other than the number of TLB entries */
4980         vtisochctrl &= 0x1c;
4981
4982         /* If we have the recommended number of TLB entries (16), fine. */
4983         if (vtisochctrl == 0x10)
4984                 return;
4985
4986         /* Zero TLB entries? The unit is unusable; warn and identity-map Azalia. */
4987         if (!vtisochctrl) {
4988                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4989                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4990                      dmi_get_system_info(DMI_BIOS_VENDOR),
4991                      dmi_get_system_info(DMI_BIOS_VERSION),
4992                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4993                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4994                 return;
4995         }
4996
4997         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4998                vtisochctrl);
4999 }
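
/*
 * Worked decode of vtisochctrl for the checks above (values made up):
 * 0x00000001 - Azalia DMA uses the non-isoch DMAR unit, nothing to do;
 * 0x00000010 - isoch unit with the recommended 16 TLB entries, fine;
 * 0x00000000 - isoch routing but zero TLB entries: WARN and identity-map
 *              Azalia via IDENTMAP_AZALIA;
 * 0x00000008 - only 8 entries, so just the pr_warn at the end.
 */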