drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #include "irq_remapping.h"
50 #include "pci.h"
51
52 #define ROOT_SIZE               VTD_PAGE_SIZE
53 #define CONTEXT_SIZE            VTD_PAGE_SIZE
54
55 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
58
59 #define IOAPIC_RANGE_START      (0xfee00000)
60 #define IOAPIC_RANGE_END        (0xfeefffff)
61 #define IOVA_START_ADDR         (0x1000)
62
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
64
65 #define MAX_AGAW_WIDTH 64
66
67 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
68 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
69
70 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
71    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
72 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
73                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
74 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
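
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * what DOMAIN_MAX_PFN()/DOMAIN_MAX_ADDR() work out to for the default
 * 48-bit guest address width.  It assumes 4KiB VT-d pages
 * (VTD_PAGE_SHIFT == 12) and clamps to unsigned long as the comment
 * above describes; for gaw = 48 on 64-bit the clamp has no effect.
 * Compile separately, e.g. "gcc -o maxpfn maxpfn.c && ./maxpfn".
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_VTD_PAGE_SHIFT 12

static uint64_t demo_max_pfn(int gaw)
{
        uint64_t pfn = (((uint64_t)1) << (gaw - DEMO_VTD_PAGE_SHIFT)) - 1;
        uint64_t ulong_max = (unsigned long)-1;

        return pfn < ulong_max ? pfn : ulong_max;       /* clamp to unsigned long */
}

int main(void)
{
        int gaw = 48;                   /* DEFAULT_DOMAIN_ADDRESS_WIDTH */

        printf("DOMAIN_MAX_PFN(%d)  = %#llx\n", gaw,
               (unsigned long long)demo_max_pfn(gaw));
        printf("DOMAIN_MAX_ADDR(%d) = %#llx\n", gaw,
               (unsigned long long)(demo_max_pfn(gaw) << DEMO_VTD_PAGE_SHIFT));
        return 0;
}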
75
76 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
77 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
78 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
79
80 /* page table handling */
81 #define LEVEL_STRIDE            (9)
82 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
83
84 /*
85  * This bitmap is used to advertise the page sizes our hardware supports
86  * to the IOMMU core, which will then use this information to split
87  * physically contiguous memory regions it is mapping into page sizes
88  * that we support.
89  *
90  * Traditionally the IOMMU core just handed us the mappings directly,
91  * after making sure the size is an order of a 4KiB page and that the
92  * mapping has natural alignment.
93  *
94  * To retain this behavior, we currently advertise that we support
95  * all page sizes that are an order of 4KiB.
96  *
97  * If at some point we'd like to utilize the IOMMU core's new behavior,
98  * we could change this to advertise the real page sizes we support.
99  */
100 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
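
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * how the INTEL_IOMMU_PGSIZES bitmap above is read.  Bit k set means
 * "pages of 1 << k bytes are supported", so ~0xFFFUL advertises every
 * power-of-two size from 4KiB upwards.  Compile separately with
 * gcc/clang (uses __builtin_ctzl).
 */
#include <stdio.h>

int main(void)
{
        unsigned long pgsizes = ~0xFFFUL;       /* INTEL_IOMMU_PGSIZES */
        unsigned long sizes[] = { 1UL << 12, 1UL << 13, 1UL << 21, 1UL << 30 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                int bit = __builtin_ctzl(sizes[i]);     /* log2 of the page size */

                printf("%10lu bytes: %s\n", sizes[i],
                       (pgsizes & (1UL << bit)) ? "advertised" : "not advertised");
        }
        return 0;
}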
101
102 static inline int agaw_to_level(int agaw)
103 {
104         return agaw + 2;
105 }
106
107 static inline int agaw_to_width(int agaw)
108 {
109         return 30 + agaw * LEVEL_STRIDE;
110 }
111
112 static inline int width_to_agaw(int width)
113 {
114         return (width - 30) / LEVEL_STRIDE;
115 }
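
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * the arithmetic behind the agaw helpers above.  An adjusted guest
 * address width (agaw) of 0 means a 30-bit, 2-level page table, and
 * each further level adds LEVEL_STRIDE (9) bits.  Compile separately.
 */
#include <stdio.h>

#define DEMO_LEVEL_STRIDE 9

int main(void)
{
        int widths[] = { 30, 39, 48, 57 };
        unsigned int i;

        for (i = 0; i < sizeof(widths) / sizeof(widths[0]); i++) {
                int agaw = (widths[i] - 30) / DEMO_LEVEL_STRIDE;  /* width_to_agaw() */

                printf("width %2d -> agaw %d -> %d-level table -> width %d\n",
                       widths[i], agaw,
                       agaw + 2,                        /* agaw_to_level() */
                       30 + agaw * DEMO_LEVEL_STRIDE);  /* agaw_to_width() */
        }
        return 0;
}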
116
117 static inline unsigned int level_to_offset_bits(int level)
118 {
119         return (level - 1) * LEVEL_STRIDE;
120 }
121
122 static inline int pfn_level_offset(unsigned long pfn, int level)
123 {
124         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
125 }
126
127 static inline unsigned long level_mask(int level)
128 {
129         return -1UL << level_to_offset_bits(level);
130 }
131
132 static inline unsigned long level_size(int level)
133 {
134         return 1UL << level_to_offset_bits(level);
135 }
136
137 static inline unsigned long align_to_level(unsigned long pfn, int level)
138 {
139         return (pfn + level_size(level) - 1) & level_mask(level);
140 }
141
142 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
143 {
144         return  1 << ((lvl - 1) * LEVEL_STRIDE);
145 }
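
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * how pfn_level_offset() slices a DMA pfn into 9-bit table indices,
 * one per level, for the walk done later in pfn_to_dma_pte().  The pfn
 * value is arbitrary.  Compile separately.
 */
#include <stdio.h>

#define DEMO_LEVEL_STRIDE 9
#define DEMO_LEVEL_MASK   ((1UL << DEMO_LEVEL_STRIDE) - 1)

int main(void)
{
        unsigned long pfn = 0x12345678UL;       /* arbitrary example DMA pfn */
        int level;

        for (level = 4; level >= 1; level--) {
                unsigned int bits = (level - 1) * DEMO_LEVEL_STRIDE;  /* level_to_offset_bits() */
                unsigned long idx = (pfn >> bits) & DEMO_LEVEL_MASK;  /* pfn_level_offset() */

                printf("level %d: index %3lu (each entry covers %lu pfns)\n",
                       level, idx, 1UL << bits);
        }
        return 0;
}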
146
147 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
148    are never going to work. */
149 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
150 {
151         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
152 }
153
154 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
155 {
156         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
157 }
158 static inline unsigned long page_to_dma_pfn(struct page *pg)
159 {
160         return mm_to_dma_pfn(page_to_pfn(pg));
161 }
162 static inline unsigned long virt_to_dma_pfn(void *p)
163 {
164         return page_to_dma_pfn(virt_to_page(p));
165 }
166
167 /* global iommu list, set NULL for ignored DMAR units */
168 static struct intel_iommu **g_iommus;
169
170 static void __init check_tylersburg_isoch(void);
171 static int rwbf_quirk;
172
173 /*
174  * set to 1 to panic the kernel if VT-d can't be successfully enabled
175  * (used when kernel is launched w/ TXT)
176  */
177 static int force_on = 0;
178
179 /*
180  * 0: Present
181  * 1-11: Reserved
182  * 12-63: Context Ptr (12 - (haw-1))
183  * 64-127: Reserved
184  */
185 struct root_entry {
186         u64     val;
187         u64     rsvd1;
188 };
189 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
190 static inline bool root_present(struct root_entry *root)
191 {
192         return (root->val & 1);
193 }
194 static inline void set_root_present(struct root_entry *root)
195 {
196         root->val |= 1;
197 }
198 static inline void set_root_value(struct root_entry *root, unsigned long value)
199 {
200         root->val |= value & VTD_PAGE_MASK;
201 }
202
203 static inline struct context_entry *
204 get_context_addr_from_root(struct root_entry *root)
205 {
206         return (struct context_entry *)
207                 (root_present(root)?phys_to_virt(
208                 root->val & VTD_PAGE_MASK) :
209                 NULL);
210 }
211
212 /*
213  * low 64 bits:
214  * 0: present
215  * 1: fault processing disable
216  * 2-3: translation type
217  * 12-63: address space root
218  * high 64 bits:
219  * 0-2: address width
220  * 3-6: avail
221  * 8-23: domain id
222  */
223 struct context_entry {
224         u64 lo;
225         u64 hi;
226 };
227
228 static inline bool context_present(struct context_entry *context)
229 {
230         return (context->lo & 1);
231 }
232 static inline void context_set_present(struct context_entry *context)
233 {
234         context->lo |= 1;
235 }
236
237 static inline void context_set_fault_enable(struct context_entry *context)
238 {
239         context->lo &= (((u64)-1) << 2) | 1;
240 }
241
242 static inline void context_set_translation_type(struct context_entry *context,
243                                                 unsigned long value)
244 {
245         context->lo &= (((u64)-1) << 4) | 3;
246         context->lo |= (value & 3) << 2;
247 }
248
249 static inline void context_set_address_root(struct context_entry *context,
250                                             unsigned long value)
251 {
252         context->lo |= value & VTD_PAGE_MASK;
253 }
254
255 static inline void context_set_address_width(struct context_entry *context,
256                                              unsigned long value)
257 {
258         context->hi |= value & 7;
259 }
260
261 static inline void context_set_domain_id(struct context_entry *context,
262                                          unsigned long value)
263 {
264         context->hi |= (value & ((1 << 16) - 1)) << 8;
265 }
266
267 static inline void context_clear_entry(struct context_entry *context)
268 {
269         context->lo = 0;
270         context->hi = 0;
271 }
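
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * packing and decoding a context entry with the same bit layout the
 * helpers above use (present in lo[0], translation type in lo[3:2],
 * table pointer in lo[63:12], address width in hi[2:0], domain id in
 * hi[23:8]).  The pointer and domain id values are made up.  Compile
 * separately.
 */
#include <stdio.h>
#include <stdint.h>

struct demo_context_entry {
        uint64_t lo;
        uint64_t hi;
};

int main(void)
{
        struct demo_context_entry ce = { 0, 0 };
        uint64_t table = 0x12345000ULL;         /* hypothetical pgtable phys addr */

        ce.lo |= table & ~0xFFFULL;             /* context_set_address_root() */
        ce.lo |= (0ULL & 3) << 2;               /* translation type field (0 here) */
        ce.lo |= 1;                             /* context_set_present() */
        ce.hi |= 2 & 7;                         /* address width: agaw 2 (48 bit) */
        ce.hi |= (42ULL & 0xFFFF) << 8;         /* context_set_domain_id(42) */

        printf("lo=%#llx hi=%#llx\n",
               (unsigned long long)ce.lo, (unsigned long long)ce.hi);
        printf("present=%llu type=%llu root=%#llx aw=%llu did=%llu\n",
               (unsigned long long)(ce.lo & 1),
               (unsigned long long)((ce.lo >> 2) & 3),
               (unsigned long long)(ce.lo & ~0xFFFULL),
               (unsigned long long)(ce.hi & 7),
               (unsigned long long)((ce.hi >> 8) & 0xFFFF));
        return 0;
}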
272
273 /*
274  * 0: readable
275  * 1: writable
276  * 2-6: reserved
277  * 7: super page
278  * 8-10: available
279  * 11: snoop behavior
280  * 12-63: Host physical address
281  */
282 struct dma_pte {
283         u64 val;
284 };
285
286 static inline void dma_clear_pte(struct dma_pte *pte)
287 {
288         pte->val = 0;
289 }
290
291 static inline void dma_set_pte_readable(struct dma_pte *pte)
292 {
293         pte->val |= DMA_PTE_READ;
294 }
295
296 static inline void dma_set_pte_writable(struct dma_pte *pte)
297 {
298         pte->val |= DMA_PTE_WRITE;
299 }
300
301 static inline void dma_set_pte_snp(struct dma_pte *pte)
302 {
303         pte->val |= DMA_PTE_SNP;
304 }
305
306 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
307 {
308         pte->val = (pte->val & ~3) | (prot & 3);
309 }
310
311 static inline u64 dma_pte_addr(struct dma_pte *pte)
312 {
313 #ifdef CONFIG_64BIT
314         return pte->val & VTD_PAGE_MASK;
315 #else
316         /* Must have a full atomic 64-bit read */
317         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
318 #endif
319 }
320
321 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
322 {
323         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
324 }
325
326 static inline bool dma_pte_present(struct dma_pte *pte)
327 {
328         return (pte->val & 3) != 0;
329 }
330
331 static inline bool dma_pte_superpage(struct dma_pte *pte)
332 {
333         return (pte->val & (1 << 7));
334 }
335
336 static inline int first_pte_in_page(struct dma_pte *pte)
337 {
338         return !((unsigned long)pte & ~VTD_PAGE_MASK);
339 }
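
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * building and inspecting a dma_pte value with the bit meanings listed
 * above (bit 0 read, bit 1 write, bit 7 super page, bits 63:12 host
 * pfn).  The pfn is arbitrary.  Compile separately.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PTE_READ   (1ULL << 0)
#define DEMO_PTE_WRITE  (1ULL << 1)
#define DEMO_PTE_LARGE  (1ULL << 7)
#define DEMO_PAGE_MASK  (~0xFFFULL)

int main(void)
{
        uint64_t pte = 0;
        uint64_t pfn = 0xABCDEULL;              /* arbitrary host pfn */

        pte |= pfn << 12;                       /* dma_set_pte_pfn() */
        pte |= DEMO_PTE_READ | DEMO_PTE_WRITE;  /* dma_set_pte_readable/writable() */

        printf("pte=%#llx present=%d superpage=%d addr=%#llx\n",
               (unsigned long long)pte,
               (pte & 3) != 0,                  /* dma_pte_present() */
               (pte & DEMO_PTE_LARGE) != 0,     /* dma_pte_superpage() */
               (unsigned long long)(pte & DEMO_PAGE_MASK));  /* dma_pte_addr() */
        return 0;
}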
340
341 /*
342  * This domain is a statically identity mapping domain.
343  *      1. This domain creates a static 1:1 mapping to all usable memory.
344  *      2. It maps to each iommu if successful.
345  *      3. Each iommu maps to this domain if successful.
346  */
347 static struct dmar_domain *si_domain;
348 static int hw_pass_through = 1;
349
350 /* devices under the same p2p bridge are owned in one domain */
351 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
352
353 /* domain represents a virtual machine; more than one device
354  * across iommus may be owned in one domain, e.g. a kvm guest.
355  */
356 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
357
358 /* si_domain contains multiple devices */
359 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
360
361 /* define the limit of IOMMUs supported in each domain */
362 #ifdef  CONFIG_X86
363 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
364 #else
365 # define        IOMMU_UNITS_SUPPORTED   64
366 #endif
367
368 struct dmar_domain {
369         int     id;                     /* domain id */
370         int     nid;                    /* node id */
371         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
372                                         /* bitmap of iommus this domain uses */
373
374         struct list_head devices;       /* all devices' list */
375         struct iova_domain iovad;       /* iova's that belong to this domain */
376
377         struct dma_pte  *pgd;           /* virtual address */
378         int             gaw;            /* max guest address width */
379
380         /* adjusted guest address width, 0 is level 2 30-bit */
381         int             agaw;
382
383         int             flags;          /* flags to find out type of domain */
384
385         int             iommu_coherency;/* indicate coherency of iommu access */
386         int             iommu_snooping; /* indicate snooping control feature*/
387         int             iommu_count;    /* reference count of iommu */
388         int             iommu_superpage;/* Level of superpages supported:
389                                            0 == 4KiB (no superpages), 1 == 2MiB,
390                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
391         spinlock_t      iommu_lock;     /* protect iommu set in domain */
392         u64             max_addr;       /* maximum mapped address */
393 };
394
395 /* PCI domain-device relationship */
396 struct device_domain_info {
397         struct list_head link;  /* link to domain siblings */
398         struct list_head global; /* link to global list */
399         int segment;            /* PCI domain */
400         u8 bus;                 /* PCI bus number */
401         u8 devfn;               /* PCI devfn number */
402         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
403         struct intel_iommu *iommu; /* IOMMU used by this device */
404         struct dmar_domain *domain; /* pointer to domain */
405 };
406
407 static void flush_unmaps_timeout(unsigned long data);
408
409 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
410
411 #define HIGH_WATER_MARK 250
412 struct deferred_flush_tables {
413         int next;
414         struct iova *iova[HIGH_WATER_MARK];
415         struct dmar_domain *domain[HIGH_WATER_MARK];
416 };
417
418 static struct deferred_flush_tables *deferred_flush;
419
420 /* number of IOMMUs, used to size g_iommus and per-domain iommu bitmaps */
421 static int g_num_of_iommus;
422
423 static DEFINE_SPINLOCK(async_umap_flush_lock);
424 static LIST_HEAD(unmaps_to_do);
425
426 static int timer_on;
427 static long list_size;
428
429 static void domain_remove_dev_info(struct dmar_domain *domain);
430
431 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
432 int dmar_disabled = 0;
433 #else
434 int dmar_disabled = 1;
435 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
436
437 int intel_iommu_enabled = 0;
438 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
439
440 static int dmar_map_gfx = 1;
441 static int dmar_forcedac;
442 static int intel_iommu_strict;
443 static int intel_iommu_superpage = 1;
444
445 int intel_iommu_gfx_mapped;
446 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
447
448 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
449 static DEFINE_SPINLOCK(device_domain_lock);
450 static LIST_HEAD(device_domain_list);
451
452 static struct iommu_ops intel_iommu_ops;
453
454 static int __init intel_iommu_setup(char *str)
455 {
456         if (!str)
457                 return -EINVAL;
458         while (*str) {
459                 if (!strncmp(str, "on", 2)) {
460                         dmar_disabled = 0;
461                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
462                 } else if (!strncmp(str, "off", 3)) {
463                         dmar_disabled = 1;
464                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
465                 } else if (!strncmp(str, "igfx_off", 8)) {
466                         dmar_map_gfx = 0;
467                         printk(KERN_INFO
468                                 "Intel-IOMMU: disable GFX device mapping\n");
469                 } else if (!strncmp(str, "forcedac", 8)) {
470                         printk(KERN_INFO
471                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
472                         dmar_forcedac = 1;
473                 } else if (!strncmp(str, "strict", 6)) {
474                         printk(KERN_INFO
475                                 "Intel-IOMMU: disable batched IOTLB flush\n");
476                         intel_iommu_strict = 1;
477                 } else if (!strncmp(str, "sp_off", 6)) {
478                         printk(KERN_INFO
479                                 "Intel-IOMMU: disable supported super page\n");
480                         intel_iommu_superpage = 0;
481                 }
482
483                 str += strcspn(str, ",");
484                 while (*str == ',')
485                         str++;
486         }
487         return 0;
488 }
489 __setup("intel_iommu=", intel_iommu_setup);
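
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * how the comma-separated "intel_iommu=" options above are scanned
 * with strncmp() and strcspn().  The option string is just an example.
 * Compile separately.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char buf[] = "on,igfx_off,strict";      /* intel_iommu=on,igfx_off,strict */
        char *str = buf;

        while (*str) {
                if (!strncmp(str, "on", 2))
                        printf("-> enable DMAR\n");
                else if (!strncmp(str, "off", 3))
                        printf("-> disable DMAR\n");
                else if (!strncmp(str, "igfx_off", 8))
                        printf("-> skip graphics mapping\n");
                else if (!strncmp(str, "forcedac", 8))
                        printf("-> force DAC addressing\n");
                else if (!strncmp(str, "strict", 6))
                        printf("-> synchronous IOTLB flush\n");
                else if (!strncmp(str, "sp_off", 6))
                        printf("-> super pages off\n");

                str += strcspn(str, ",");       /* skip to the next option */
                while (*str == ',')
                        str++;
        }
        return 0;
}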
490
491 static struct kmem_cache *iommu_domain_cache;
492 static struct kmem_cache *iommu_devinfo_cache;
493 static struct kmem_cache *iommu_iova_cache;
494
495 static inline void *alloc_pgtable_page(int node)
496 {
497         struct page *page;
498         void *vaddr = NULL;
499
500         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
501         if (page)
502                 vaddr = page_address(page);
503         return vaddr;
504 }
505
506 static inline void free_pgtable_page(void *vaddr)
507 {
508         free_page((unsigned long)vaddr);
509 }
510
511 static inline void *alloc_domain_mem(void)
512 {
513         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
514 }
515
516 static void free_domain_mem(void *vaddr)
517 {
518         kmem_cache_free(iommu_domain_cache, vaddr);
519 }
520
521 static inline void * alloc_devinfo_mem(void)
522 {
523         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
524 }
525
526 static inline void free_devinfo_mem(void *vaddr)
527 {
528         kmem_cache_free(iommu_devinfo_cache, vaddr);
529 }
530
531 struct iova *alloc_iova_mem(void)
532 {
533         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
534 }
535
536 void free_iova_mem(struct iova *iova)
537 {
538         kmem_cache_free(iommu_iova_cache, iova);
539 }
540
541
542 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
543 {
544         unsigned long sagaw;
545         int agaw = -1;
546
547         sagaw = cap_sagaw(iommu->cap);
548         for (agaw = width_to_agaw(max_gaw);
549              agaw >= 0; agaw--) {
550                 if (test_bit(agaw, &sagaw))
551                         break;
552         }
553
554         return agaw;
555 }
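
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * the downward search __iommu_calculate_agaw() performs over the SAGAW
 * capability bits.  The sample bitmap (bits 1 and 2, i.e. 3- and
 * 4-level tables) is hypothetical.  Compile separately.
 */
#include <stdio.h>

int main(void)
{
        unsigned long sagaw = (1UL << 1) | (1UL << 2);  /* hypothetical cap_sagaw() */
        int max_gaw = 64;                               /* MAX_AGAW_WIDTH */
        int agaw;

        for (agaw = (max_gaw - 30) / 9; agaw >= 0; agaw--)  /* width_to_agaw() */
                if (sagaw & (1UL << agaw))
                        break;

        printf("highest supported agaw for width %d is %d (%d bits, %d levels)\n",
               max_gaw, agaw, 30 + agaw * 9, agaw + 2);
        return 0;
}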
556
557 /*
558  * Calculate max SAGAW for each iommu.
559  */
560 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
561 {
562         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
563 }
564
565 /*
566  * calculate agaw for each iommu.
567  * "SAGAW" may be different across iommus; use a default agaw, and
568  * fall back to a smaller supported agaw for iommus that don't support the default.
569  */
570 int iommu_calculate_agaw(struct intel_iommu *iommu)
571 {
572         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
573 }
574
575 /* This function only returns the single iommu in a domain */
576 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
577 {
578         int iommu_id;
579
580         /* si_domain and vm domain should not get here. */
581         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
582         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
583
584         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
585         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
586                 return NULL;
587
588         return g_iommus[iommu_id];
589 }
590
591 static void domain_update_iommu_coherency(struct dmar_domain *domain)
592 {
593         int i;
594
595         i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
596
597         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
598
599         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
600                 if (!ecap_coherent(g_iommus[i]->ecap)) {
601                         domain->iommu_coherency = 0;
602                         break;
603                 }
604         }
605 }
606
607 static void domain_update_iommu_snooping(struct dmar_domain *domain)
608 {
609         int i;
610
611         domain->iommu_snooping = 1;
612
613         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
614                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
615                         domain->iommu_snooping = 0;
616                         break;
617                 }
618         }
619 }
620
621 static void domain_update_iommu_superpage(struct dmar_domain *domain)
622 {
623         struct dmar_drhd_unit *drhd;
624         struct intel_iommu *iommu = NULL;
625         int mask = 0xf;
626
627         if (!intel_iommu_superpage) {
628                 domain->iommu_superpage = 0;
629                 return;
630         }
631
632         /* set iommu_superpage to the smallest common denominator */
633         for_each_active_iommu(iommu, drhd) {
634                 mask &= cap_super_page_val(iommu->cap);
635                 if (!mask) {
636                         break;
637                 }
638         }
639         domain->iommu_superpage = fls(mask);
640 }
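
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * how the loop above intersects the per-iommu super-page capability
 * masks and turns the result into a level with fls().  The two
 * capability values are hypothetical.  Compile separately with
 * gcc/clang (uses __builtin_clz).
 */
#include <stdio.h>

static int demo_fls(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;   /* kernel-style fls(): 1-based */
}

int main(void)
{
        unsigned int caps[] = { 0x3, 0x1 };     /* iommu A: 2MiB+1GiB, iommu B: 2MiB */
        unsigned int mask = 0xf;
        unsigned int i;

        for (i = 0; i < sizeof(caps) / sizeof(caps[0]); i++)
                mask &= caps[i];

        printf("common super-page level: %d (0 = 4KiB only, 1 = 2MiB, 2 = 1GiB)\n",
               demo_fls(mask));
        return 0;
}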
641
642 /* Some capabilities may be different across iommus */
643 static void domain_update_iommu_cap(struct dmar_domain *domain)
644 {
645         domain_update_iommu_coherency(domain);
646         domain_update_iommu_snooping(domain);
647         domain_update_iommu_superpage(domain);
648 }
649
650 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
651 {
652         struct dmar_drhd_unit *drhd = NULL;
653         int i;
654
655         for_each_drhd_unit(drhd) {
656                 if (drhd->ignored)
657                         continue;
658                 if (segment != drhd->segment)
659                         continue;
660
661                 for (i = 0; i < drhd->devices_cnt; i++) {
662                         if (drhd->devices[i] &&
663                             drhd->devices[i]->bus->number == bus &&
664                             drhd->devices[i]->devfn == devfn)
665                                 return drhd->iommu;
666                         if (drhd->devices[i] &&
667                             drhd->devices[i]->subordinate &&
668                             drhd->devices[i]->subordinate->number <= bus &&
669                             drhd->devices[i]->subordinate->busn_res.end >= bus)
670                                 return drhd->iommu;
671                 }
672
673                 if (drhd->include_all)
674                         return drhd->iommu;
675         }
676
677         return NULL;
678 }
679
680 static void domain_flush_cache(struct dmar_domain *domain,
681                                void *addr, int size)
682 {
683         if (!domain->iommu_coherency)
684                 clflush_cache_range(addr, size);
685 }
686
687 /* Gets context entry for a given bus and devfn */
688 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
689                 u8 bus, u8 devfn)
690 {
691         struct root_entry *root;
692         struct context_entry *context;
693         unsigned long phy_addr;
694         unsigned long flags;
695
696         spin_lock_irqsave(&iommu->lock, flags);
697         root = &iommu->root_entry[bus];
698         context = get_context_addr_from_root(root);
699         if (!context) {
700                 context = (struct context_entry *)
701                                 alloc_pgtable_page(iommu->node);
702                 if (!context) {
703                         spin_unlock_irqrestore(&iommu->lock, flags);
704                         return NULL;
705                 }
706                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
707                 phy_addr = virt_to_phys((void *)context);
708                 set_root_value(root, phy_addr);
709                 set_root_present(root);
710                 __iommu_flush_cache(iommu, root, sizeof(*root));
711         }
712         spin_unlock_irqrestore(&iommu->lock, flags);
713         return &context[devfn];
714 }
715
716 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
717 {
718         struct root_entry *root;
719         struct context_entry *context;
720         int ret;
721         unsigned long flags;
722
723         spin_lock_irqsave(&iommu->lock, flags);
724         root = &iommu->root_entry[bus];
725         context = get_context_addr_from_root(root);
726         if (!context) {
727                 ret = 0;
728                 goto out;
729         }
730         ret = context_present(&context[devfn]);
731 out:
732         spin_unlock_irqrestore(&iommu->lock, flags);
733         return ret;
734 }
735
736 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
737 {
738         struct root_entry *root;
739         struct context_entry *context;
740         unsigned long flags;
741
742         spin_lock_irqsave(&iommu->lock, flags);
743         root = &iommu->root_entry[bus];
744         context = get_context_addr_from_root(root);
745         if (context) {
746                 context_clear_entry(&context[devfn]);
747                 __iommu_flush_cache(iommu, &context[devfn], \
748                         sizeof(*context));
749         }
750         spin_unlock_irqrestore(&iommu->lock, flags);
751 }
752
753 static void free_context_table(struct intel_iommu *iommu)
754 {
755         struct root_entry *root;
756         int i;
757         unsigned long flags;
758         struct context_entry *context;
759
760         spin_lock_irqsave(&iommu->lock, flags);
761         if (!iommu->root_entry) {
762                 goto out;
763         }
764         for (i = 0; i < ROOT_ENTRY_NR; i++) {
765                 root = &iommu->root_entry[i];
766                 context = get_context_addr_from_root(root);
767                 if (context)
768                         free_pgtable_page(context);
769         }
770         free_pgtable_page(iommu->root_entry);
771         iommu->root_entry = NULL;
772 out:
773         spin_unlock_irqrestore(&iommu->lock, flags);
774 }
775
776 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
777                                       unsigned long pfn, int target_level)
778 {
779         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
780         struct dma_pte *parent, *pte = NULL;
781         int level = agaw_to_level(domain->agaw);
782         int offset;
783
784         BUG_ON(!domain->pgd);
785         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
786         parent = domain->pgd;
787
788         while (level > 0) {
789                 void *tmp_page;
790
791                 offset = pfn_level_offset(pfn, level);
792                 pte = &parent[offset];
793                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
794                         break;
795                 if (level == target_level)
796                         break;
797
798                 if (!dma_pte_present(pte)) {
799                         uint64_t pteval;
800
801                         tmp_page = alloc_pgtable_page(domain->nid);
802
803                         if (!tmp_page)
804                                 return NULL;
805
806                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
807                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
808                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
809                                 /* Someone else set it while we were thinking; use theirs. */
810                                 free_pgtable_page(tmp_page);
811                         } else {
812                                 dma_pte_addr(pte);
813                                 domain_flush_cache(domain, pte, sizeof(*pte));
814                         }
815                 }
816                 parent = phys_to_virt(dma_pte_addr(pte));
817                 level--;
818         }
819
820         return pte;
821 }
822
823
824 /* return address's pte at specific level */
825 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
826                                          unsigned long pfn,
827                                          int level, int *large_page)
828 {
829         struct dma_pte *parent, *pte = NULL;
830         int total = agaw_to_level(domain->agaw);
831         int offset;
832
833         parent = domain->pgd;
834         while (level <= total) {
835                 offset = pfn_level_offset(pfn, total);
836                 pte = &parent[offset];
837                 if (level == total)
838                         return pte;
839
840                 if (!dma_pte_present(pte)) {
841                         *large_page = total;
842                         break;
843                 }
844
845                 if (pte->val & DMA_PTE_LARGE_PAGE) {
846                         *large_page = total;
847                         return pte;
848                 }
849
850                 parent = phys_to_virt(dma_pte_addr(pte));
851                 total--;
852         }
853         return NULL;
854 }
855
856 /* clear last level pte; a tlb flush should follow */
857 static int dma_pte_clear_range(struct dmar_domain *domain,
858                                 unsigned long start_pfn,
859                                 unsigned long last_pfn)
860 {
861         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
862         unsigned int large_page = 1;
863         struct dma_pte *first_pte, *pte;
864         int order;
865
866         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
867         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
868         BUG_ON(start_pfn > last_pfn);
869
870         /* we don't need lock here; nobody else touches the iova range */
871         do {
872                 large_page = 1;
873                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
874                 if (!pte) {
875                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
876                         continue;
877                 }
878                 do {
879                         dma_clear_pte(pte);
880                         start_pfn += lvl_to_nr_pages(large_page);
881                         pte++;
882                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
883
884                 domain_flush_cache(domain, first_pte,
885                                    (void *)pte - (void *)first_pte);
886
887         } while (start_pfn && start_pfn <= last_pfn);
888
889         order = (large_page - 1) * 9;
890         return order;
891 }
892
893 static void dma_pte_free_level(struct dmar_domain *domain, int level,
894                                struct dma_pte *pte, unsigned long pfn,
895                                unsigned long start_pfn, unsigned long last_pfn)
896 {
897         pfn = max(start_pfn, pfn);
898         pte = &pte[pfn_level_offset(pfn, level)];
899
900         do {
901                 unsigned long level_pfn;
902                 struct dma_pte *level_pte;
903
904                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
905                         goto next;
906
907                 level_pfn = pfn & level_mask(level - 1);
908                 level_pte = phys_to_virt(dma_pte_addr(pte));
909
910                 if (level > 2)
911                         dma_pte_free_level(domain, level - 1, level_pte,
912                                            level_pfn, start_pfn, last_pfn);
913
914                 /* If range covers entire pagetable, free it */
915                 if (!(start_pfn > level_pfn ||
916                       last_pfn < level_pfn + level_size(level))) {
917                         dma_clear_pte(pte);
918                         domain_flush_cache(domain, pte, sizeof(*pte));
919                         free_pgtable_page(level_pte);
920                 }
921 next:
922                 pfn += level_size(level);
923         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
924 }
925
926 /* free page table pages. last level pte should already be cleared */
927 static void dma_pte_free_pagetable(struct dmar_domain *domain,
928                                    unsigned long start_pfn,
929                                    unsigned long last_pfn)
930 {
931         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
932
933         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
934         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
935         BUG_ON(start_pfn > last_pfn);
936
937         /* We don't need lock here; nobody else touches the iova range */
938         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
939                            domain->pgd, 0, start_pfn, last_pfn);
940
941         /* free pgd */
942         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
943                 free_pgtable_page(domain->pgd);
944                 domain->pgd = NULL;
945         }
946 }
947
948 /* iommu handling */
949 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
950 {
951         struct root_entry *root;
952         unsigned long flags;
953
954         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
955         if (!root)
956                 return -ENOMEM;
957
958         __iommu_flush_cache(iommu, root, ROOT_SIZE);
959
960         spin_lock_irqsave(&iommu->lock, flags);
961         iommu->root_entry = root;
962         spin_unlock_irqrestore(&iommu->lock, flags);
963
964         return 0;
965 }
966
967 static void iommu_set_root_entry(struct intel_iommu *iommu)
968 {
969         void *addr;
970         u32 sts;
971         unsigned long flag;
972
973         addr = iommu->root_entry;
974
975         raw_spin_lock_irqsave(&iommu->register_lock, flag);
976         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
977
978         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
979
980         /* Make sure the hardware completes it */
981         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
982                       readl, (sts & DMA_GSTS_RTPS), sts);
983
984         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
985 }
986
987 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
988 {
989         u32 val;
990         unsigned long flag;
991
992         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
993                 return;
994
995         raw_spin_lock_irqsave(&iommu->register_lock, flag);
996         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
997
998         /* Make sure the hardware completes it */
999         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1000                       readl, (!(val & DMA_GSTS_WBFS)), val);
1001
1002         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1003 }
1004
1005 /* return value determines if we need a write buffer flush */
1006 static void __iommu_flush_context(struct intel_iommu *iommu,
1007                                   u16 did, u16 source_id, u8 function_mask,
1008                                   u64 type)
1009 {
1010         u64 val = 0;
1011         unsigned long flag;
1012
1013         switch (type) {
1014         case DMA_CCMD_GLOBAL_INVL:
1015                 val = DMA_CCMD_GLOBAL_INVL;
1016                 break;
1017         case DMA_CCMD_DOMAIN_INVL:
1018                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1019                 break;
1020         case DMA_CCMD_DEVICE_INVL:
1021                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1022                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1023                 break;
1024         default:
1025                 BUG();
1026         }
1027         val |= DMA_CCMD_ICC;
1028
1029         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1030         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1031
1032         /* Make sure the hardware completes it */
1033         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1034                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1035
1036         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1037 }
1038
1039 /* return value determines if we need a write buffer flush */
1040 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1041                                 u64 addr, unsigned int size_order, u64 type)
1042 {
1043         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1044         u64 val = 0, val_iva = 0;
1045         unsigned long flag;
1046
1047         switch (type) {
1048         case DMA_TLB_GLOBAL_FLUSH:
1049                 /* global flush doesn't need to set IVA_REG */
1050                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1051                 break;
1052         case DMA_TLB_DSI_FLUSH:
1053                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1054                 break;
1055         case DMA_TLB_PSI_FLUSH:
1056                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1057                 /* Note: always flush non-leaf currently */
1058                 val_iva = size_order | addr;
1059                 break;
1060         default:
1061                 BUG();
1062         }
1063         /* Note: set drain read/write */
1064 #if 0
1065         /*
1066          * This is probably only meant to be extra safe; it looks like we can
1067          * ignore it without any impact.
1068          */
1069         if (cap_read_drain(iommu->cap))
1070                 val |= DMA_TLB_READ_DRAIN;
1071 #endif
1072         if (cap_write_drain(iommu->cap))
1073                 val |= DMA_TLB_WRITE_DRAIN;
1074
1075         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1076         /* Note: Only uses first TLB reg currently */
1077         if (val_iva)
1078                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1079         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1080
1081         /* Make sure the hardware completes it */
1082         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1083                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1084
1085         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086
1087         /* check IOTLB invalidation granularity */
1088         if (DMA_TLB_IAIG(val) == 0)
1089                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1090         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1091                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1092                         (unsigned long long)DMA_TLB_IIRG(type),
1093                         (unsigned long long)DMA_TLB_IAIG(val));
1094 }
1095
1096 static struct device_domain_info *iommu_support_dev_iotlb(
1097         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1098 {
1099         int found = 0;
1100         unsigned long flags;
1101         struct device_domain_info *info;
1102         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1103
1104         if (!ecap_dev_iotlb_support(iommu->ecap))
1105                 return NULL;
1106
1107         if (!iommu->qi)
1108                 return NULL;
1109
1110         spin_lock_irqsave(&device_domain_lock, flags);
1111         list_for_each_entry(info, &domain->devices, link)
1112                 if (info->bus == bus && info->devfn == devfn) {
1113                         found = 1;
1114                         break;
1115                 }
1116         spin_unlock_irqrestore(&device_domain_lock, flags);
1117
1118         if (!found || !info->dev)
1119                 return NULL;
1120
1121         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1122                 return NULL;
1123
1124         if (!dmar_find_matched_atsr_unit(info->dev))
1125                 return NULL;
1126
1127         info->iommu = iommu;
1128
1129         return info;
1130 }
1131
1132 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1133 {
1134         if (!info)
1135                 return;
1136
1137         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1138 }
1139
1140 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1141 {
1142         if (!info->dev || !pci_ats_enabled(info->dev))
1143                 return;
1144
1145         pci_disable_ats(info->dev);
1146 }
1147
1148 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1149                                   u64 addr, unsigned mask)
1150 {
1151         u16 sid, qdep;
1152         unsigned long flags;
1153         struct device_domain_info *info;
1154
1155         spin_lock_irqsave(&device_domain_lock, flags);
1156         list_for_each_entry(info, &domain->devices, link) {
1157                 if (!info->dev || !pci_ats_enabled(info->dev))
1158                         continue;
1159
1160                 sid = info->bus << 8 | info->devfn;
1161                 qdep = pci_ats_queue_depth(info->dev);
1162                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1163         }
1164         spin_unlock_irqrestore(&device_domain_lock, flags);
1165 }
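
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * the 16-bit source-id (PCI requester id) the loop above builds for
 * each ATS-capable device: bus in bits 15:8, device in bits 7:3,
 * function in bits 2:0.  The bus/slot/function numbers are made up.
 * Compile separately.
 */
#include <stdio.h>

int main(void)
{
        unsigned int bus = 0x3a, slot = 0x1f, func = 2; /* hypothetical 3a:1f.2 */
        unsigned int devfn = (slot << 3) | func;        /* PCI_DEVFN(slot, func) */
        unsigned int sid = (bus << 8) | devfn;

        printf("%02x:%02x.%u -> sid %#06x\n", bus, slot, func, sid);
        return 0;
}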
1166
1167 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1168                                   unsigned long pfn, unsigned int pages, int map)
1169 {
1170         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1171         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1172
1173         BUG_ON(pages == 0);
1174
1175         /*
1176          * Fall back to a domain-selective flush if there is no PSI support
1177          * or the size is too big.
1178          * PSI requires the page size to be a power of two, and the base
1179          * address to be naturally aligned to that size.
1180          */
1181         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1182                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1183                                                 DMA_TLB_DSI_FLUSH);
1184         else
1185                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1186                                                 DMA_TLB_PSI_FLUSH);
1187
1188         /*
1189          * In caching mode, changes of pages from non-present to present require
1190          * flush. However, device IOTLB doesn't need to be flushed in this case.
1191          */
1192         if (!cap_caching_mode(iommu->cap) || !map)
1193                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1194 }
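
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * how the code above chooses between a page-selective (PSI) and a
 * domain-selective (DSI) invalidation.  The mask behaves like
 * ilog2(__roundup_pow_of_two(pages)); the MAMV limit used here is a
 * made-up example.  Compile separately with gcc/clang (uses
 * __builtin_clz).
 */
#include <stdio.h>

static unsigned int demo_mask(unsigned int pages)
{
        /* ilog2(__roundup_pow_of_two(pages)): 1 -> 0, 2 -> 1, 5 -> 3, ... */
        return pages <= 1 ? 0 : 32 - __builtin_clz(pages - 1);
}

int main(void)
{
        unsigned int max_amask = 6;             /* hypothetical cap_max_amask_val() */
        unsigned int counts[] = { 1, 5, 64, 1024 };
        unsigned int i;

        for (i = 0; i < sizeof(counts) / sizeof(counts[0]); i++) {
                unsigned int mask = demo_mask(counts[i]);

                printf("%4u pages -> mask %2u -> %s\n", counts[i], mask,
                       mask > max_amask ? "DSI flush" : "PSI flush");
        }
        return 0;
}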
1195
1196 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1197 {
1198         u32 pmen;
1199         unsigned long flags;
1200
1201         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1202         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1203         pmen &= ~DMA_PMEN_EPM;
1204         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1205
1206         /* wait for the protected region status bit to clear */
1207         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1208                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1209
1210         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1211 }
1212
1213 static int iommu_enable_translation(struct intel_iommu *iommu)
1214 {
1215         u32 sts;
1216         unsigned long flags;
1217
1218         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1219         iommu->gcmd |= DMA_GCMD_TE;
1220         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1221
1222         /* Make sure the hardware completes it */
1223         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1224                       readl, (sts & DMA_GSTS_TES), sts);
1225
1226         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1227         return 0;
1228 }
1229
1230 static int iommu_disable_translation(struct intel_iommu *iommu)
1231 {
1232         u32 sts;
1233         unsigned long flag;
1234
1235         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1236         iommu->gcmd &= ~DMA_GCMD_TE;
1237         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1238
1239         /* Make sure the hardware completes it */
1240         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1241                       readl, (!(sts & DMA_GSTS_TES)), sts);
1242
1243         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1244         return 0;
1245 }
1246
1247
1248 static int iommu_init_domains(struct intel_iommu *iommu)
1249 {
1250         unsigned long ndomains;
1251         unsigned long nlongs;
1252
1253         ndomains = cap_ndoms(iommu->cap);
1254         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1255                         ndomains);
1256         nlongs = BITS_TO_LONGS(ndomains);
1257
1258         spin_lock_init(&iommu->lock);
1259
1260         /* TBD: there might be 64K domains,
1261          * consider other allocations for future chips
1262          */
1263         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1264         if (!iommu->domain_ids) {
1265                 printk(KERN_ERR "Allocating domain id array failed\n");
1266                 return -ENOMEM;
1267         }
1268         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1269                         GFP_KERNEL);
1270         if (!iommu->domains) {
1271                 printk(KERN_ERR "Allocating domain array failed\n");
1272                 return -ENOMEM;
1273         }
1274
1275         /*
1276          * if Caching mode is set, then invalid translations are tagged
1277          * with domain id 0. Hence we need to pre-allocate it.
1278          */
1279         if (cap_caching_mode(iommu->cap))
1280                 set_bit(0, iommu->domain_ids);
1281         return 0;
1282 }
1283
1284
1285 static void domain_exit(struct dmar_domain *domain);
1286 static void vm_domain_exit(struct dmar_domain *domain);
1287
1288 void free_dmar_iommu(struct intel_iommu *iommu)
1289 {
1290         struct dmar_domain *domain;
1291         int i;
1292         unsigned long flags;
1293
1294         if ((iommu->domains) && (iommu->domain_ids)) {
1295                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1296                         domain = iommu->domains[i];
1297                         clear_bit(i, iommu->domain_ids);
1298
1299                         spin_lock_irqsave(&domain->iommu_lock, flags);
1300                         if (--domain->iommu_count == 0) {
1301                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1302                                         vm_domain_exit(domain);
1303                                 else
1304                                         domain_exit(domain);
1305                         }
1306                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1307                 }
1308         }
1309
1310         if (iommu->gcmd & DMA_GCMD_TE)
1311                 iommu_disable_translation(iommu);
1312
1313         if (iommu->irq) {
1314                 irq_set_handler_data(iommu->irq, NULL);
1315                 /* This will mask the irq */
1316                 free_irq(iommu->irq, iommu);
1317                 destroy_irq(iommu->irq);
1318         }
1319
1320         kfree(iommu->domains);
1321         kfree(iommu->domain_ids);
1322
1323         g_iommus[iommu->seq_id] = NULL;
1324
1325         /* if all iommus are freed, free g_iommus */
1326         for (i = 0; i < g_num_of_iommus; i++) {
1327                 if (g_iommus[i])
1328                         break;
1329         }
1330
1331         if (i == g_num_of_iommus)
1332                 kfree(g_iommus);
1333
1334         /* free context mapping */
1335         free_context_table(iommu);
1336 }
1337
1338 static struct dmar_domain *alloc_domain(void)
1339 {
1340         struct dmar_domain *domain;
1341
1342         domain = alloc_domain_mem();
1343         if (!domain)
1344                 return NULL;
1345
1346         domain->nid = -1;
1347         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1348         domain->flags = 0;
1349
1350         return domain;
1351 }
1352
1353 static int iommu_attach_domain(struct dmar_domain *domain,
1354                                struct intel_iommu *iommu)
1355 {
1356         int num;
1357         unsigned long ndomains;
1358         unsigned long flags;
1359
1360         ndomains = cap_ndoms(iommu->cap);
1361
1362         spin_lock_irqsave(&iommu->lock, flags);
1363
1364         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1365         if (num >= ndomains) {
1366                 spin_unlock_irqrestore(&iommu->lock, flags);
1367                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1368                 return -ENOMEM;
1369         }
1370
1371         domain->id = num;
1372         set_bit(num, iommu->domain_ids);
1373         set_bit(iommu->seq_id, domain->iommu_bmp);
1374         iommu->domains[num] = domain;
1375         spin_unlock_irqrestore(&iommu->lock, flags);
1376
1377         return 0;
1378 }
1379
1380 static void iommu_detach_domain(struct dmar_domain *domain,
1381                                 struct intel_iommu *iommu)
1382 {
1383         unsigned long flags;
1384         int num, ndomains;
1385         int found = 0;
1386
1387         spin_lock_irqsave(&iommu->lock, flags);
1388         ndomains = cap_ndoms(iommu->cap);
1389         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1390                 if (iommu->domains[num] == domain) {
1391                         found = 1;
1392                         break;
1393                 }
1394         }
1395
1396         if (found) {
1397                 clear_bit(num, iommu->domain_ids);
1398                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1399                 iommu->domains[num] = NULL;
1400         }
1401         spin_unlock_irqrestore(&iommu->lock, flags);
1402 }
1403
1404 static struct iova_domain reserved_iova_list;
1405 static struct lock_class_key reserved_rbtree_key;
1406
1407 static int dmar_init_reserved_ranges(void)
1408 {
1409         struct pci_dev *pdev = NULL;
1410         struct iova *iova;
1411         int i;
1412
1413         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1414
1415         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1416                 &reserved_rbtree_key);
1417
1418         /* IOAPIC ranges shouldn't be accessed by DMA */
1419         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1420                 IOVA_PFN(IOAPIC_RANGE_END));
1421         if (!iova) {
1422                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1423                 return -ENODEV;
1424         }
1425
1426         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1427         for_each_pci_dev(pdev) {
1428                 struct resource *r;
1429
1430                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1431                         r = &pdev->resource[i];
1432                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1433                                 continue;
1434                         iova = reserve_iova(&reserved_iova_list,
1435                                             IOVA_PFN(r->start),
1436                                             IOVA_PFN(r->end));
1437                         if (!iova) {
1438                                 printk(KERN_ERR "Reserve iova failed\n");
1439                                 return -ENODEV;
1440                         }
1441                 }
1442         }
1443         return 0;
1444 }
1445
1446 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1447 {
1448         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1449 }
1450
1451 static inline int guestwidth_to_adjustwidth(int gaw)
1452 {
1453         int agaw;
1454         int r = (gaw - 12) % 9;
1455
1456         if (r == 0)
1457                 agaw = gaw;
1458         else
1459                 agaw = gaw + 9 - r;
1460         if (agaw > 64)
1461                 agaw = 64;
1462         return agaw;
1463 }
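
/*
 * Illustrative, standalone userspace sketch (not part of this driver):
 * the rounding guestwidth_to_adjustwidth() applies, i.e. pushing the
 * guest address width up to the next value where (width - 12) is a
 * multiple of the 9-bit level stride, capped at 64.  Compile
 * separately.
 */
#include <stdio.h>

static int demo_adjust(int gaw)
{
        int r = (gaw - 12) % 9;
        int agaw = r ? gaw + 9 - r : gaw;

        return agaw > 64 ? 64 : agaw;
}

int main(void)
{
        int widths[] = { 39, 48, 52, 62 };
        unsigned int i;

        for (i = 0; i < sizeof(widths) / sizeof(widths[0]); i++)
                printf("guest width %2d -> adjusted width %2d\n",
                       widths[i], demo_adjust(widths[i]));
        return 0;
}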
1464
1465 static int domain_init(struct dmar_domain *domain, int guest_width)
1466 {
1467         struct intel_iommu *iommu;
1468         int adjust_width, agaw;
1469         unsigned long sagaw;
1470
1471         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1472         spin_lock_init(&domain->iommu_lock);
1473
1474         domain_reserve_special_ranges(domain);
1475
1476         /* calculate AGAW */
1477         iommu = domain_get_iommu(domain);
1478         if (guest_width > cap_mgaw(iommu->cap))
1479                 guest_width = cap_mgaw(iommu->cap);
1480         domain->gaw = guest_width;
1481         adjust_width = guestwidth_to_adjustwidth(guest_width);
1482         agaw = width_to_agaw(adjust_width);
1483         sagaw = cap_sagaw(iommu->cap);
1484         if (!test_bit(agaw, &sagaw)) {
1485                 /* hardware doesn't support it, choose a bigger one */
1486                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1487                 agaw = find_next_bit(&sagaw, 5, agaw);
1488                 if (agaw >= 5)
1489                         return -ENODEV;
1490         }
1491         domain->agaw = agaw;
1492         INIT_LIST_HEAD(&domain->devices);
1493
1494         if (ecap_coherent(iommu->ecap))
1495                 domain->iommu_coherency = 1;
1496         else
1497                 domain->iommu_coherency = 0;
1498
1499         if (ecap_sc_support(iommu->ecap))
1500                 domain->iommu_snooping = 1;
1501         else
1502                 domain->iommu_snooping = 0;
1503
1504         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1505         domain->iommu_count = 1;
1506         domain->nid = iommu->node;
1507
1508         /* always allocate the top pgd */
1509         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1510         if (!domain->pgd)
1511                 return -ENOMEM;
1512         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1513         return 0;
1514 }
1515
1516 static void domain_exit(struct dmar_domain *domain)
1517 {
1518         struct dmar_drhd_unit *drhd;
1519         struct intel_iommu *iommu;
1520
1521         /* Domain 0 is reserved, so don't process it */
1522         if (!domain)
1523                 return;
1524
1525         /* Flush any lazy unmaps that may reference this domain */
1526         if (!intel_iommu_strict)
1527                 flush_unmaps_timeout(0);
1528
1529         domain_remove_dev_info(domain);
1530         /* destroy iovas */
1531         put_iova_domain(&domain->iovad);
1532
1533         /* clear ptes */
1534         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1535
1536         /* free page tables */
1537         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1538
1539         for_each_active_iommu(iommu, drhd)
1540                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1541                         iommu_detach_domain(domain, iommu);
1542
1543         free_domain_mem(domain);
1544 }
1545
1546 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1547                                  u8 bus, u8 devfn, int translation)
1548 {
1549         struct context_entry *context;
1550         unsigned long flags;
1551         struct intel_iommu *iommu;
1552         struct dma_pte *pgd;
1553         unsigned long num;
1554         unsigned long ndomains;
1555         int id;
1556         int agaw;
1557         struct device_domain_info *info = NULL;
1558
1559         pr_debug("Set context mapping for %02x:%02x.%d\n",
1560                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1561
1562         BUG_ON(!domain->pgd);
1563         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1564                translation != CONTEXT_TT_MULTI_LEVEL);
1565
1566         iommu = device_to_iommu(segment, bus, devfn);
1567         if (!iommu)
1568                 return -ENODEV;
1569
1570         context = device_to_context_entry(iommu, bus, devfn);
1571         if (!context)
1572                 return -ENOMEM;
1573         spin_lock_irqsave(&iommu->lock, flags);
1574         if (context_present(context)) {
1575                 spin_unlock_irqrestore(&iommu->lock, flags);
1576                 return 0;
1577         }
1578
1579         id = domain->id;
1580         pgd = domain->pgd;
1581
1582         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1583             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1584                 int found = 0;
1585
1586                 /* find an available domain id for this device in iommu */
1587                 ndomains = cap_ndoms(iommu->cap);
1588                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1589                         if (iommu->domains[num] == domain) {
1590                                 id = num;
1591                                 found = 1;
1592                                 break;
1593                         }
1594                 }
1595
1596                 if (found == 0) {
1597                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1598                         if (num >= ndomains) {
1599                                 spin_unlock_irqrestore(&iommu->lock, flags);
1600                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1601                                 return -EFAULT;
1602                         }
1603
1604                         set_bit(num, iommu->domain_ids);
1605                         iommu->domains[num] = domain;
1606                         id = num;
1607                 }
1608
1609                 /* Skip top levels of page tables for an
1610                  * iommu which has a smaller agaw than the default.
1611                  * Unnecessary for PT mode.
1612                  */
1613                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1614                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1615                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1616                                 if (!dma_pte_present(pgd)) {
1617                                         spin_unlock_irqrestore(&iommu->lock, flags);
1618                                         return -ENOMEM;
1619                                 }
1620                         }
1621                 }
1622         }
1623
1624         context_set_domain_id(context, id);
1625
1626         if (translation != CONTEXT_TT_PASS_THROUGH) {
1627                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1628                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1629                                      CONTEXT_TT_MULTI_LEVEL;
1630         }
1631         /*
1632          * In pass through mode, AW must be programmed to indicate the largest
1633          * AGAW value supported by hardware. And ASR is ignored by hardware.
1634          */
1635         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1636                 context_set_address_width(context, iommu->msagaw);
1637         else {
1638                 context_set_address_root(context, virt_to_phys(pgd));
1639                 context_set_address_width(context, iommu->agaw);
1640         }
1641
1642         context_set_translation_type(context, translation);
1643         context_set_fault_enable(context);
1644         context_set_present(context);
1645         domain_flush_cache(domain, context, sizeof(*context));
1646
1647         /*
1648          * It's a non-present to present mapping. If hardware doesn't cache
1649          * non-present entries we only need to flush the write-buffer. If it
1650          * _does_ cache non-present entries, then it does so in the special
1651          * domain #0, which we have to flush:
1652          */
1653         if (cap_caching_mode(iommu->cap)) {
1654                 iommu->flush.flush_context(iommu, 0,
1655                                            (((u16)bus) << 8) | devfn,
1656                                            DMA_CCMD_MASK_NOBIT,
1657                                            DMA_CCMD_DEVICE_INVL);
1658                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1659         } else {
1660                 iommu_flush_write_buffer(iommu);
1661         }
1662         iommu_enable_dev_iotlb(info);
1663         spin_unlock_irqrestore(&iommu->lock, flags);
1664
1665         spin_lock_irqsave(&domain->iommu_lock, flags);
1666         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1667                 domain->iommu_count++;
1668                 if (domain->iommu_count == 1)
1669                         domain->nid = iommu->node;
1670                 domain_update_iommu_cap(domain);
1671         }
1672         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1673         return 0;
1674 }
1675
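     /*
      * domain_context_mapping() - set up context entries for @pdev itself
      * and, if it sits behind a PCIe-to-PCI bridge, for every bridge on the
      * path up to and including that bridge, since devices behind such a
      * bridge may issue requests carrying the bridge's source-id.
      */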
1676 static int
1677 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1678                         int translation)
1679 {
1680         int ret;
1681         struct pci_dev *tmp, *parent;
1682
1683         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1684                                          pdev->bus->number, pdev->devfn,
1685                                          translation);
1686         if (ret)
1687                 return ret;
1688
1689         /* dependent device mapping */
1690         tmp = pci_find_upstream_pcie_bridge(pdev);
1691         if (!tmp)
1692                 return 0;
1693         /* Secondary interface's bus number and devfn 0 */
1694         parent = pdev->bus->self;
1695         while (parent != tmp) {
1696                 ret = domain_context_mapping_one(domain,
1697                                                  pci_domain_nr(parent->bus),
1698                                                  parent->bus->number,
1699                                                  parent->devfn, translation);
1700                 if (ret)
1701                         return ret;
1702                 parent = parent->bus->self;
1703         }
1704         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1705                 return domain_context_mapping_one(domain,
1706                                         pci_domain_nr(tmp->subordinate),
1707                                         tmp->subordinate->number, 0,
1708                                         translation);
1709         else /* this is a legacy PCI bridge */
1710                 return domain_context_mapping_one(domain,
1711                                                   pci_domain_nr(tmp->bus),
1712                                                   tmp->bus->number,
1713                                                   tmp->devfn,
1714                                                   translation);
1715 }
1716
1717 static int domain_context_mapped(struct pci_dev *pdev)
1718 {
1719         int ret;
1720         struct pci_dev *tmp, *parent;
1721         struct intel_iommu *iommu;
1722
1723         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1724                                 pdev->devfn);
1725         if (!iommu)
1726                 return -ENODEV;
1727
1728         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1729         if (!ret)
1730                 return ret;
1731         /* dependent device mapping */
1732         tmp = pci_find_upstream_pcie_bridge(pdev);
1733         if (!tmp)
1734                 return ret;
1735         /* Secondary interface's bus number and devfn 0 */
1736         parent = pdev->bus->self;
1737         while (parent != tmp) {
1738                 ret = device_context_mapped(iommu, parent->bus->number,
1739                                             parent->devfn);
1740                 if (!ret)
1741                         return ret;
1742                 parent = parent->bus->self;
1743         }
1744         if (pci_is_pcie(tmp))
1745                 return device_context_mapped(iommu, tmp->subordinate->number,
1746                                              0);
1747         else
1748                 return device_context_mapped(iommu, tmp->bus->number,
1749                                              tmp->devfn);
1750 }
1751
1752 /* Returns a number of VTD pages, but aligned to MM page size */
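     /*
      * Worked example (assuming 4KiB pages on both sides): for
      * host_addr = 0x1234 and size = 0x2000 the in-page offset 0x234 is
      * kept, 0x2234 is rounded up to 0x3000, and 3 VT-d pages are returned.
      */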
1753 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1754                                             size_t size)
1755 {
1756         host_addr &= ~PAGE_MASK;
1757         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1758 }
1759
1760 /* Return largest possible superpage level for a given mapping */
1761 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1762                                           unsigned long iov_pfn,
1763                                           unsigned long phy_pfn,
1764                                           unsigned long pages)
1765 {
1766         int support, level = 1;
1767         unsigned long pfnmerge;
1768
1769         support = domain->iommu_superpage;
1770
1771         /* To use a large page, the virtual *and* physical addresses
1772            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1773            of them will mean we have to use smaller pages. So just
1774            merge them and check both at once. */
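             /*
              * For example, with a 9-bit stride: a request of at least 512
              * pages whose IOVA and physical PFNs both have their low 9
              * bits clear (2MiB aligned) yields level 2, i.e. a 2MiB
              * superpage, provided the hardware advertises superpage
              * support.
              */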
1775         pfnmerge = iov_pfn | phy_pfn;
1776
1777         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1778                 pages >>= VTD_STRIDE_SHIFT;
1779                 if (!pages)
1780                         break;
1781                 pfnmerge >>= VTD_STRIDE_SHIFT;
1782                 level++;
1783                 support--;
1784         }
1785         return level;
1786 }
1787
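     /*
      * __domain_mapping() - install PTEs for @nr_pages VT-d pages starting
      * at @iov_pfn, taking the physical pages either from @sg (scatterlist
      * mode) or from the contiguous range starting at @phys_pfn.
      * Superpages are used whenever hardware support, alignment and the
      * remaining length allow, and the CPU cache is flushed whenever a page
      * of PTEs has been filled or the mapping is complete.
      */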
1788 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1789                             struct scatterlist *sg, unsigned long phys_pfn,
1790                             unsigned long nr_pages, int prot)
1791 {
1792         struct dma_pte *first_pte = NULL, *pte = NULL;
1793         phys_addr_t uninitialized_var(pteval);
1794         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1795         unsigned long sg_res;
1796         unsigned int largepage_lvl = 0;
1797         unsigned long lvl_pages = 0;
1798
1799         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1800
1801         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1802                 return -EINVAL;
1803
1804         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1805
1806         if (sg)
1807                 sg_res = 0;
1808         else {
1809                 sg_res = nr_pages + 1;
1810                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1811         }
1812
1813         while (nr_pages > 0) {
1814                 uint64_t tmp;
1815
1816                 if (!sg_res) {
1817                         sg_res = aligned_nrpages(sg->offset, sg->length);
1818                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1819                         sg->dma_length = sg->length;
1820                         pteval = page_to_phys(sg_page(sg)) | prot;
1821                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1822                 }
1823
1824                 if (!pte) {
1825                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1826
1827                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1828                         if (!pte)
1829                                 return -ENOMEM;
1830                         /* It is a large page */
1831                         if (largepage_lvl > 1) {
1832                                 pteval |= DMA_PTE_LARGE_PAGE;
1833                                 /* Ensure that old small page tables are removed to make room
1834                                    for superpage, if they exist. */
1835                                 dma_pte_clear_range(domain, iov_pfn,
1836                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1837                                 dma_pte_free_pagetable(domain, iov_pfn,
1838                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1839                         } else {
1840                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1841                         }
1842
1843                 }
1844                 /* We don't need a lock here; nobody else
1845                  * touches the iova range
1846                  */
1847                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1848                 if (tmp) {
1849                         static int dumps = 5;
1850                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1851                                iov_pfn, tmp, (unsigned long long)pteval);
1852                         if (dumps) {
1853                                 dumps--;
1854                                 debug_dma_dump_mappings(NULL);
1855                         }
1856                         WARN_ON(1);
1857                 }
1858
1859                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1860
1861                 BUG_ON(nr_pages < lvl_pages);
1862                 BUG_ON(sg_res < lvl_pages);
1863
1864                 nr_pages -= lvl_pages;
1865                 iov_pfn += lvl_pages;
1866                 phys_pfn += lvl_pages;
1867                 pteval += lvl_pages * VTD_PAGE_SIZE;
1868                 sg_res -= lvl_pages;
1869
1870                 /* If the next PTE would be the first in a new page, then we
1871                    need to flush the cache on the entries we've just written.
1872                    And then we'll need to recalculate 'pte', so clear it and
1873                    let it get set again in the if (!pte) block above.
1874
1875                    If we're done (!nr_pages) we need to flush the cache too.
1876
1877                    Also if we've been setting superpages, we may need to
1878                    recalculate 'pte' and switch back to smaller pages for the
1879                    end of the mapping, if the trailing size is not enough to
1880                    use another superpage (i.e. sg_res < lvl_pages). */
1881                 pte++;
1882                 if (!nr_pages || first_pte_in_page(pte) ||
1883                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1884                         domain_flush_cache(domain, first_pte,
1885                                            (void *)pte - (void *)first_pte);
1886                         pte = NULL;
1887                 }
1888
1889                 if (!sg_res && nr_pages)
1890                         sg = sg_next(sg);
1891         }
1892         return 0;
1893 }
1894
1895 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1896                                     struct scatterlist *sg, unsigned long nr_pages,
1897                                     int prot)
1898 {
1899         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1900 }
1901
1902 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1903                                      unsigned long phys_pfn, unsigned long nr_pages,
1904                                      int prot)
1905 {
1906         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1907 }
1908
1909 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1910 {
1911         if (!iommu)
1912                 return;
1913
1914         clear_context_table(iommu, bus, devfn);
1915         iommu->flush.flush_context(iommu, 0, 0, 0,
1916                                            DMA_CCMD_GLOBAL_INVL);
1917         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1918 }
1919
1920 static inline void unlink_domain_info(struct device_domain_info *info)
1921 {
1922         assert_spin_locked(&device_domain_lock);
1923         list_del(&info->link);
1924         list_del(&info->global);
1925         if (info->dev)
1926                 info->dev->dev.archdata.iommu = NULL;
1927 }
1928
1929 static void domain_remove_dev_info(struct dmar_domain *domain)
1930 {
1931         struct device_domain_info *info;
1932         unsigned long flags;
1933         struct intel_iommu *iommu;
1934
1935         spin_lock_irqsave(&device_domain_lock, flags);
1936         while (!list_empty(&domain->devices)) {
1937                 info = list_entry(domain->devices.next,
1938                         struct device_domain_info, link);
1939                 unlink_domain_info(info);
1940                 spin_unlock_irqrestore(&device_domain_lock, flags);
1941
1942                 iommu_disable_dev_iotlb(info);
1943                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1944                 iommu_detach_dev(iommu, info->bus, info->devfn);
1945                 free_devinfo_mem(info);
1946
1947                 spin_lock_irqsave(&device_domain_lock, flags);
1948         }
1949         spin_unlock_irqrestore(&device_domain_lock, flags);
1950 }
1951
1952 /*
1953  * find_domain
1954  * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1955  */
1956 static struct dmar_domain *
1957 find_domain(struct pci_dev *pdev)
1958 {
1959         struct device_domain_info *info;
1960
1961         /* No lock here, assumes no domain exit in normal case */
1962         info = pdev->dev.archdata.iommu;
1963         if (info)
1964                 return info->domain;
1965         return NULL;
1966 }
1967
1968 /* Find or allocate a fully initialized domain for the device */
1969 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1970 {
1971         struct dmar_domain *domain, *found = NULL;
1972         struct intel_iommu *iommu;
1973         struct dmar_drhd_unit *drhd;
1974         struct device_domain_info *info, *tmp;
1975         struct pci_dev *dev_tmp;
1976         unsigned long flags;
1977         int bus = 0, devfn = 0;
1978         int segment;
1979         int ret;
1980
1981         domain = find_domain(pdev);
1982         if (domain)
1983                 return domain;
1984
1985         segment = pci_domain_nr(pdev->bus);
1986
1987         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1988         if (dev_tmp) {
1989                 if (pci_is_pcie(dev_tmp)) {
1990                         bus = dev_tmp->subordinate->number;
1991                         devfn = 0;
1992                 } else {
1993                         bus = dev_tmp->bus->number;
1994                         devfn = dev_tmp->devfn;
1995                 }
1996                 spin_lock_irqsave(&device_domain_lock, flags);
1997                 list_for_each_entry(info, &device_domain_list, global) {
1998                         if (info->segment == segment &&
1999                             info->bus == bus && info->devfn == devfn) {
2000                                 found = info->domain;
2001                                 break;
2002                         }
2003                 }
2004                 spin_unlock_irqrestore(&device_domain_lock, flags);
2005                 /* pcie-pci bridge already has a domain, use it */
2006                 if (found) {
2007                         domain = found;
2008                         goto found_domain;
2009                 }
2010         }
2011
2012         domain = alloc_domain();
2013         if (!domain)
2014                 goto error;
2015
2016         /* Allocate new domain for the device */
2017         drhd = dmar_find_matched_drhd_unit(pdev);
2018         if (!drhd) {
2019                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2020                         pci_name(pdev));
2021                 free_domain_mem(domain);
2022                 return NULL;
2023         }
2024         iommu = drhd->iommu;
2025
2026         ret = iommu_attach_domain(domain, iommu);
2027         if (ret) {
2028                 free_domain_mem(domain);
2029                 goto error;
2030         }
2031
2032         if (domain_init(domain, gaw)) {
2033                 domain_exit(domain);
2034                 goto error;
2035         }
2036
2037         /* register pcie-to-pci device */
2038         if (dev_tmp) {
2039                 info = alloc_devinfo_mem();
2040                 if (!info) {
2041                         domain_exit(domain);
2042                         goto error;
2043                 }
2044                 info->segment = segment;
2045                 info->bus = bus;
2046                 info->devfn = devfn;
2047                 info->dev = NULL;
2048                 info->domain = domain;
2049                 /* This domain is shared by devices under p2p bridge */
2050                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2051
2052                 /* pcie-to-pci bridge already has a domain, use it */
2053                 found = NULL;
2054                 spin_lock_irqsave(&device_domain_lock, flags);
2055                 list_for_each_entry(tmp, &device_domain_list, global) {
2056                         if (tmp->segment == segment &&
2057                             tmp->bus == bus && tmp->devfn == devfn) {
2058                                 found = tmp->domain;
2059                                 break;
2060                         }
2061                 }
2062                 if (found) {
2063                         spin_unlock_irqrestore(&device_domain_lock, flags);
2064                         free_devinfo_mem(info);
2065                         domain_exit(domain);
2066                         domain = found;
2067                 } else {
2068                         list_add(&info->link, &domain->devices);
2069                         list_add(&info->global, &device_domain_list);
2070                         spin_unlock_irqrestore(&device_domain_lock, flags);
2071                 }
2072         }
2073
2074 found_domain:
2075         info = alloc_devinfo_mem();
2076         if (!info)
2077                 goto error;
2078         info->segment = segment;
2079         info->bus = pdev->bus->number;
2080         info->devfn = pdev->devfn;
2081         info->dev = pdev;
2082         info->domain = domain;
2083         spin_lock_irqsave(&device_domain_lock, flags);
2084         /* somebody else was faster and already set up a domain */
2085         found = find_domain(pdev);
2086         if (found != NULL) {
2087                 spin_unlock_irqrestore(&device_domain_lock, flags);
2088                 if (found != domain) {
2089                         domain_exit(domain);
2090                         domain = found;
2091                 }
2092                 free_devinfo_mem(info);
2093                 return domain;
2094         }
2095         list_add(&info->link, &domain->devices);
2096         list_add(&info->global, &device_domain_list);
2097         pdev->dev.archdata.iommu = info;
2098         spin_unlock_irqrestore(&device_domain_lock, flags);
2099         return domain;
2100 error:
2101         /* recheck it here, maybe others set it */
2102         return find_domain(pdev);
2103 }
2104
2105 static int iommu_identity_mapping;
2106 #define IDENTMAP_ALL            1
2107 #define IDENTMAP_GFX            2
2108 #define IDENTMAP_AZALIA         4
2109
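     /*
      * iommu_domain_identity_map() - reserve the IOVA range covering
      * [start, end] and install a 1:1 (virtual == physical) read/write
      * mapping for it, clearing any PTEs that already cover the range.
      */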
2110 static int iommu_domain_identity_map(struct dmar_domain *domain,
2111                                      unsigned long long start,
2112                                      unsigned long long end)
2113 {
2114         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2115         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2116
2117         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2118                           dma_to_mm_pfn(last_vpfn))) {
2119                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2120                 return -ENOMEM;
2121         }
2122
2123         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2124                  start, end, domain->id);
2125         /*
2126          * RMRR range might have overlap with physical memory range,
2127          * clear it first
2128          */
2129         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2130
2131         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2132                                   last_vpfn - first_vpfn + 1,
2133                                   DMA_PTE_READ|DMA_PTE_WRITE);
2134 }
2135
2136 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2137                                       unsigned long long start,
2138                                       unsigned long long end)
2139 {
2140         struct dmar_domain *domain;
2141         int ret;
2142
2143         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2144         if (!domain)
2145                 return -ENOMEM;
2146
2147         /* For _hardware_ passthrough, don't bother. But for software
2148            passthrough, we do it anyway -- it may indicate a memory
2149            range which is reserved in E820 and so didn't get set
2150            up to start with in si_domain */
2151         if (domain == si_domain && hw_pass_through) {
2152                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2153                        pci_name(pdev), start, end);
2154                 return 0;
2155         }
2156
2157         printk(KERN_INFO
2158                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2159                pci_name(pdev), start, end);
2160         
2161         if (end < start) {
2162                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2163                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2164                         dmi_get_system_info(DMI_BIOS_VENDOR),
2165                         dmi_get_system_info(DMI_BIOS_VERSION),
2166                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2167                 ret = -EIO;
2168                 goto error;
2169         }
2170
2171         if (end >> agaw_to_width(domain->agaw)) {
2172                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2173                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2174                      agaw_to_width(domain->agaw),
2175                      dmi_get_system_info(DMI_BIOS_VENDOR),
2176                      dmi_get_system_info(DMI_BIOS_VERSION),
2177                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2178                 ret = -EIO;
2179                 goto error;
2180         }
2181
2182         ret = iommu_domain_identity_map(domain, start, end);
2183         if (ret)
2184                 goto error;
2185
2186         /* context entry init */
2187         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2188         if (ret)
2189                 goto error;
2190
2191         return 0;
2192
2193  error:
2194         domain_exit(domain);
2195         return ret;
2196 }
2197
2198 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2199         struct pci_dev *pdev)
2200 {
2201         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2202                 return 0;
2203         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2204                 rmrr->end_address);
2205 }
2206
2207 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2208 static inline void iommu_prepare_isa(void)
2209 {
2210         struct pci_dev *pdev;
2211         int ret;
2212
2213         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2214         if (!pdev)
2215                 return;
2216
2217         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2218         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2219
2220         if (ret)
2221                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2222                        "floppy might not work\n");
2223
2224 }
2225 #else
2226 static inline void iommu_prepare_isa(void)
2227 {
2228         return;
2229 }
2230 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2231
2232 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2233
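     /*
      * si_domain_init() - allocate the static identity (si) domain, attach
      * it to every active IOMMU and, unless hardware pass-through is in
      * use, build 1:1 mappings for every usable memory range of every
      * online node.
      */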
2234 static int __init si_domain_init(int hw)
2235 {
2236         struct dmar_drhd_unit *drhd;
2237         struct intel_iommu *iommu;
2238         int nid, ret = 0;
2239
2240         si_domain = alloc_domain();
2241         if (!si_domain)
2242                 return -EFAULT;
2243
2244         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2245
2246         for_each_active_iommu(iommu, drhd) {
2247                 ret = iommu_attach_domain(si_domain, iommu);
2248                 if (ret) {
2249                         domain_exit(si_domain);
2250                         return -EFAULT;
2251                 }
2252         }
2253
2254         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2255                 domain_exit(si_domain);
2256                 return -EFAULT;
2257         }
2258
2259         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2260
2261         if (hw)
2262                 return 0;
2263
2264         for_each_online_node(nid) {
2265                 unsigned long start_pfn, end_pfn;
2266                 int i;
2267
2268                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2269                         ret = iommu_domain_identity_map(si_domain,
2270                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2271                         if (ret)
2272                                 return ret;
2273                 }
2274         }
2275
2276         return 0;
2277 }
2278
2279 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2280                                           struct pci_dev *pdev);
2281 static int identity_mapping(struct pci_dev *pdev)
2282 {
2283         struct device_domain_info *info;
2284
2285         if (likely(!iommu_identity_mapping))
2286                 return 0;
2287
2288         info = pdev->dev.archdata.iommu;
2289         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2290                 return (info->domain == si_domain);
2291
2292         return 0;
2293 }
2294
2295 static int domain_add_dev_info(struct dmar_domain *domain,
2296                                struct pci_dev *pdev,
2297                                int translation)
2298 {
2299         struct device_domain_info *info;
2300         unsigned long flags;
2301         int ret;
2302
2303         info = alloc_devinfo_mem();
2304         if (!info)
2305                 return -ENOMEM;
2306
2307         info->segment = pci_domain_nr(pdev->bus);
2308         info->bus = pdev->bus->number;
2309         info->devfn = pdev->devfn;
2310         info->dev = pdev;
2311         info->domain = domain;
2312
2313         spin_lock_irqsave(&device_domain_lock, flags);
2314         list_add(&info->link, &domain->devices);
2315         list_add(&info->global, &device_domain_list);
2316         pdev->dev.archdata.iommu = info;
2317         spin_unlock_irqrestore(&device_domain_lock, flags);
2318
2319         ret = domain_context_mapping(domain, pdev, translation);
2320         if (ret) {
2321                 spin_lock_irqsave(&device_domain_lock, flags);
2322                 unlink_domain_info(info);
2323                 spin_unlock_irqrestore(&device_domain_lock, flags);
2324                 free_devinfo_mem(info);
2325                 return ret;
2326         }
2327
2328         return 0;
2329 }
2330
2331 static bool device_has_rmrr(struct pci_dev *dev)
2332 {
2333         struct dmar_rmrr_unit *rmrr;
2334         int i;
2335
2336         for_each_rmrr_units(rmrr) {
2337                 for (i = 0; i < rmrr->devices_cnt; i++) {
2338                         /*
2339                          * Return TRUE if this RMRR contains the device that
2340                          * is passed in.
2341                          */
2342                         if (rmrr->devices[i] == dev)
2343                                 return true;
2344                 }
2345         }
2346         return false;
2347 }
2348
2349 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2350 {
2351
2352         /*
2353          * We want to prevent any device associated with an RMRR from
2354          * getting placed into the SI Domain. This is done because
2355          * problems exist when devices are moved in and out of domains
2356          * and their respective RMRR info is lost. We exempt USB devices
2357          * from this process due to their usage of RMRRs that are known
2358          * to not be needed after BIOS hand-off to OS.
2359          */
2360         if (device_has_rmrr(pdev) &&
2361             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2362                 return 0;
2363
2364         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2365                 return 1;
2366
2367         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2368                 return 1;
2369
2370         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2371                 return 0;
2372
2373         /*
2374          * We want to start off with all devices in the 1:1 domain, and
2375          * take them out later if we find they can't access all of memory.
2376          *
2377          * However, we can't do this for PCI devices behind bridges,
2378          * because all PCI devices behind the same bridge will end up
2379          * with the same source-id on their transactions.
2380          *
2381          * Practically speaking, we can't change things around for these
2382          * devices at run-time, because we can't be sure there'll be no
2383          * DMA transactions in flight for any of their siblings.
2384          * 
2385          * So PCI devices (unless they're on the root bus) as well as
2386          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2387          * the 1:1 domain, just in _case_ one of their siblings turns out
2388          * not to be able to map all of memory.
2389          */
2390         if (!pci_is_pcie(pdev)) {
2391                 if (!pci_is_root_bus(pdev->bus))
2392                         return 0;
2393                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2394                         return 0;
2395         } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2396                 return 0;
2397
2398         /* 
2399          * At boot time, we don't yet know if devices will be 64-bit capable.
2400          * Assume that they will -- if they turn out not to be, then we can 
2401          * take them out of the 1:1 domain later.
2402          */
2403         if (!startup) {
2404                 /*
2405                  * If the device's dma_mask is less than the system's memory
2406                  * size then this is not a candidate for identity mapping.
2407                  */
2408                 u64 dma_mask = pdev->dma_mask;
2409
2410                 if (pdev->dev.coherent_dma_mask &&
2411                     pdev->dev.coherent_dma_mask < dma_mask)
2412                         dma_mask = pdev->dev.coherent_dma_mask;
2413
2414                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2415         }
2416
2417         return 1;
2418 }
2419
2420 static int __init iommu_prepare_static_identity_mapping(int hw)
2421 {
2422         struct pci_dev *pdev = NULL;
2423         int ret;
2424
2425         ret = si_domain_init(hw);
2426         if (ret)
2427                 return -EFAULT;
2428
2429         for_each_pci_dev(pdev) {
2430                 if (iommu_should_identity_map(pdev, 1)) {
2431                         ret = domain_add_dev_info(si_domain, pdev,
2432                                              hw ? CONTEXT_TT_PASS_THROUGH :
2433                                                   CONTEXT_TT_MULTI_LEVEL);
2434                         if (ret) {
2435                                 /* device not associated with an iommu */
2436                                 if (ret == -ENODEV)
2437                                         continue;
2438                                 return ret;
2439                         }
2440                         pr_info("IOMMU: %s identity mapping for device %s\n",
2441                                 hw ? "hardware" : "software", pci_name(pdev));
2442                 }
2443         }
2444
2445         return 0;
2446 }
2447
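     /*
      * init_dmars() - core VT-d initialisation: allocate the global IOMMU
      * and deferred-flush arrays, set up per-IOMMU domains and root
      * entries, pick queued or register-based invalidation, create the
      * static identity, RMRR and ISA mappings as configured, then enable
      * fault reporting and DMA translation on each unit.
      */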
2448 static int __init init_dmars(void)
2449 {
2450         struct dmar_drhd_unit *drhd;
2451         struct dmar_rmrr_unit *rmrr;
2452         struct pci_dev *pdev;
2453         struct intel_iommu *iommu;
2454         int i, ret;
2455
2456         /*
2457          * for each drhd
2458          *    allocate root
2459          *    initialize and program root entry to not present
2460          * endfor
2461          */
2462         for_each_drhd_unit(drhd) {
2463                 /*
2464                  * lock not needed as this is only incremented in the
2465                  * single-threaded kernel __init code path; all other
2466                  * accesses are read only
2467                  */
2468                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2469                         g_num_of_iommus++;
2470                         continue;
2471                 }
2472                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2473                           IOMMU_UNITS_SUPPORTED);
2474         }
2475
2476         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2477                         GFP_KERNEL);
2478         if (!g_iommus) {
2479                 printk(KERN_ERR "Allocating global iommu array failed\n");
2480                 ret = -ENOMEM;
2481                 goto error;
2482         }
2483
2484         deferred_flush = kzalloc(g_num_of_iommus *
2485                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2486         if (!deferred_flush) {
2487                 ret = -ENOMEM;
2488                 goto error;
2489         }
2490
2491         for_each_drhd_unit(drhd) {
2492                 if (drhd->ignored)
2493                         continue;
2494
2495                 iommu = drhd->iommu;
2496                 g_iommus[iommu->seq_id] = iommu;
2497
2498                 ret = iommu_init_domains(iommu);
2499                 if (ret)
2500                         goto error;
2501
2502                 /*
2503                  * TBD:
2504                  * we could share the same root & context tables
2505                  * among all IOMMUs. Need to split it later.
2506                  */
2507                 ret = iommu_alloc_root_entry(iommu);
2508                 if (ret) {
2509                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2510                         goto error;
2511                 }
2512                 if (!ecap_pass_through(iommu->ecap))
2513                         hw_pass_through = 0;
2514         }
2515
2516         /*
2517          * Start from a sane IOMMU hardware state.
2518          */
2519         for_each_drhd_unit(drhd) {
2520                 if (drhd->ignored)
2521                         continue;
2522
2523                 iommu = drhd->iommu;
2524
2525                 /*
2526                  * If the queued invalidation is already initialized by us
2527                  * (for example, while enabling interrupt-remapping) then
2528                  * things are already rolling from a sane state.
2529                  */
2530                 if (iommu->qi)
2531                         continue;
2532
2533                 /*
2534                  * Clear any previous faults.
2535                  */
2536                 dmar_fault(-1, iommu);
2537                 /*
2538                  * Disable queued invalidation if supported and already enabled
2539                  * before OS handover.
2540                  */
2541                 dmar_disable_qi(iommu);
2542         }
2543
2544         for_each_drhd_unit(drhd) {
2545                 if (drhd->ignored)
2546                         continue;
2547
2548                 iommu = drhd->iommu;
2549
2550                 if (dmar_enable_qi(iommu)) {
2551                         /*
2552                          * Queued Invalidate not enabled, use Register Based
2553                          * Invalidate
2554                          */
2555                         iommu->flush.flush_context = __iommu_flush_context;
2556                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2557                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2558                                "invalidation\n",
2559                                 iommu->seq_id,
2560                                (unsigned long long)drhd->reg_base_addr);
2561                 } else {
2562                         iommu->flush.flush_context = qi_flush_context;
2563                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2564                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2565                                "invalidation\n",
2566                                 iommu->seq_id,
2567                                (unsigned long long)drhd->reg_base_addr);
2568                 }
2569         }
2570
2571         if (iommu_pass_through)
2572                 iommu_identity_mapping |= IDENTMAP_ALL;
2573
2574 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2575         iommu_identity_mapping |= IDENTMAP_GFX;
2576 #endif
2577
2578         check_tylersburg_isoch();
2579
2580         /*
2581          * If pass-through is not set or not enabled, set up context entries
2582          * for identity mappings for rmrr, gfx and isa, and possibly fall back
2583          * to static identity mapping if iommu_identity_mapping is set.
2584          */
2585         if (iommu_identity_mapping) {
2586                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2587                 if (ret) {
2588                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2589                         goto error;
2590                 }
2591         }
2592         /*
2593          * For each rmrr
2594          *   for each dev attached to rmrr
2595          *   do
2596          *     locate drhd for dev, alloc domain for dev
2597          *     allocate free domain
2598          *     allocate page table entries for rmrr
2599          *     if context not allocated for bus
2600          *           allocate and init context
2601          *           set present in root table for this bus
2602          *     init context with domain, translation etc
2603          *    endfor
2604          * endfor
2605          */
2606         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2607         for_each_rmrr_units(rmrr) {
2608                 for (i = 0; i < rmrr->devices_cnt; i++) {
2609                         pdev = rmrr->devices[i];
2610                         /*
2611                          * some BIOSes list non-existent devices in the DMAR
2612                          * table.
2613                          */
2614                         if (!pdev)
2615                                 continue;
2616                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2617                         if (ret)
2618                                 printk(KERN_ERR
2619                                        "IOMMU: mapping reserved region failed\n");
2620                 }
2621         }
2622
2623         iommu_prepare_isa();
2624
2625         /*
2626          * for each drhd
2627          *   enable fault log
2628          *   global invalidate context cache
2629          *   global invalidate iotlb
2630          *   enable translation
2631          */
2632         for_each_drhd_unit(drhd) {
2633                 if (drhd->ignored) {
2634                         /*
2635                          * we always have to disable PMRs or DMA may fail on
2636                          * this device
2637                          */
2638                         if (force_on)
2639                                 iommu_disable_protect_mem_regions(drhd->iommu);
2640                         continue;
2641                 }
2642                 iommu = drhd->iommu;
2643
2644                 iommu_flush_write_buffer(iommu);
2645
2646                 ret = dmar_set_interrupt(iommu);
2647                 if (ret)
2648                         goto error;
2649
2650                 iommu_set_root_entry(iommu);
2651
2652                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2653                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2654
2655                 ret = iommu_enable_translation(iommu);
2656                 if (ret)
2657                         goto error;
2658
2659                 iommu_disable_protect_mem_regions(iommu);
2660         }
2661
2662         return 0;
2663 error:
2664         for_each_drhd_unit(drhd) {
2665                 if (drhd->ignored)
2666                         continue;
2667                 iommu = drhd->iommu;
2668                 free_iommu(iommu);
2669         }
2670         kfree(g_iommus);
2671         return ret;
2672 }
2673
2674 /* This takes a number of _MM_ pages, not VTD pages */
2675 static struct iova *intel_alloc_iova(struct device *dev,
2676                                      struct dmar_domain *domain,
2677                                      unsigned long nrpages, uint64_t dma_mask)
2678 {
2679         struct pci_dev *pdev = to_pci_dev(dev);
2680         struct iova *iova = NULL;
2681
2682         /* Restrict dma_mask to the width that the iommu can handle */
2683         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2684
2685         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2686                 /*
2687                  * First try to allocate an io virtual address in
2688                  * DMA_BIT_MASK(32) and if that fails then try allocating
2689                  * from the higher range.
2690                  */
2691                 iova = alloc_iova(&domain->iovad, nrpages,
2692                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2693                 if (iova)
2694                         return iova;
2695         }
2696         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2697         if (unlikely(!iova)) {
2698                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2699                        nrpages, pci_name(pdev));
2700                 return NULL;
2701         }
2702
2703         return iova;
2704 }
2705
2706 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2707 {
2708         struct dmar_domain *domain;
2709         int ret;
2710
2711         domain = get_domain_for_dev(pdev,
2712                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2713         if (!domain) {
2714                 printk(KERN_ERR
2715                         "Allocating domain for %s failed", pci_name(pdev));
2716                 return NULL;
2717         }
2718
2719         /* make sure context mapping is ok */
2720         if (unlikely(!domain_context_mapped(pdev))) {
2721                 ret = domain_context_mapping(domain, pdev,
2722                                              CONTEXT_TT_MULTI_LEVEL);
2723                 if (ret) {
2724                         printk(KERN_ERR
2725                                 "Domain context map for %s failed",
2726                                 pci_name(pdev));
2727                         return NULL;
2728                 }
2729         }
2730
2731         return domain;
2732 }
2733
2734 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2735 {
2736         struct device_domain_info *info;
2737
2738         /* No lock here, assumes no domain exit in normal case */
2739         info = dev->dev.archdata.iommu;
2740         if (likely(info))
2741                 return info->domain;
2742
2743         return __get_valid_domain_for_dev(dev);
2744 }
2745
2746 static int iommu_dummy(struct pci_dev *pdev)
2747 {
2748         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2749 }
2750
2751 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2752 static int iommu_no_mapping(struct device *dev)
2753 {
2754         struct pci_dev *pdev;
2755         int found;
2756
2757         if (unlikely(dev->bus != &pci_bus_type))
2758                 return 1;
2759
2760         pdev = to_pci_dev(dev);
2761         if (iommu_dummy(pdev))
2762                 return 1;
2763
2764         if (!iommu_identity_mapping)
2765                 return 0;
2766
2767         found = identity_mapping(pdev);
2768         if (found) {
2769                 if (iommu_should_identity_map(pdev, 0))
2770                         return 1;
2771                 else {
2772                         /*
2773                          * 32 bit DMA device: remove it from si_domain and fall
2774                          * back to non-identity mapping.
2775                          */
2776                         domain_remove_one_dev_info(si_domain, pdev);
2777                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2778                                pci_name(pdev));
2779                         return 0;
2780                 }
2781         } else {
2782                 /*
2783                  * A 64 bit DMA device detached from a VM is put back into
2784                  * si_domain for identity mapping.
2785                  */
2786                 if (iommu_should_identity_map(pdev, 0)) {
2787                         int ret;
2788                         ret = domain_add_dev_info(si_domain, pdev,
2789                                                   hw_pass_through ?
2790                                                   CONTEXT_TT_PASS_THROUGH :
2791                                                   CONTEXT_TT_MULTI_LEVEL);
2792                         if (!ret) {
2793                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2794                                        pci_name(pdev));
2795                                 return 1;
2796                         }
2797                 }
2798         }
2799
2800         return 0;
2801 }
2802
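     /*
      * __intel_map_single() - map @size bytes at @paddr for DMA: look up
      * (or create) the device's domain, allocate an IOVA range below
      * @dma_mask, install PTEs with permissions derived from @dir, then
      * flush the IOTLB (caching mode) or the write buffer.  Returns the
      * resulting bus address, or 0 on failure; identity-mapped devices
      * simply get @paddr back.
      */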
2803 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2804                                      size_t size, int dir, u64 dma_mask)
2805 {
2806         struct pci_dev *pdev = to_pci_dev(hwdev);
2807         struct dmar_domain *domain;
2808         phys_addr_t start_paddr;
2809         struct iova *iova;
2810         int prot = 0;
2811         int ret;
2812         struct intel_iommu *iommu;
2813         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2814
2815         BUG_ON(dir == DMA_NONE);
2816
2817         if (iommu_no_mapping(hwdev))
2818                 return paddr;
2819
2820         domain = get_valid_domain_for_dev(pdev);
2821         if (!domain)
2822                 return 0;
2823
2824         iommu = domain_get_iommu(domain);
2825         size = aligned_nrpages(paddr, size);
2826
2827         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2828         if (!iova)
2829                 goto error;
2830
2831         /*
2832          * Check if DMAR supports zero-length reads on write only
2833          * mappings.
2834          */
2835         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2836                         !cap_zlr(iommu->cap))
2837                 prot |= DMA_PTE_READ;
2838         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2839                 prot |= DMA_PTE_WRITE;
2840         /*
2841          * The range paddr .. (paddr + size) might cover a partial page, so we
2842          * should map the whole page.  Note: if two parts of one page are
2843          * mapped separately, we might have two guest_addr mappings to the
2844          * same host paddr, but this is not a big problem.
2845          */
2846         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2847                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2848         if (ret)
2849                 goto error;
2850
2851         /* it's a non-present to present mapping. Only flush if caching mode */
2852         if (cap_caching_mode(iommu->cap))
2853                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2854         else
2855                 iommu_flush_write_buffer(iommu);
2856
2857         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2858         start_paddr += paddr & ~PAGE_MASK;
2859         return start_paddr;
2860
2861 error:
2862         if (iova)
2863                 __free_iova(&domain->iovad, iova);
2864         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2865                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2866         return 0;
2867 }
2868
2869 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2870                                  unsigned long offset, size_t size,
2871                                  enum dma_data_direction dir,
2872                                  struct dma_attrs *attrs)
2873 {
2874         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2875                                   dir, to_pci_dev(dev)->dma_mask);
2876 }
2877
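     /*
      * flush_unmaps() - drain the per-IOMMU deferred_flush tables: one
      * global IOTLB flush per IOMMU plus per-entry device-IOTLB flushes
      * (or per-entry PSI flushes when running in caching mode), then free
      * the deferred IOVAs.  Callers hold async_umap_flush_lock.
      */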
2878 static void flush_unmaps(void)
2879 {
2880         int i, j;
2881
2882         timer_on = 0;
2883
2884         /* just flush them all */
2885         for (i = 0; i < g_num_of_iommus; i++) {
2886                 struct intel_iommu *iommu = g_iommus[i];
2887                 if (!iommu)
2888                         continue;
2889
2890                 if (!deferred_flush[i].next)
2891                         continue;
2892
2893                 /* In caching mode, global flushes make emulation expensive */
2894                 if (!cap_caching_mode(iommu->cap))
2895                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2896                                          DMA_TLB_GLOBAL_FLUSH);
2897                 for (j = 0; j < deferred_flush[i].next; j++) {
2898                         unsigned long mask;
2899                         struct iova *iova = deferred_flush[i].iova[j];
2900                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2901
2902                         /* On real hardware multiple invalidations are expensive */
2903                         if (cap_caching_mode(iommu->cap))
2904                                 iommu_flush_iotlb_psi(iommu, domain->id,
2905                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2906                         else {
2907                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2908                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2909                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2910                         }
2911                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2912                 }
2913                 deferred_flush[i].next = 0;
2914         }
2915
2916         list_size = 0;
2917 }
2918
2919 static void flush_unmaps_timeout(unsigned long data)
2920 {
2921         unsigned long flags;
2922
2923         spin_lock_irqsave(&async_umap_flush_lock, flags);
2924         flush_unmaps();
2925         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2926 }
2927
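     /*
      * add_unmap() - queue @iova for deferred ("lazy") freeing on the IOMMU
      * backing @dom.  The queue is drained when HIGH_WATER_MARK entries
      * have accumulated or when the 10ms unmap_timer fires.
      */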
2928 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2929 {
2930         unsigned long flags;
2931         int next, iommu_id;
2932         struct intel_iommu *iommu;
2933
2934         spin_lock_irqsave(&async_umap_flush_lock, flags);
2935         if (list_size == HIGH_WATER_MARK)
2936                 flush_unmaps();
2937
2938         iommu = domain_get_iommu(dom);
2939         iommu_id = iommu->seq_id;
2940
2941         next = deferred_flush[iommu_id].next;
2942         deferred_flush[iommu_id].domain[next] = dom;
2943         deferred_flush[iommu_id].iova[next] = iova;
2944         deferred_flush[iommu_id].next++;
2945
2946         if (!timer_on) {
2947                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2948                 timer_on = 1;
2949         }
2950         list_size++;
2951         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2952 }
2953
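     /*
      * intel_unmap_page() - tear down the mapping at @dev_addr: find the
      * backing IOVA, clear the PTEs and free the page tables that covered
      * it, then either flush the IOTLB synchronously and free the IOVA
      * (intel_iommu_strict) or defer both via add_unmap().
      */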
2954 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2955                              size_t size, enum dma_data_direction dir,
2956                              struct dma_attrs *attrs)
2957 {
2958         struct pci_dev *pdev = to_pci_dev(dev);
2959         struct dmar_domain *domain;
2960         unsigned long start_pfn, last_pfn;
2961         struct iova *iova;
2962         struct intel_iommu *iommu;
2963
2964         if (iommu_no_mapping(dev))
2965                 return;
2966
2967         domain = find_domain(pdev);
2968         BUG_ON(!domain);
2969
2970         iommu = domain_get_iommu(domain);
2971
2972         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2973         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2974                       (unsigned long long)dev_addr))
2975                 return;
2976
2977         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2978         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2979
2980         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2981                  pci_name(pdev), start_pfn, last_pfn);
2982
2983         /*  clear the whole page */
2984         dma_pte_clear_range(domain, start_pfn, last_pfn);
2985
2986         /* free page tables */
2987         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2988
2989         if (intel_iommu_strict) {
2990                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2991                                       last_pfn - start_pfn + 1, 0);
2992                 /* free iova */
2993                 __free_iova(&domain->iovad, iova);
2994         } else {
2995                 add_unmap(domain, iova);
2996                 /*
2997                  * Queue up the release of the unmap to save the roughly 1/6th
2998                  * of the CPU time otherwise spent on the iotlb flush operation.
2999                  */
3000         }
3001 }
3002
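/*
 * dma_map_ops alloc callback: allocate zeroed pages and map them
 * DMA_BIDIRECTIONAL through __intel_map_single().  For untranslated
 * devices with a narrow coherent mask, fall back to GFP_DMA/GFP_DMA32.
 */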
3003 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3004                                   dma_addr_t *dma_handle, gfp_t flags,
3005                                   struct dma_attrs *attrs)
3006 {
3007         void *vaddr;
3008         int order;
3009
3010         size = PAGE_ALIGN(size);
3011         order = get_order(size);
3012
3013         if (!iommu_no_mapping(hwdev))
3014                 flags &= ~(GFP_DMA | GFP_DMA32);
3015         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3016                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3017                         flags |= GFP_DMA;
3018                 else
3019                         flags |= GFP_DMA32;
3020         }
3021
3022         vaddr = (void *)__get_free_pages(flags, order);
3023         if (!vaddr)
3024                 return NULL;
3025         memset(vaddr, 0, size);
3026
3027         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3028                                          DMA_BIDIRECTIONAL,
3029                                          hwdev->coherent_dma_mask);
3030         if (*dma_handle)
3031                 return vaddr;
3032         free_pages((unsigned long)vaddr, order);
3033         return NULL;
3034 }
3035
3036 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3037                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3038 {
3039         int order;
3040
3041         size = PAGE_ALIGN(size);
3042         order = get_order(size);
3043
3044         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3045         free_pages((unsigned long)vaddr, order);
3046 }
3047
3048 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3049                            int nelems, enum dma_data_direction dir,
3050                            struct dma_attrs *attrs)
3051 {
3052         struct pci_dev *pdev = to_pci_dev(hwdev);
3053         struct dmar_domain *domain;
3054         unsigned long start_pfn, last_pfn;
3055         struct iova *iova;
3056         struct intel_iommu *iommu;
3057
3058         if (iommu_no_mapping(hwdev))
3059                 return;
3060
3061         domain = find_domain(pdev);
3062         BUG_ON(!domain);
3063
3064         iommu = domain_get_iommu(domain);
3065
3066         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3067         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3068                       (unsigned long long)sglist[0].dma_address))
3069                 return;
3070
3071         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3072         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3073
3074         /*  clear the whole page */
3075         dma_pte_clear_range(domain, start_pfn, last_pfn);
3076
3077         /* free page tables */
3078         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3079
3080         if (intel_iommu_strict) {
3081                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3082                                       last_pfn - start_pfn + 1, 0);
3083                 /* free iova */
3084                 __free_iova(&domain->iovad, iova);
3085         } else {
3086                 add_unmap(domain, iova);
3087                 /*
3088                  * Queue up the release of the unmap to save the roughly 1/6th
3089                  * of the CPU time otherwise spent on the iotlb flush operation.
3090                  */
3091         }
3092 }
3093
3094 static int intel_nontranslate_map_sg(struct device *hwdev,
3095         struct scatterlist *sglist, int nelems, int dir)
3096 {
3097         int i;
3098         struct scatterlist *sg;
3099
3100         for_each_sg(sglist, sg, nelems, i) {
3101                 BUG_ON(!sg_page(sg));
3102                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3103                 sg->dma_length = sg->length;
3104         }
3105         return nelems;
3106 }
3107
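/*
 * dma_map_ops map_sg callback: allocate one IOVA range large enough for
 * the whole scatterlist, map it with protection bits derived from the DMA
 * direction, then flush (page-selective in caching mode, write buffer
 * otherwise).  Untranslated devices take the nontranslate path above.
 */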
3108 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3109                         enum dma_data_direction dir, struct dma_attrs *attrs)
3110 {
3111         int i;
3112         struct pci_dev *pdev = to_pci_dev(hwdev);
3113         struct dmar_domain *domain;
3114         size_t size = 0;
3115         int prot = 0;
3116         struct iova *iova = NULL;
3117         int ret;
3118         struct scatterlist *sg;
3119         unsigned long start_vpfn;
3120         struct intel_iommu *iommu;
3121
3122         BUG_ON(dir == DMA_NONE);
3123         if (iommu_no_mapping(hwdev))
3124                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3125
3126         domain = get_valid_domain_for_dev(pdev);
3127         if (!domain)
3128                 return 0;
3129
3130         iommu = domain_get_iommu(domain);
3131
3132         for_each_sg(sglist, sg, nelems, i)
3133                 size += aligned_nrpages(sg->offset, sg->length);
3134
3135         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3136                                 pdev->dma_mask);
3137         if (!iova) {
3138                 sglist->dma_length = 0;
3139                 return 0;
3140         }
3141
3142         /*
3143          * Check if DMAR supports zero-length reads on write-only
3144          * mappings.
3145          */
3146         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3147                         !cap_zlr(iommu->cap))
3148                 prot |= DMA_PTE_READ;
3149         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3150                 prot |= DMA_PTE_WRITE;
3151
3152         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3153
3154         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3155         if (unlikely(ret)) {
3156                 /*  clear the page */
3157                 dma_pte_clear_range(domain, start_vpfn,
3158                                     start_vpfn + size - 1);
3159                 /* free page tables */
3160                 dma_pte_free_pagetable(domain, start_vpfn,
3161                                        start_vpfn + size - 1);
3162                 /* free iova */
3163                 __free_iova(&domain->iovad, iova);
3164                 return 0;
3165         }
3166
3167         /* It's a non-present to present mapping. Only flush in caching mode. */
3168         if (cap_caching_mode(iommu->cap))
3169                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3170         else
3171                 iommu_flush_write_buffer(iommu);
3172
3173         return nelems;
3174 }
3175
3176 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3177 {
3178         return !dma_addr;
3179 }
3180
3181 struct dma_map_ops intel_dma_ops = {
3182         .alloc = intel_alloc_coherent,
3183         .free = intel_free_coherent,
3184         .map_sg = intel_map_sg,
3185         .unmap_sg = intel_unmap_sg,
3186         .map_page = intel_map_page,
3187         .unmap_page = intel_unmap_page,
3188         .mapping_error = intel_mapping_error,
3189 };
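
/*
 * Example (hypothetical driver code): these callbacks are not called
 * directly; they are reached through the generic DMA API once dma_ops
 * points at intel_dma_ops, e.g.:
 *
 *	buf = dma_alloc_coherent(&pdev->dev, size, &dma_handle, GFP_KERNEL);
 *	...
 *	dma_free_coherent(&pdev->dev, size, buf, dma_handle);
 *
 * which end up in intel_alloc_coherent()/intel_free_coherent() above.
 */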
3190
3191 static inline int iommu_domain_cache_init(void)
3192 {
3193         int ret = 0;
3194
3195         iommu_domain_cache = kmem_cache_create("iommu_domain",
3196                                          sizeof(struct dmar_domain),
3197                                          0,
3198                                          SLAB_HWCACHE_ALIGN,
3200                                          NULL);
3201         if (!iommu_domain_cache) {
3202                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3203                 ret = -ENOMEM;
3204         }
3205
3206         return ret;
3207 }
3208
3209 static inline int iommu_devinfo_cache_init(void)
3210 {
3211         int ret = 0;
3212
3213         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3214                                          sizeof(struct device_domain_info),
3215                                          0,
3216                                          SLAB_HWCACHE_ALIGN,
3217                                          NULL);
3218         if (!iommu_devinfo_cache) {
3219                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3220                 ret = -ENOMEM;
3221         }
3222
3223         return ret;
3224 }
3225
3226 static inline int iommu_iova_cache_init(void)
3227 {
3228         int ret = 0;
3229
3230         iommu_iova_cache = kmem_cache_create("iommu_iova",
3231                                          sizeof(struct iova),
3232                                          0,
3233                                          SLAB_HWCACHE_ALIGN,
3234                                          NULL);
3235         if (!iommu_iova_cache) {
3236                 printk(KERN_ERR "Couldn't create iova cache\n");
3237                 ret = -ENOMEM;
3238         }
3239
3240         return ret;
3241 }
3242
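/*
 * Create the iova, dmar_domain and device_domain_info slab caches,
 * unwinding any caches already created if a later allocation fails.
 */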
3243 static int __init iommu_init_mempool(void)
3244 {
3245         int ret;
3246         ret = iommu_iova_cache_init();
3247         if (ret)
3248                 return ret;
3249
3250         ret = iommu_domain_cache_init();
3251         if (ret)
3252                 goto domain_error;
3253
3254         ret = iommu_devinfo_cache_init();
3255         if (!ret)
3256                 return ret;
3257
3258         kmem_cache_destroy(iommu_domain_cache);
3259 domain_error:
3260         kmem_cache_destroy(iommu_iova_cache);
3261
3262         return -ENOMEM;
3263 }
3264
3265 static void __init iommu_exit_mempool(void)
3266 {
3267         kmem_cache_destroy(iommu_devinfo_cache);
3268         kmem_cache_destroy(iommu_domain_cache);
3269         kmem_cache_destroy(iommu_iova_cache);
3270
3271 }
3272
3273 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3274 {
3275         struct dmar_drhd_unit *drhd;
3276         u32 vtbar;
3277         int rc;
3278
3279         /* We know that this device on this chipset has its own IOMMU.
3280          * If we find it under a different IOMMU, then the BIOS is lying
3281          * to us. Hope that the IOMMU for this device is actually
3282          * disabled, and it needs no translation...
3283          */
3284         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3285         if (rc) {
3286                 /* "can't" happen */
3287                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3288                 return;
3289         }
3290         vtbar &= 0xffff0000;
3291
3292         /* we know that this iommu should be at offset 0xa000 from vtbar */
3293         drhd = dmar_find_matched_drhd_unit(pdev);
3294         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3295                             TAINT_FIRMWARE_WORKAROUND,
3296                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3297                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3298 }
3299 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3300
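/*
 * Mark DRHD units that can be bypassed: units whose device scope contains
 * no PCI devices, and graphics-only units when dmar_map_gfx is clear (in
 * which case the gfx devices get DUMMY_DEVICE_DOMAIN_INFO).
 */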
3301 static void __init init_no_remapping_devices(void)
3302 {
3303         struct dmar_drhd_unit *drhd;
3304
3305         for_each_drhd_unit(drhd) {
3306                 if (!drhd->include_all) {
3307                         int i;
3308                         for (i = 0; i < drhd->devices_cnt; i++)
3309                                 if (drhd->devices[i] != NULL)
3310                                         break;
3311                         /* ignore DMAR unit if no pci devices exist */
3312                         if (i == drhd->devices_cnt)
3313                                 drhd->ignored = 1;
3314                 }
3315         }
3316
3317         for_each_drhd_unit(drhd) {
3318                 int i;
3319                 if (drhd->ignored || drhd->include_all)
3320                         continue;
3321
3322                 for (i = 0; i < drhd->devices_cnt; i++)
3323                         if (drhd->devices[i] &&
3324                             !IS_GFX_DEVICE(drhd->devices[i]))
3325                                 break;
3326
3327                 if (i < drhd->devices_cnt)
3328                         continue;
3329
3330                 /* This IOMMU has *only* gfx devices. Either bypass it or
3331                    set the gfx_mapped flag, as appropriate */
3332                 if (dmar_map_gfx) {
3333                         intel_iommu_gfx_mapped = 1;
3334                 } else {
3335                         drhd->ignored = 1;
3336                         for (i = 0; i < drhd->devices_cnt; i++) {
3337                                 if (!drhd->devices[i])
3338                                         continue;
3339                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3340                         }
3341                 }
3342         }
3343 }
3344
3345 #ifdef CONFIG_SUSPEND
3346 static int init_iommu_hw(void)
3347 {
3348         struct dmar_drhd_unit *drhd;
3349         struct intel_iommu *iommu = NULL;
3350
3351         for_each_active_iommu(iommu, drhd)
3352                 if (iommu->qi)
3353                         dmar_reenable_qi(iommu);
3354
3355         for_each_iommu(iommu, drhd) {
3356                 if (drhd->ignored) {
3357                         /*
3358                          * we always have to disable PMRs or DMA may fail on
3359                          * this device
3360                          */
3361                         if (force_on)
3362                                 iommu_disable_protect_mem_regions(iommu);
3363                         continue;
3364                 }
3365
3366                 iommu_flush_write_buffer(iommu);
3367
3368                 iommu_set_root_entry(iommu);
3369
3370                 iommu->flush.flush_context(iommu, 0, 0, 0,
3371                                            DMA_CCMD_GLOBAL_INVL);
3372                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3373                                          DMA_TLB_GLOBAL_FLUSH);
3374                 if (iommu_enable_translation(iommu))
3375                         return 1;
3376                 iommu_disable_protect_mem_regions(iommu);
3377         }
3378
3379         return 0;
3380 }
3381
3382 static void iommu_flush_all(void)
3383 {
3384         struct dmar_drhd_unit *drhd;
3385         struct intel_iommu *iommu;
3386
3387         for_each_active_iommu(iommu, drhd) {
3388                 iommu->flush.flush_context(iommu, 0, 0, 0,
3389                                            DMA_CCMD_GLOBAL_INVL);
3390                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3391                                          DMA_TLB_GLOBAL_FLUSH);
3392         }
3393 }
3394
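/*
 * Suspend handler: after a global context/IOTLB flush, disable translation
 * on every active IOMMU and save its fault-event registers so that
 * iommu_resume() can restore them.
 */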
3395 static int iommu_suspend(void)
3396 {
3397         struct dmar_drhd_unit *drhd;
3398         struct intel_iommu *iommu = NULL;
3399         unsigned long flag;
3400
3401         for_each_active_iommu(iommu, drhd) {
3402                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3403                                                  GFP_ATOMIC);
3404                 if (!iommu->iommu_state)
3405                         goto nomem;
3406         }
3407
3408         iommu_flush_all();
3409
3410         for_each_active_iommu(iommu, drhd) {
3411                 iommu_disable_translation(iommu);
3412
3413                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3414
3415                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3416                         readl(iommu->reg + DMAR_FECTL_REG);
3417                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3418                         readl(iommu->reg + DMAR_FEDATA_REG);
3419                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3420                         readl(iommu->reg + DMAR_FEADDR_REG);
3421                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3422                         readl(iommu->reg + DMAR_FEUADDR_REG);
3423
3424                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3425         }
3426         return 0;
3427
3428 nomem:
3429         for_each_active_iommu(iommu, drhd)
3430                 kfree(iommu->iommu_state);
3431
3432         return -ENOMEM;
3433 }
3434
3435 static void iommu_resume(void)
3436 {
3437         struct dmar_drhd_unit *drhd;
3438         struct intel_iommu *iommu = NULL;
3439         unsigned long flag;
3440
3441         if (init_iommu_hw()) {
3442                 if (force_on)
3443                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3444                 else
3445                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3446                 return;
3447         }
3448
3449         for_each_active_iommu(iommu, drhd) {
3450
3451                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3452
3453                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3454                         iommu->reg + DMAR_FECTL_REG);
3455                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3456                         iommu->reg + DMAR_FEDATA_REG);
3457                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3458                         iommu->reg + DMAR_FEADDR_REG);
3459                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3460                         iommu->reg + DMAR_FEUADDR_REG);
3461
3462                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3463         }
3464
3465         for_each_active_iommu(iommu, drhd)
3466                 kfree(iommu->iommu_state);
3467 }
3468
3469 static struct syscore_ops iommu_syscore_ops = {
3470         .resume         = iommu_resume,
3471         .suspend        = iommu_suspend,
3472 };
3473
3474 static void __init init_iommu_pm_ops(void)
3475 {
3476         register_syscore_ops(&iommu_syscore_ops);
3477 }
3478
3479 #else
3480 static inline void init_iommu_pm_ops(void) {}
3481 #endif  /* CONFIG_SUSPEND */
3482
3483 LIST_HEAD(dmar_rmrr_units);
3484
3485 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3486 {
3487         list_add(&rmrr->list, &dmar_rmrr_units);
3488 }
3489
3490
3491 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3492 {
3493         struct acpi_dmar_reserved_memory *rmrr;
3494         struct dmar_rmrr_unit *rmrru;
3495
3496         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3497         if (!rmrru)
3498                 return -ENOMEM;
3499
3500         rmrru->hdr = header;
3501         rmrr = (struct acpi_dmar_reserved_memory *)header;
3502         rmrru->base_address = rmrr->base_address;
3503         rmrru->end_address = rmrr->end_address;
3504
3505         dmar_register_rmrr_unit(rmrru);
3506         return 0;
3507 }
3508
3509 static int __init
3510 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3511 {
3512         struct acpi_dmar_reserved_memory *rmrr;
3513         int ret;
3514
3515         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3516         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3517                 ((void *)rmrr) + rmrr->header.length,
3518                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3519
3520         if (ret || (rmrru->devices_cnt == 0)) {
3521                 list_del(&rmrru->list);
3522                 kfree(rmrru);
3523         }
3524         return ret;
3525 }
3526
3527 static LIST_HEAD(dmar_atsr_units);
3528
3529 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3530 {
3531         struct acpi_dmar_atsr *atsr;
3532         struct dmar_atsr_unit *atsru;
3533
3534         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3535         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3536         if (!atsru)
3537                 return -ENOMEM;
3538
3539         atsru->hdr = hdr;
3540         atsru->include_all = atsr->flags & 0x1;
3541
3542         list_add(&atsru->list, &dmar_atsr_units);
3543
3544         return 0;
3545 }
3546
3547 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3548 {
3549         int rc;
3550         struct acpi_dmar_atsr *atsr;
3551
3552         if (atsru->include_all)
3553                 return 0;
3554
3555         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3556         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3557                                 (void *)atsr + atsr->header.length,
3558                                 &atsru->devices_cnt, &atsru->devices,
3559                                 atsr->segment);
3560         if (rc || !atsru->devices_cnt) {
3561                 list_del(&atsru->list);
3562                 kfree(atsru);
3563         }
3564
3565         return rc;
3566 }
3567
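/*
 * Return 1 if ATS may be used for @dev: either the root port above the
 * device is listed in a matching ATSR's device scope, or the ATSR for the
 * device's segment is marked include_all.
 */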
3568 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3569 {
3570         int i;
3571         struct pci_bus *bus;
3572         struct acpi_dmar_atsr *atsr;
3573         struct dmar_atsr_unit *atsru;
3574
3575         dev = pci_physfn(dev);
3576
3577         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3578                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3579                 if (atsr->segment == pci_domain_nr(dev->bus))
3580                         goto found;
3581         }
3582
3583         return 0;
3584
3585 found:
3586         for (bus = dev->bus; bus; bus = bus->parent) {
3587                 struct pci_dev *bridge = bus->self;
3588
3589                 if (!bridge || !pci_is_pcie(bridge) ||
3590                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3591                         return 0;
3592
3593                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3594                         for (i = 0; i < atsru->devices_cnt; i++)
3595                                 if (atsru->devices[i] == bridge)
3596                                         return 1;
3597                         break;
3598                 }
3599         }
3600
3601         if (atsru->include_all)
3602                 return 1;
3603
3604         return 0;
3605 }
3606
3607 int __init dmar_parse_rmrr_atsr_dev(void)
3608 {
3609         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3610         struct dmar_atsr_unit *atsr, *atsr_n;
3611         int ret = 0;
3612
3613         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3614                 ret = rmrr_parse_dev(rmrr);
3615                 if (ret)
3616                         return ret;
3617         }
3618
3619         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3620                 ret = atsr_parse_dev(atsr);
3621                 if (ret)
3622                         return ret;
3623         }
3624
3625         return ret;
3626 }
3627
3628 /*
3629  * Here we only respond to a device being unbound from its driver.
3630  *
3631  * A newly added device is not attached to its DMAR domain here yet; that
3632  * will happen when the device is mapped to an iova.
3633  */
3634 static int device_notifier(struct notifier_block *nb,
3635                                   unsigned long action, void *data)
3636 {
3637         struct device *dev = data;
3638         struct pci_dev *pdev = to_pci_dev(dev);
3639         struct dmar_domain *domain;
3640
3641         if (iommu_no_mapping(dev))
3642                 return 0;
3643
3644         domain = find_domain(pdev);
3645         if (!domain)
3646                 return 0;
3647
3648         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3649                 domain_remove_one_dev_info(domain, pdev);
3650
3651                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3652                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3653                     list_empty(&domain->devices))
3654                         domain_exit(domain);
3655         }
3656
3657         return 0;
3658 }
3659
3660 static struct notifier_block device_nb = {
3661         .notifier_call = device_notifier,
3662 };
3663
3664 int __init intel_iommu_init(void)
3665 {
3666         int ret = 0;
3667         struct dmar_drhd_unit *drhd;
3668
3669         /* VT-d is required for a TXT/tboot launch, so enforce that */
3670         force_on = tboot_force_iommu();
3671
3672         if (dmar_table_init()) {
3673                 if (force_on)
3674                         panic("tboot: Failed to initialize DMAR table\n");
3675                 return  -ENODEV;
3676         }
3677
3678         /*
3679          * Disable translation if already enabled prior to OS handover.
3680          */
3681         for_each_drhd_unit(drhd) {
3682                 struct intel_iommu *iommu;
3683
3684                 if (drhd->ignored)
3685                         continue;
3686
3687                 iommu = drhd->iommu;
3688                 if (iommu->gcmd & DMA_GCMD_TE)
3689                         iommu_disable_translation(iommu);
3690         }
3691
3692         if (dmar_dev_scope_init() < 0) {
3693                 if (force_on)
3694                         panic("tboot: Failed to initialize DMAR device scope\n");
3695                 return  -ENODEV;
3696         }
3697
3698         if (no_iommu || dmar_disabled)
3699                 return -ENODEV;
3700
3701         if (iommu_init_mempool()) {
3702                 if (force_on)
3703                         panic("tboot: Failed to initialize iommu memory\n");
3704                 return  -ENODEV;
3705         }
3706
3707         if (list_empty(&dmar_rmrr_units))
3708                 printk(KERN_INFO "DMAR: No RMRR found\n");
3709
3710         if (list_empty(&dmar_atsr_units))
3711                 printk(KERN_INFO "DMAR: No ATSR found\n");
3712
3713         if (dmar_init_reserved_ranges()) {
3714                 if (force_on)
3715                         panic("tboot: Failed to reserve iommu ranges\n");
3716                 return  -ENODEV;
3717         }
3718
3719         init_no_remapping_devices();
3720
3721         ret = init_dmars();
3722         if (ret) {
3723                 if (force_on)
3724                         panic("tboot: Failed to initialize DMARs\n");
3725                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3726                 put_iova_domain(&reserved_iova_list);
3727                 iommu_exit_mempool();
3728                 return ret;
3729         }
3730         printk(KERN_INFO
3731         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3732
3733         init_timer(&unmap_timer);
3734 #ifdef CONFIG_SWIOTLB
3735         swiotlb = 0;
3736 #endif
3737         dma_ops = &intel_dma_ops;
3738
3739         init_iommu_pm_ops();
3740
3741         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3742
3743         bus_register_notifier(&pci_bus_type, &device_nb);
3744
3745         intel_iommu_enabled = 1;
3746
3747         return 0;
3748 }
3749
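/*
 * If @pdev sits behind a PCIe-to-PCI bridge, also tear down the context
 * entries that were set up for the intermediate bridges and for the
 * bridge's secondary bus when the device was attached.
 */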
3750 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3751                                            struct pci_dev *pdev)
3752 {
3753         struct pci_dev *tmp, *parent;
3754
3755         if (!iommu || !pdev)
3756                 return;
3757
3758         /* dependent device detach */
3759         tmp = pci_find_upstream_pcie_bridge(pdev);
3760         /* Secondary interface's bus number and devfn 0 */
3761         if (tmp) {
3762                 parent = pdev->bus->self;
3763                 while (parent != tmp) {
3764                         iommu_detach_dev(iommu, parent->bus->number,
3765                                          parent->devfn);
3766                         parent = parent->bus->self;
3767                 }
3768                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3769                         iommu_detach_dev(iommu,
3770                                 tmp->subordinate->number, 0);
3771                 else /* this is a legacy PCI bridge */
3772                         iommu_detach_dev(iommu, tmp->bus->number,
3773                                          tmp->devfn);
3774         }
3775 }
3776
3777 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3778                                           struct pci_dev *pdev)
3779 {
3780         struct device_domain_info *info;
3781         struct intel_iommu *iommu;
3782         unsigned long flags;
3783         int found = 0;
3784         struct list_head *entry, *tmp;
3785
3786         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3787                                 pdev->devfn);
3788         if (!iommu)
3789                 return;
3790
3791         spin_lock_irqsave(&device_domain_lock, flags);
3792         list_for_each_safe(entry, tmp, &domain->devices) {
3793                 info = list_entry(entry, struct device_domain_info, link);
3794                 if (info->segment == pci_domain_nr(pdev->bus) &&
3795                     info->bus == pdev->bus->number &&
3796                     info->devfn == pdev->devfn) {
3797                         unlink_domain_info(info);
3798                         spin_unlock_irqrestore(&device_domain_lock, flags);
3799
3800                         iommu_disable_dev_iotlb(info);
3801                         iommu_detach_dev(iommu, info->bus, info->devfn);
3802                         iommu_detach_dependent_devices(iommu, pdev);
3803                         free_devinfo_mem(info);
3804
3805                         spin_lock_irqsave(&device_domain_lock, flags);
3806
3807                         if (found)
3808                                 break;
3809                         else
3810                                 continue;
3811                 }
3812
3813                 /* If there are no other devices under the same iommu
3814                  * owned by this domain, clear this iommu in iommu_bmp and
3815                  * update the iommu count and coherency.
3816                  */
3817                 if (iommu == device_to_iommu(info->segment, info->bus,
3818                                             info->devfn))
3819                         found = 1;
3820         }
3821
3822         spin_unlock_irqrestore(&device_domain_lock, flags);
3823
3824         if (found == 0) {
3825                 unsigned long tmp_flags;
3826                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3827                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3828                 domain->iommu_count--;
3829                 domain_update_iommu_cap(domain);
3830                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3831
3832                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3833                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3834                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3835                         clear_bit(domain->id, iommu->domain_ids);
3836                         iommu->domains[domain->id] = NULL;
3837                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3838                 }
3839         }
3840 }
3841
3842 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3843 {
3844         struct device_domain_info *info;
3845         struct intel_iommu *iommu;
3846         unsigned long flags1, flags2;
3847
3848         spin_lock_irqsave(&device_domain_lock, flags1);
3849         while (!list_empty(&domain->devices)) {
3850                 info = list_entry(domain->devices.next,
3851                         struct device_domain_info, link);
3852                 unlink_domain_info(info);
3853                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3854
3855                 iommu_disable_dev_iotlb(info);
3856                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3857                 iommu_detach_dev(iommu, info->bus, info->devfn);
3858                 iommu_detach_dependent_devices(iommu, info->dev);
3859
3860                 /* clear this iommu in iommu_bmp, update iommu count
3861                  * and capabilities
3862                  */
3863                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3864                 if (test_and_clear_bit(iommu->seq_id,
3865                                        domain->iommu_bmp)) {
3866                         domain->iommu_count--;
3867                         domain_update_iommu_cap(domain);
3868                 }
3869                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3870
3871                 free_devinfo_mem(info);
3872                 spin_lock_irqsave(&device_domain_lock, flags1);
3873         }
3874         spin_unlock_irqrestore(&device_domain_lock, flags1);
3875 }
3876
3877 /* domain id for virtual machine; it won't be set in context */
3878 static unsigned long vm_domid;
3879
3880 static struct dmar_domain *iommu_alloc_vm_domain(void)
3881 {
3882         struct dmar_domain *domain;
3883
3884         domain = alloc_domain_mem();
3885         if (!domain)
3886                 return NULL;
3887
3888         domain->id = vm_domid++;
3889         domain->nid = -1;
3890         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3891         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3892
3893         return domain;
3894 }
3895
3896 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3897 {
3898         int adjust_width;
3899
3900         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3901         spin_lock_init(&domain->iommu_lock);
3902
3903         domain_reserve_special_ranges(domain);
3904
3905         /* calculate AGAW */
3906         domain->gaw = guest_width;
3907         adjust_width = guestwidth_to_adjustwidth(guest_width);
3908         domain->agaw = width_to_agaw(adjust_width);
3909
3910         INIT_LIST_HEAD(&domain->devices);
3911
3912         domain->iommu_count = 0;
3913         domain->iommu_coherency = 0;
3914         domain->iommu_snooping = 0;
3915         domain->iommu_superpage = 0;
3916         domain->max_addr = 0;
3917         domain->nid = -1;
3918
3919         /* always allocate the top pgd */
3920         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3921         if (!domain->pgd)
3922                 return -ENOMEM;
3923         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3924         return 0;
3925 }
3926
3927 static void iommu_free_vm_domain(struct dmar_domain *domain)
3928 {
3929         unsigned long flags;
3930         struct dmar_drhd_unit *drhd;
3931         struct intel_iommu *iommu;
3932         unsigned long i;
3933         unsigned long ndomains;
3934
3935         for_each_drhd_unit(drhd) {
3936                 if (drhd->ignored)
3937                         continue;
3938                 iommu = drhd->iommu;
3939
3940                 ndomains = cap_ndoms(iommu->cap);
3941                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3942                         if (iommu->domains[i] == domain) {
3943                                 spin_lock_irqsave(&iommu->lock, flags);
3944                                 clear_bit(i, iommu->domain_ids);
3945                                 iommu->domains[i] = NULL;
3946                                 spin_unlock_irqrestore(&iommu->lock, flags);
3947                                 break;
3948                         }
3949                 }
3950         }
3951 }
3952
3953 static void vm_domain_exit(struct dmar_domain *domain)
3954 {
3955         /* Domain 0 is reserved, so don't process it */
3956         if (!domain)
3957                 return;
3958
3959         vm_domain_remove_all_dev_info(domain);
3960         /* destroy iovas */
3961         put_iova_domain(&domain->iovad);
3962
3963         /* clear ptes */
3964         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3965
3966         /* free page tables */
3967         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3968
3969         iommu_free_vm_domain(domain);
3970         free_domain_mem(domain);
3971 }
3972
3973 static int intel_iommu_domain_init(struct iommu_domain *domain)
3974 {
3975         struct dmar_domain *dmar_domain;
3976
3977         dmar_domain = iommu_alloc_vm_domain();
3978         if (!dmar_domain) {
3979                 printk(KERN_ERR
3980                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3981                 return -ENOMEM;
3982         }
3983         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3984                 printk(KERN_ERR
3985                         "intel_iommu_domain_init() failed\n");
3986                 vm_domain_exit(dmar_domain);
3987                 return -ENOMEM;
3988         }
3989         domain_update_iommu_cap(dmar_domain);
3990         domain->priv = dmar_domain;
3991
3992         domain->geometry.aperture_start = 0;
3993         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3994         domain->geometry.force_aperture = true;
3995
3996         return 0;
3997 }
3998
3999 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4000 {
4001         struct dmar_domain *dmar_domain = domain->priv;
4002
4003         domain->priv = NULL;
4004         vm_domain_exit(dmar_domain);
4005 }
4006
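/*
 * IOMMU-API attach callback: detach the device from any domain it is
 * already mapped into, verify the IOMMU's address width can cover the
 * domain's current max_addr, drop surplus page-table levels if the
 * domain's AGAW exceeds the IOMMU's, then add the device to the domain.
 */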
4007 static int intel_iommu_attach_device(struct iommu_domain *domain,
4008                                      struct device *dev)
4009 {
4010         struct dmar_domain *dmar_domain = domain->priv;
4011         struct pci_dev *pdev = to_pci_dev(dev);
4012         struct intel_iommu *iommu;
4013         int addr_width;
4014
4015         /* normally pdev is not mapped */
4016         if (unlikely(domain_context_mapped(pdev))) {
4017                 struct dmar_domain *old_domain;
4018
4019                 old_domain = find_domain(pdev);
4020                 if (old_domain) {
4021                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4022                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4023                                 domain_remove_one_dev_info(old_domain, pdev);
4024                         else
4025                                 domain_remove_dev_info(old_domain);
4026                 }
4027         }
4028
4029         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4030                                 pdev->devfn);
4031         if (!iommu)
4032                 return -ENODEV;
4033
4034         /* check if this iommu agaw is sufficient for max mapped address */
4035         addr_width = agaw_to_width(iommu->agaw);
4036         if (addr_width > cap_mgaw(iommu->cap))
4037                 addr_width = cap_mgaw(iommu->cap);
4038
4039         if (dmar_domain->max_addr > (1LL << addr_width)) {
4040                 printk(KERN_ERR "%s: iommu width (%d) is not "
4041                        "sufficient for the mapped address (%llx)\n",
4042                        __func__, addr_width, dmar_domain->max_addr);
4043                 return -EFAULT;
4044         }
4045         dmar_domain->gaw = addr_width;
4046
4047         /*
4048          * Knock out extra levels of page tables if necessary
4049          */
4050         while (iommu->agaw < dmar_domain->agaw) {
4051                 struct dma_pte *pte;
4052
4053                 pte = dmar_domain->pgd;
4054                 if (dma_pte_present(pte)) {
4055                         dmar_domain->pgd = (struct dma_pte *)
4056                                 phys_to_virt(dma_pte_addr(pte));
4057                         free_pgtable_page(pte);
4058                 }
4059                 dmar_domain->agaw--;
4060         }
4061
4062         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4063 }
4064
4065 static void intel_iommu_detach_device(struct iommu_domain *domain,
4066                                       struct device *dev)
4067 {
4068         struct dmar_domain *dmar_domain = domain->priv;
4069         struct pci_dev *pdev = to_pci_dev(dev);
4070
4071         domain_remove_one_dev_info(dmar_domain, pdev);
4072 }
4073
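/*
 * IOMMU-API map callback: translate IOMMU_READ/WRITE/CACHE into
 * DMA_PTE_* bits, grow the domain's max_addr (checking it still fits the
 * guest address width), and install the pfn mapping.
 */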
4074 static int intel_iommu_map(struct iommu_domain *domain,
4075                            unsigned long iova, phys_addr_t hpa,
4076                            size_t size, int iommu_prot)
4077 {
4078         struct dmar_domain *dmar_domain = domain->priv;
4079         u64 max_addr;
4080         int prot = 0;
4081         int ret;
4082
4083         if (iommu_prot & IOMMU_READ)
4084                 prot |= DMA_PTE_READ;
4085         if (iommu_prot & IOMMU_WRITE)
4086                 prot |= DMA_PTE_WRITE;
4087         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4088                 prot |= DMA_PTE_SNP;
4089
4090         max_addr = iova + size;
4091         if (dmar_domain->max_addr < max_addr) {
4092                 u64 end;
4093
4094                 /* check if minimum agaw is sufficient for mapped address */
4095                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4096                 if (end < max_addr) {
4097                         printk(KERN_ERR "%s: iommu width (%d) is not "
4098                                "sufficient for the mapped address (%llx)\n",
4099                                __func__, dmar_domain->gaw, max_addr);
4100                         return -EFAULT;
4101                 }
4102                 dmar_domain->max_addr = max_addr;
4103         }
4104         /* Round up size to next multiple of PAGE_SIZE, if it and
4105            the low bits of hpa would take us onto the next page */
4106         size = aligned_nrpages(hpa, size);
4107         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4108                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4109         return ret;
4110 }
4111
4112 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4113                              unsigned long iova, size_t size)
4114 {
4115         struct dmar_domain *dmar_domain = domain->priv;
4116         int order;
4117
4118         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4119                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4120
4121         if (dmar_domain->max_addr == iova + size)
4122                 dmar_domain->max_addr = iova;
4123
4124         return PAGE_SIZE << order;
4125 }
4126
4127 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4128                                             dma_addr_t iova)
4129 {
4130         struct dmar_domain *dmar_domain = domain->priv;
4131         struct dma_pte *pte;
4132         u64 phys = 0;
4133
4134         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4135         if (pte)
4136                 phys = dma_pte_addr(pte);
4137
4138         return phys;
4139 }
4140
4141 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4142                                       unsigned long cap)
4143 {
4144         struct dmar_domain *dmar_domain = domain->priv;
4145
4146         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4147                 return dmar_domain->iommu_snooping;
4148         if (cap == IOMMU_CAP_INTR_REMAP)
4149                 return irq_remapping_enabled;
4150
4151         return 0;
4152 }
4153
4154 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4155
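/*
 * Work out which upstream device actually issues DMA on behalf of @dev
 * (accounting for legacy bridges, DMA-source quirks, multifunction
 * devices and ACS along the path to the root bus) and put @dev in that
 * device's IOMMU group, allocating a new group if needed.
 */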
4156 static int intel_iommu_add_device(struct device *dev)
4157 {
4158         struct pci_dev *pdev = to_pci_dev(dev);
4159         struct pci_dev *bridge, *dma_pdev = NULL;
4160         struct iommu_group *group;
4161         int ret;
4162
4163         if (!device_to_iommu(pci_domain_nr(pdev->bus),
4164                              pdev->bus->number, pdev->devfn))
4165                 return -ENODEV;
4166
4167         bridge = pci_find_upstream_pcie_bridge(pdev);
4168         if (bridge) {
4169                 if (pci_is_pcie(bridge))
4170                         dma_pdev = pci_get_domain_bus_and_slot(
4171                                                 pci_domain_nr(pdev->bus),
4172                                                 bridge->subordinate->number, 0);
4173                 if (!dma_pdev)
4174                         dma_pdev = pci_dev_get(bridge);
4175         } else
4176                 dma_pdev = pci_dev_get(pdev);
4177
4178         /* Account for quirked devices */
4179         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4180
4181         /*
4182          * If it's a multifunction device that does not support our
4183          * required ACS flags, add to the same group as lowest numbered
4184          * function that also does not suport the required ACS flags.
4185          * function that also does not support the required ACS flags.
4186         if (dma_pdev->multifunction &&
4187             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4188                 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4189
4190                 for (i = 0; i < 8; i++) {
4191                         struct pci_dev *tmp;
4192
4193                         tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4194                         if (!tmp)
4195                                 continue;
4196
4197                         if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4198                                 swap_pci_ref(&dma_pdev, tmp);
4199                                 break;
4200                         }
4201                         pci_dev_put(tmp);
4202                 }
4203         }
4204
4205         /*
4206          * Devices on the root bus go through the iommu.  If that's not us,
4207          * find the next upstream device and test ACS up to the root bus.
4208          * Finding the next device may require skipping virtual buses.
4209          */
4210         while (!pci_is_root_bus(dma_pdev->bus)) {
4211                 struct pci_bus *bus = dma_pdev->bus;
4212
4213                 while (!bus->self) {
4214                         if (!pci_is_root_bus(bus))
4215                                 bus = bus->parent;
4216                         else
4217                                 goto root_bus;
4218                 }
4219
4220                 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4221                         break;
4222
4223                 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4224         }
4225
4226 root_bus:
4227         group = iommu_group_get(&dma_pdev->dev);
4228         pci_dev_put(dma_pdev);
4229         if (!group) {
4230                 group = iommu_group_alloc();
4231                 if (IS_ERR(group))
4232                         return PTR_ERR(group);
4233         }
4234
4235         ret = iommu_group_add_device(group, dev);
4236
4237         iommu_group_put(group);
4238         return ret;
4239 }
4240
4241 static void intel_iommu_remove_device(struct device *dev)
4242 {
4243         iommu_group_remove_device(dev);
4244 }
4245
4246 static struct iommu_ops intel_iommu_ops = {
4247         .domain_init    = intel_iommu_domain_init,
4248         .domain_destroy = intel_iommu_domain_destroy,
4249         .attach_dev     = intel_iommu_attach_device,
4250         .detach_dev     = intel_iommu_detach_device,
4251         .map            = intel_iommu_map,
4252         .unmap          = intel_iommu_unmap,
4253         .iova_to_phys   = intel_iommu_iova_to_phys,
4254         .domain_has_cap = intel_iommu_domain_has_cap,
4255         .add_device     = intel_iommu_add_device,
4256         .remove_device  = intel_iommu_remove_device,
4257         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4258 };
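
/*
 * Example (hypothetical caller): these callbacks back the generic IOMMU
 * API, e.g.:
 *
 *	domain = iommu_domain_alloc(&pci_bus_type);
 *	iommu_attach_device(domain, &pdev->dev);
 *	iommu_map(domain, iova, phys, size, IOMMU_READ | IOMMU_WRITE);
 *
 * which reach intel_iommu_domain_init(), intel_iommu_attach_device() and
 * intel_iommu_map() here.
 */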
4259
4260 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4261 {
4262         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4263         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4264         dmar_map_gfx = 0;
4265 }
4266
4267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4271 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4272 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4273 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4274
4275 static void quirk_iommu_rwbf(struct pci_dev *dev)
4276 {
4277         /*
4278          * Mobile 4 Series Chipset neglects to set RWBF capability,
4279          * but needs it. Same seems to hold for the desktop versions.
4280          */
4281         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4282         rwbf_quirk = 1;
4283 }
4284
4285 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4286 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4287 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4288 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4289 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4290 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4291 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4292
4293 #define GGC 0x52
4294 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4295 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4296 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4297 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4298 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4299 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4300 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4301 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4302
4303 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4304 {
4305         unsigned short ggc;
4306
4307         if (pci_read_config_word(dev, GGC, &ggc))
4308                 return;
4309
4310         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4311                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4312                 dmar_map_gfx = 0;
4313         } else if (dmar_map_gfx) {
4314                 /* we have to ensure the gfx device is idle before we flush */
4315                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4316                 intel_iommu_strict = 1;
4317         }
4318 }
4319 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4320 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4321 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4322 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4323
4324 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4325    ISOCH DMAR unit for the Azalia sound device, but not give it any
4326    TLB entries, which causes it to deadlock. Check for that.  We do
4327    this in a function called from init_dmars(), instead of in a PCI
4328    quirk, because we don't want to print the obnoxious "BIOS broken"
4329    message if VT-d is actually disabled.
4330 */
4331 static void __init check_tylersburg_isoch(void)
4332 {
4333         struct pci_dev *pdev;
4334         uint32_t vtisochctrl;
4335
4336         /* If there's no Azalia in the system anyway, forget it. */
4337         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4338         if (!pdev)
4339                 return;
4340         pci_dev_put(pdev);
4341
4342         /* System Management Registers. Might be hidden, in which case
4343            we can't do the sanity check. But that's OK, because the
4344            known-broken BIOSes _don't_ actually hide it, so far. */
4345         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4346         if (!pdev)
4347                 return;
4348
4349         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4350                 pci_dev_put(pdev);
4351                 return;
4352         }
4353
4354         pci_dev_put(pdev);
4355
4356         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4357         if (vtisochctrl & 1)
4358                 return;
4359
4360         /* Drop all bits other than the number of TLB entries */
4361         vtisochctrl &= 0x1c;
4362
4363         /* If we have the recommended number of TLB entries (16), fine. */
4364         if (vtisochctrl == 0x10)
4365                 return;
4366
4367         /* Zero TLB entries? You get to ride the short bus to school. */
4368         if (!vtisochctrl) {
4369                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4370                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4371                      dmi_get_system_info(DMI_BIOS_VENDOR),
4372                      dmi_get_system_info(DMI_BIOS_VERSION),
4373                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4374                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4375                 return;
4376         }
4377
4378         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4379                vtisochctrl);
4380 }