drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 /*
82  * This bitmap is used to advertise the page sizes our hardware supports
83  * to the IOMMU core, which will then use this information to split
84  * physically contiguous memory regions it is mapping into page sizes
85  * that we support.
86  *
87  * Traditionally the IOMMU core just handed us the mappings directly,
88  * after making sure the size is a power-of-two multiple of the 4KiB page size and that the
89  * mapping has natural alignment.
90  *
91  * To retain this behavior, we currently advertise that we support
92  * all page sizes that are a power-of-two multiple of 4KiB.
93  *
94  * If at some point we'd like to utilize the IOMMU core's new behavior,
95  * we could change this to advertise the real page sizes we support.
96  */
97 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
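/*
 * Illustrative note (not part of the original source): ~0xFFFUL leaves every
 * bit from 12 upward set, so the bitmap advertises every power-of-two size of
 * at least 4KiB (bit 12 = 4KiB, bit 13 = 8KiB, bit 21 = 2MiB, ...).  The IOMMU
 * core therefore keeps handing us naturally aligned power-of-two chunks, just
 * as the comment above describes.
 */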
98
99 static inline int agaw_to_level(int agaw)
100 {
101         return agaw + 2;
102 }
103
104 static inline int agaw_to_width(int agaw)
105 {
106         return 30 + agaw * LEVEL_STRIDE;
107 }
108
109 static inline int width_to_agaw(int width)
110 {
111         return (width - 30) / LEVEL_STRIDE;
112 }
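/*
 * Worked example for the three helpers above (the numbers follow directly
 * from the formulas):
 *
 *   agaw 1: agaw_to_width(1) == 39, agaw_to_level(1) == 3  (3-level table)
 *   agaw 2: agaw_to_width(2) == 48, agaw_to_level(2) == 4  (4-level table)
 *   width_to_agaw(48) == (48 - 30) / 9 == 2
 */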
113
114 static inline unsigned int level_to_offset_bits(int level)
115 {
116         return (level - 1) * LEVEL_STRIDE;
117 }
118
119 static inline int pfn_level_offset(unsigned long pfn, int level)
120 {
121         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
122 }
123
124 static inline unsigned long level_mask(int level)
125 {
126         return -1UL << level_to_offset_bits(level);
127 }
128
129 static inline unsigned long level_size(int level)
130 {
131         return 1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long align_to_level(unsigned long pfn, int level)
135 {
136         return (pfn + level_size(level) - 1) & level_mask(level);
137 }
138
139 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 {
141         return  1 << ((lvl - 1) * LEVEL_STRIDE);
142 }
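/*
 * Worked example for the level helpers above: level 1 is the 4KiB leaf
 * (level_to_offset_bits(1) == 0, lvl_to_nr_pages(1) == 1), a level 2 entry
 * spans lvl_to_nr_pages(2) == 512 pages == 2MiB, and a level 3 entry spans
 * 512 * 512 pages == 1GiB.  pfn_level_offset() extracts the 9-bit table index
 * for the given level, and align_to_level() rounds a pfn up to the next
 * boundary of that level.
 */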
143
144 /* VT-d pages must never be larger than MM pages. Otherwise things
145    are never going to work. */
146 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
147 {
148         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
149 }
150
151 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
152 {
153         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155 static inline unsigned long page_to_dma_pfn(struct page *pg)
156 {
157         return mm_to_dma_pfn(page_to_pfn(pg));
158 }
159 static inline unsigned long virt_to_dma_pfn(void *p)
160 {
161         return page_to_dma_pfn(virt_to_page(p));
162 }
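/*
 * Note on the pfn conversions above: on x86, PAGE_SHIFT and VTD_PAGE_SHIFT
 * are both 12, so dma_to_mm_pfn()/mm_to_dma_pfn() are identity operations
 * there.  With a larger MM page size one MM pfn covers several 4KiB VT-d
 * pfns, which is why the comment above requires VT-d pages to be no larger
 * than MM pages.
 */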
163
164 /* global iommu list, set NULL for ignored DMAR units */
165 static struct intel_iommu **g_iommus;
166
167 static void __init check_tylersburg_isoch(void);
168 static int rwbf_quirk;
169
170 /*
171  * set to 1 to panic kernel if can't successfully enable VT-d
172  * (used when kernel is launched w/ TXT)
173  */
174 static int force_on = 0;
175
176 /*
177  * 0: Present
178  * 1-11: Reserved
179  * 12-63: Context Ptr (12 - (haw-1))
180  * 64-127: Reserved
181  */
182 struct root_entry {
183         u64     val;
184         u64     rsvd1;
185 };
186 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
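/*
 * A root_entry is two u64s (16 bytes), so ROOT_ENTRY_NR works out to
 * 4096 / 16 == 256 entries per root table: one root entry per PCI bus number,
 * which device_to_context_entry() below indexes as root_entry[bus].
 */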
187 static inline bool root_present(struct root_entry *root)
188 {
189         return (root->val & 1);
190 }
191 static inline void set_root_present(struct root_entry *root)
192 {
193         root->val |= 1;
194 }
195 static inline void set_root_value(struct root_entry *root, unsigned long value)
196 {
197         root->val |= value & VTD_PAGE_MASK;
198 }
199
200 static inline struct context_entry *
201 get_context_addr_from_root(struct root_entry *root)
202 {
203         return (struct context_entry *)
204                 (root_present(root)?phys_to_virt(
205                 root->val & VTD_PAGE_MASK) :
206                 NULL);
207 }
208
209 /*
210  * low 64 bits:
211  * 0: present
212  * 1: fault processing disable
213  * 2-3: translation type
214  * 12-63: address space root
215  * high 64 bits:
216  * 0-2: address width
217  * 3-6: avail
218  * 8-23: domain id
219  */
220 struct context_entry {
221         u64 lo;
222         u64 hi;
223 };
224
225 static inline bool context_present(struct context_entry *context)
226 {
227         return (context->lo & 1);
228 }
229 static inline void context_set_present(struct context_entry *context)
230 {
231         context->lo |= 1;
232 }
233
234 static inline void context_set_fault_enable(struct context_entry *context)
235 {
236         context->lo &= (((u64)-1) << 2) | 1;
237 }
238
239 static inline void context_set_translation_type(struct context_entry *context,
240                                                 unsigned long value)
241 {
242         context->lo &= (((u64)-1) << 4) | 3;
243         context->lo |= (value & 3) << 2;
244 }
245
246 static inline void context_set_address_root(struct context_entry *context,
247                                             unsigned long value)
248 {
249         context->lo |= value & VTD_PAGE_MASK;
250 }
251
252 static inline void context_set_address_width(struct context_entry *context,
253                                              unsigned long value)
254 {
255         context->hi |= value & 7;
256 }
257
258 static inline void context_set_domain_id(struct context_entry *context,
259                                          unsigned long value)
260 {
261         context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263
264 static inline void context_clear_entry(struct context_entry *context)
265 {
266         context->lo = 0;
267         context->hi = 0;
268 }
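/*
 * Illustrative use of the context-entry helpers above (hypothetical values):
 * the setters only OR bits in, so they assume a freshly zeroed entry
 * (context_clear_entry()).  For example, context_set_domain_id(c, 5) places
 * 5 in bits 8-23 of c->hi, and context_set_address_width(c, 2) sets the low
 * three bits of c->hi to 2.
 */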
269
270 /*
271  * 0: readable
272  * 1: writable
273  * 2-6: reserved
274  * 7: super page
275  * 8-10: available
276  * 11: snoop behavior
277  * 12-63: Host physical address
278  */
279 struct dma_pte {
280         u64 val;
281 };
282
283 static inline void dma_clear_pte(struct dma_pte *pte)
284 {
285         pte->val = 0;
286 }
287
288 static inline void dma_set_pte_readable(struct dma_pte *pte)
289 {
290         pte->val |= DMA_PTE_READ;
291 }
292
293 static inline void dma_set_pte_writable(struct dma_pte *pte)
294 {
295         pte->val |= DMA_PTE_WRITE;
296 }
297
298 static inline void dma_set_pte_snp(struct dma_pte *pte)
299 {
300         pte->val |= DMA_PTE_SNP;
301 }
302
303 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304 {
305         pte->val = (pte->val & ~3) | (prot & 3);
306 }
307
308 static inline u64 dma_pte_addr(struct dma_pte *pte)
309 {
310 #ifdef CONFIG_64BIT
311         return pte->val & VTD_PAGE_MASK;
312 #else
313         /* Must have a full atomic 64-bit read */
314         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
315 #endif
316 }
317
318 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319 {
320         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
321 }
322
323 static inline bool dma_pte_present(struct dma_pte *pte)
324 {
325         return (pte->val & 3) != 0;
326 }
327
328 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 {
330         return (pte->val & (1 << 7));
331 }
332
333 static inline int first_pte_in_page(struct dma_pte *pte)
334 {
335         return !((unsigned long)pte & ~VTD_PAGE_MASK);
336 }
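/*
 * A dma_pte is 8 bytes, so 512 of them fill one 4KiB page-table page.
 * first_pte_in_page() is true exactly when the pointer is 4KiB aligned,
 * i.e. it points at entry 0 of its table; the clear/free loops below use it
 * to end a cache-flush batch at page-table page boundaries.
 */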
337
338 /*
339  * This domain is a statically configured identity-mapping domain.
340  *      1. This domain creates a static 1:1 mapping of all usable memory.
341  *      2. It maps to each iommu if successful.
342  *      3. Each iommu maps to this domain if successful.
343  */
344 static struct dmar_domain *si_domain;
345 static int hw_pass_through = 1;
346
347 /* devices under the same p2p bridge are owned in one domain */
348 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349
350 /* domain represents a virtual machine, more than one devices
351  * across iommus may be owned in one domain, e.g. kvm guest.
352  */
353 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
354
355 /* si_domain contains multiple devices */
356 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
357
358 /* define the limit of IOMMUs supported in each domain */
359 #ifdef  CONFIG_X86
360 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
361 #else
362 # define        IOMMU_UNITS_SUPPORTED   64
363 #endif
364
365 struct dmar_domain {
366         int     id;                     /* domain id */
367         int     nid;                    /* node id */
368         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
369                                         /* bitmap of iommus this domain uses*/
370
371         struct list_head devices;       /* all devices' list */
372         struct iova_domain iovad;       /* iova's that belong to this domain */
373
374         struct dma_pte  *pgd;           /* virtual address */
375         int             gaw;            /* max guest address width */
376
377         /* adjusted guest address width, 0 is level 2 30-bit */
378         int             agaw;
379
380         int             flags;          /* flags to find out type of domain */
381
382         int             iommu_coherency;/* indicate coherency of iommu access */
383         int             iommu_snooping; /* indicate snooping control feature*/
384         int             iommu_count;    /* reference count of iommu */
385         int             iommu_superpage;/* Level of superpages supported:
386                                            0 == 4KiB (no superpages), 1 == 2MiB,
387                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
388         spinlock_t      iommu_lock;     /* protect iommu set in domain */
389         u64             max_addr;       /* maximum mapped address */
390 };
391
392 /* PCI domain-device relationship */
393 struct device_domain_info {
394         struct list_head link;  /* link to domain siblings */
395         struct list_head global; /* link to global list */
396         int segment;            /* PCI domain */
397         u8 bus;                 /* PCI bus number */
398         u8 devfn;               /* PCI devfn number */
399         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
400         struct intel_iommu *iommu; /* IOMMU used by this device */
401         struct dmar_domain *domain; /* pointer to domain */
402 };
403
404 static void flush_unmaps_timeout(unsigned long data);
405
406 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
407
408 #define HIGH_WATER_MARK 250
409 struct deferred_flush_tables {
410         int next;
411         struct iova *iova[HIGH_WATER_MARK];
412         struct dmar_domain *domain[HIGH_WATER_MARK];
413 };
414
415 static struct deferred_flush_tables *deferred_flush;
416
417 /* number of IOMMUs in the system; used to size and index g_iommus */
418 static int g_num_of_iommus;
419
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
422
423 static int timer_on;
424 static long list_size;
425
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427
428 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
429 int dmar_disabled = 0;
430 #else
431 int dmar_disabled = 1;
432 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
433
434 int intel_iommu_enabled = 0;
435 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
436
437 static int dmar_map_gfx = 1;
438 static int dmar_forcedac;
439 static int intel_iommu_strict;
440 static int intel_iommu_superpage = 1;
441
442 int intel_iommu_gfx_mapped;
443 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444
445 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446 static DEFINE_SPINLOCK(device_domain_lock);
447 static LIST_HEAD(device_domain_list);
448
449 static struct iommu_ops intel_iommu_ops;
450
451 static int __init intel_iommu_setup(char *str)
452 {
453         if (!str)
454                 return -EINVAL;
455         while (*str) {
456                 if (!strncmp(str, "on", 2)) {
457                         dmar_disabled = 0;
458                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
459                 } else if (!strncmp(str, "off", 3)) {
460                         dmar_disabled = 1;
461                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
462                 } else if (!strncmp(str, "igfx_off", 8)) {
463                         dmar_map_gfx = 0;
464                         printk(KERN_INFO
465                                 "Intel-IOMMU: disable GFX device mapping\n");
466                 } else if (!strncmp(str, "forcedac", 8)) {
467                         printk(KERN_INFO
468                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
469                         dmar_forcedac = 1;
470                 } else if (!strncmp(str, "strict", 6)) {
471                         printk(KERN_INFO
472                                 "Intel-IOMMU: disable batched IOTLB flush\n");
473                         intel_iommu_strict = 1;
474                 } else if (!strncmp(str, "sp_off", 6)) {
475                         printk(KERN_INFO
476                                 "Intel-IOMMU: disable supported super page\n");
477                         intel_iommu_superpage = 0;
478                 }
479
480                 str += strcspn(str, ",");
481                 while (*str == ',')
482                         str++;
483         }
484         return 0;
485 }
486 __setup("intel_iommu=", intel_iommu_setup);
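/*
 * Boot-parameter example (illustrative, using only the tokens parsed above):
 * booting with "intel_iommu=on,strict,igfx_off" enables the IOMMU, disables
 * batched IOTLB flushing and leaves the integrated graphics device unmapped.
 * Tokens are comma separated and unrecognized tokens are silently skipped.
 */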
487
488 static struct kmem_cache *iommu_domain_cache;
489 static struct kmem_cache *iommu_devinfo_cache;
490 static struct kmem_cache *iommu_iova_cache;
491
492 static inline void *alloc_pgtable_page(int node)
493 {
494         struct page *page;
495         void *vaddr = NULL;
496
497         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
498         if (page)
499                 vaddr = page_address(page);
500         return vaddr;
501 }
502
503 static inline void free_pgtable_page(void *vaddr)
504 {
505         free_page((unsigned long)vaddr);
506 }
507
508 static inline void *alloc_domain_mem(void)
509 {
510         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 }
512
513 static void free_domain_mem(void *vaddr)
514 {
515         kmem_cache_free(iommu_domain_cache, vaddr);
516 }
517
518 static inline void * alloc_devinfo_mem(void)
519 {
520         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 }
522
523 static inline void free_devinfo_mem(void *vaddr)
524 {
525         kmem_cache_free(iommu_devinfo_cache, vaddr);
526 }
527
528 struct iova *alloc_iova_mem(void)
529 {
530         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531 }
532
533 void free_iova_mem(struct iova *iova)
534 {
535         kmem_cache_free(iommu_iova_cache, iova);
536 }
537
538
539 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
540 {
541         unsigned long sagaw;
542         int agaw = -1;
543
544         sagaw = cap_sagaw(iommu->cap);
545         for (agaw = width_to_agaw(max_gaw);
546              agaw >= 0; agaw--) {
547                 if (test_bit(agaw, &sagaw))
548                         break;
549         }
550
551         return agaw;
552 }
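/*
 * Example for __iommu_calculate_agaw() above: cap_sagaw() is a bitmap of the
 * adjusted guest address widths the unit supports.  With max_gaw == 48 the
 * loop starts at agaw 2; if only bit 1 is set (a 3-level, 39-bit table) it
 * returns 1, and it returns -1 if nothing at or below the starting agaw is
 * supported.
 */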
553
554 /*
555  * Calculate max SAGAW for each iommu.
556  */
557 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
558 {
559         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
560 }
561
562 /*
563  * calculate agaw for each iommu.
564  * "SAGAW" may be different across iommus, use a default agaw, and
565  * fall back to a smaller supported agaw for iommus that don't support the default agaw.
566  */
567 int iommu_calculate_agaw(struct intel_iommu *iommu)
568 {
569         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
570 }
571
572 /* This function only returns a single iommu in a domain */
573 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
574 {
575         int iommu_id;
576
577         /* si_domain and vm domain should not get here. */
578         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
579         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
580
581         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
582         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
583                 return NULL;
584
585         return g_iommus[iommu_id];
586 }
587
588 static void domain_update_iommu_coherency(struct dmar_domain *domain)
589 {
590         int i;
591
592         i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
593
594         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
595
596         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
597                 if (!ecap_coherent(g_iommus[i]->ecap)) {
598                         domain->iommu_coherency = 0;
599                         break;
600                 }
601         }
602 }
603
604 static void domain_update_iommu_snooping(struct dmar_domain *domain)
605 {
606         int i;
607
608         domain->iommu_snooping = 1;
609
610         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
611                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
612                         domain->iommu_snooping = 0;
613                         break;
614                 }
615         }
616 }
617
618 static void domain_update_iommu_superpage(struct dmar_domain *domain)
619 {
620         struct dmar_drhd_unit *drhd;
621         struct intel_iommu *iommu = NULL;
622         int mask = 0xf;
623
624         if (!intel_iommu_superpage) {
625                 domain->iommu_superpage = 0;
626                 return;
627         }
628
629         /* set iommu_superpage to the smallest common denominator */
630         for_each_active_iommu(iommu, drhd) {
631                 mask &= cap_super_page_val(iommu->cap);
632                 if (!mask) {
633                         break;
634                 }
635         }
636         domain->iommu_superpage = fls(mask);
637 }
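/*
 * domain_update_iommu_superpage() above intersects cap_super_page_val() (a
 * per-unit bitmask of supported superpage levels) across all active iommus;
 * fls() then picks the largest level every unit can handle, matching the
 * iommu_superpage encoding in struct dmar_domain (0 == 4KiB only, since
 * fls(0) == 0).
 */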
638
639 /* Some capabilities may be different across iommus */
640 static void domain_update_iommu_cap(struct dmar_domain *domain)
641 {
642         domain_update_iommu_coherency(domain);
643         domain_update_iommu_snooping(domain);
644         domain_update_iommu_superpage(domain);
645 }
646
647 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
648 {
649         struct dmar_drhd_unit *drhd = NULL;
650         int i;
651
652         for_each_drhd_unit(drhd) {
653                 if (drhd->ignored)
654                         continue;
655                 if (segment != drhd->segment)
656                         continue;
657
658                 for (i = 0; i < drhd->devices_cnt; i++) {
659                         if (drhd->devices[i] &&
660                             drhd->devices[i]->bus->number == bus &&
661                             drhd->devices[i]->devfn == devfn)
662                                 return drhd->iommu;
663                         if (drhd->devices[i] &&
664                             drhd->devices[i]->subordinate &&
665                             drhd->devices[i]->subordinate->number <= bus &&
666                             drhd->devices[i]->subordinate->busn_res.end >= bus)
667                                 return drhd->iommu;
668                 }
669
670                 if (drhd->include_all)
671                         return drhd->iommu;
672         }
673
674         return NULL;
675 }
676
677 static void domain_flush_cache(struct dmar_domain *domain,
678                                void *addr, int size)
679 {
680         if (!domain->iommu_coherency)
681                 clflush_cache_range(addr, size);
682 }
683
684 /* Gets context entry for a given bus and devfn */
685 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
686                 u8 bus, u8 devfn)
687 {
688         struct root_entry *root;
689         struct context_entry *context;
690         unsigned long phy_addr;
691         unsigned long flags;
692
693         spin_lock_irqsave(&iommu->lock, flags);
694         root = &iommu->root_entry[bus];
695         context = get_context_addr_from_root(root);
696         if (!context) {
697                 context = (struct context_entry *)
698                                 alloc_pgtable_page(iommu->node);
699                 if (!context) {
700                         spin_unlock_irqrestore(&iommu->lock, flags);
701                         return NULL;
702                 }
703                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
704                 phy_addr = virt_to_phys((void *)context);
705                 set_root_value(root, phy_addr);
706                 set_root_present(root);
707                 __iommu_flush_cache(iommu, root, sizeof(*root));
708         }
709         spin_unlock_irqrestore(&iommu->lock, flags);
710         return &context[devfn];
711 }
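/*
 * device_to_context_entry() above allocates context tables lazily: if the
 * root entry for 'bus' is empty, it allocates a zeroed page holding 256
 * context entries (indexed by devfn), publishes its physical address via
 * set_root_value()/set_root_present(), flushes both structures for
 * non-coherent hardware, and finally returns &context[devfn].
 */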
712
713 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
714 {
715         struct root_entry *root;
716         struct context_entry *context;
717         int ret;
718         unsigned long flags;
719
720         spin_lock_irqsave(&iommu->lock, flags);
721         root = &iommu->root_entry[bus];
722         context = get_context_addr_from_root(root);
723         if (!context) {
724                 ret = 0;
725                 goto out;
726         }
727         ret = context_present(&context[devfn]);
728 out:
729         spin_unlock_irqrestore(&iommu->lock, flags);
730         return ret;
731 }
732
733 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
734 {
735         struct root_entry *root;
736         struct context_entry *context;
737         unsigned long flags;
738
739         spin_lock_irqsave(&iommu->lock, flags);
740         root = &iommu->root_entry[bus];
741         context = get_context_addr_from_root(root);
742         if (context) {
743                 context_clear_entry(&context[devfn]);
744                 __iommu_flush_cache(iommu, &context[devfn], \
745                         sizeof(*context));
746         }
747         spin_unlock_irqrestore(&iommu->lock, flags);
748 }
749
750 static void free_context_table(struct intel_iommu *iommu)
751 {
752         struct root_entry *root;
753         int i;
754         unsigned long flags;
755         struct context_entry *context;
756
757         spin_lock_irqsave(&iommu->lock, flags);
758         if (!iommu->root_entry) {
759                 goto out;
760         }
761         for (i = 0; i < ROOT_ENTRY_NR; i++) {
762                 root = &iommu->root_entry[i];
763                 context = get_context_addr_from_root(root);
764                 if (context)
765                         free_pgtable_page(context);
766         }
767         free_pgtable_page(iommu->root_entry);
768         iommu->root_entry = NULL;
769 out:
770         spin_unlock_irqrestore(&iommu->lock, flags);
771 }
772
773 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
774                                       unsigned long pfn, int target_level)
775 {
776         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
777         struct dma_pte *parent, *pte = NULL;
778         int level = agaw_to_level(domain->agaw);
779         int offset;
780
781         BUG_ON(!domain->pgd);
782         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
783         parent = domain->pgd;
784
785         while (level > 0) {
786                 void *tmp_page;
787
788                 offset = pfn_level_offset(pfn, level);
789                 pte = &parent[offset];
790                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
791                         break;
792                 if (level == target_level)
793                         break;
794
795                 if (!dma_pte_present(pte)) {
796                         uint64_t pteval;
797
798                         tmp_page = alloc_pgtable_page(domain->nid);
799
800                         if (!tmp_page)
801                                 return NULL;
802
803                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
804                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
805                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
806                                 /* Someone else set it while we were thinking; use theirs. */
807                                 free_pgtable_page(tmp_page);
808                         } else {
809                                 dma_pte_addr(pte);
810                                 domain_flush_cache(domain, pte, sizeof(*pte));
811                         }
812                 }
813                 parent = phys_to_virt(dma_pte_addr(pte));
814                 level--;
815         }
816
817         return pte;
818 }
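/*
 * Notes on pfn_to_dma_pte() above: target_level picks where the walk stops
 * (1 == the 4KiB leaf, 2 == the 2MiB slot, and 0 == "whatever is already
 * there", stopping at a superpage or the first non-present entry).  Missing
 * intermediate tables are installed with cmpxchg64(), so a racing walker's
 * freshly allocated page is freed rather than overwritten.
 */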
819
820
821 /* return address's pte at specific level */
822 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
823                                          unsigned long pfn,
824                                          int level, int *large_page)
825 {
826         struct dma_pte *parent, *pte = NULL;
827         int total = agaw_to_level(domain->agaw);
828         int offset;
829
830         parent = domain->pgd;
831         while (level <= total) {
832                 offset = pfn_level_offset(pfn, total);
833                 pte = &parent[offset];
834                 if (level == total)
835                         return pte;
836
837                 if (!dma_pte_present(pte)) {
838                         *large_page = total;
839                         break;
840                 }
841
842                 if (pte->val & DMA_PTE_LARGE_PAGE) {
843                         *large_page = total;
844                         return pte;
845                 }
846
847                 parent = phys_to_virt(dma_pte_addr(pte));
848                 total--;
849         }
850         return NULL;
851 }
852
853 /* clear last level pte; a tlb flush should follow */
854 static int dma_pte_clear_range(struct dmar_domain *domain,
855                                 unsigned long start_pfn,
856                                 unsigned long last_pfn)
857 {
858         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
859         unsigned int large_page = 1;
860         struct dma_pte *first_pte, *pte;
861         int order;
862
863         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
864         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
865         BUG_ON(start_pfn > last_pfn);
866
867         /* we don't need lock here; nobody else touches the iova range */
868         do {
869                 large_page = 1;
870                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
871                 if (!pte) {
872                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
873                         continue;
874                 }
875                 do {
876                         dma_clear_pte(pte);
877                         start_pfn += lvl_to_nr_pages(large_page);
878                         pte++;
879                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
880
881                 domain_flush_cache(domain, first_pte,
882                                    (void *)pte - (void *)first_pte);
883
884         } while (start_pfn && start_pfn <= last_pfn);
885
886         order = (large_page - 1) * 9;
887         return order;
888 }
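/*
 * dma_pte_clear_range() above clears leaf PTEs in runs that stay within one
 * page-table page, so at most one domain_flush_cache() is needed per table
 * page on non-coherent hardware.  The return value, (large_page - 1) * 9, is
 * the order (in 4KiB pages) of the last superpage level encountered.
 */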
889
890 /* free page table pages. last level pte should already be cleared */
891 static void dma_pte_free_pagetable(struct dmar_domain *domain,
892                                    unsigned long start_pfn,
893                                    unsigned long last_pfn)
894 {
895         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
896         struct dma_pte *first_pte, *pte;
897         int total = agaw_to_level(domain->agaw);
898         int level;
899         unsigned long tmp;
900         int large_page = 2;
901
902         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
903         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
904         BUG_ON(start_pfn > last_pfn);
905
906         /* We don't need lock here; nobody else touches the iova range */
907         level = 2;
908         while (level <= total) {
909                 tmp = align_to_level(start_pfn, level);
910
911                 /* If we can't even clear one PTE at this level, we're done */
912                 if (tmp + level_size(level) - 1 > last_pfn)
913                         return;
914
915                 do {
916                         large_page = level;
917                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
918                         if (large_page > level)
919                                 level = large_page + 1;
920                         if (!pte) {
921                                 tmp = align_to_level(tmp + 1, level + 1);
922                                 continue;
923                         }
924                         do {
925                                 if (dma_pte_present(pte)) {
926                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
927                                         dma_clear_pte(pte);
928                                 }
929                                 pte++;
930                                 tmp += level_size(level);
931                         } while (!first_pte_in_page(pte) &&
932                                  tmp + level_size(level) - 1 <= last_pfn);
933
934                         domain_flush_cache(domain, first_pte,
935                                            (void *)pte - (void *)first_pte);
936                         
937                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
938                 level++;
939         }
940         /* free pgd */
941         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
942                 free_pgtable_page(domain->pgd);
943                 domain->pgd = NULL;
944         }
945 }
946
947 /* iommu handling */
948 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
949 {
950         struct root_entry *root;
951         unsigned long flags;
952
953         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
954         if (!root)
955                 return -ENOMEM;
956
957         __iommu_flush_cache(iommu, root, ROOT_SIZE);
958
959         spin_lock_irqsave(&iommu->lock, flags);
960         iommu->root_entry = root;
961         spin_unlock_irqrestore(&iommu->lock, flags);
962
963         return 0;
964 }
965
966 static void iommu_set_root_entry(struct intel_iommu *iommu)
967 {
968         void *addr;
969         u32 sts;
970         unsigned long flag;
971
972         addr = iommu->root_entry;
973
974         raw_spin_lock_irqsave(&iommu->register_lock, flag);
975         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
976
977         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
978
979         /* Make sure hardware completes it */
980         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
981                       readl, (sts & DMA_GSTS_RTPS), sts);
982
983         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
984 }
985
986 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
987 {
988         u32 val;
989         unsigned long flag;
990
991         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
992                 return;
993
994         raw_spin_lock_irqsave(&iommu->register_lock, flag);
995         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
996
997         /* Make sure hardware completes it */
998         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
999                       readl, (!(val & DMA_GSTS_WBFS)), val);
1000
1001         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1002 }
1003
1004 /* return value determines if we need a write buffer flush */
1005 static void __iommu_flush_context(struct intel_iommu *iommu,
1006                                   u16 did, u16 source_id, u8 function_mask,
1007                                   u64 type)
1008 {
1009         u64 val = 0;
1010         unsigned long flag;
1011
1012         switch (type) {
1013         case DMA_CCMD_GLOBAL_INVL:
1014                 val = DMA_CCMD_GLOBAL_INVL;
1015                 break;
1016         case DMA_CCMD_DOMAIN_INVL:
1017                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1018                 break;
1019         case DMA_CCMD_DEVICE_INVL:
1020                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1021                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1022                 break;
1023         default:
1024                 BUG();
1025         }
1026         val |= DMA_CCMD_ICC;
1027
1028         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1029         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1030
1031         /* Make sure hardware completes it */
1032         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1033                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1034
1035         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1036 }
1037
1038 /* return value determines if we need a write buffer flush */
1039 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1040                                 u64 addr, unsigned int size_order, u64 type)
1041 {
1042         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1043         u64 val = 0, val_iva = 0;
1044         unsigned long flag;
1045
1046         switch (type) {
1047         case DMA_TLB_GLOBAL_FLUSH:
1048                 /* global flush doesn't need to set IVA_REG */
1049                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1050                 break;
1051         case DMA_TLB_DSI_FLUSH:
1052                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1053                 break;
1054         case DMA_TLB_PSI_FLUSH:
1055                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1056                 /* Note: always flush non-leaf currently */
1057                 val_iva = size_order | addr;
1058                 break;
1059         default:
1060                 BUG();
1061         }
1062         /* Note: set drain read/write */
1063 #if 0
1064         /*
1065          * This is probably meant to be extra safe.  Looks like we can
1066          * ignore it without any impact.
1067          */
1068         if (cap_read_drain(iommu->cap))
1069                 val |= DMA_TLB_READ_DRAIN;
1070 #endif
1071         if (cap_write_drain(iommu->cap))
1072                 val |= DMA_TLB_WRITE_DRAIN;
1073
1074         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1075         /* Note: Only uses first TLB reg currently */
1076         if (val_iva)
1077                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1078         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1079
1080         /* Make sure hardware completes it */
1081         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1082                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1083
1084         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1085
1086         /* check IOTLB invalidation granularity */
1087         if (DMA_TLB_IAIG(val) == 0)
1088                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1089         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1090                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1091                         (unsigned long long)DMA_TLB_IIRG(type),
1092                         (unsigned long long)DMA_TLB_IAIG(val));
1093 }
1094
1095 static struct device_domain_info *iommu_support_dev_iotlb(
1096         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1097 {
1098         int found = 0;
1099         unsigned long flags;
1100         struct device_domain_info *info;
1101         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1102
1103         if (!ecap_dev_iotlb_support(iommu->ecap))
1104                 return NULL;
1105
1106         if (!iommu->qi)
1107                 return NULL;
1108
1109         spin_lock_irqsave(&device_domain_lock, flags);
1110         list_for_each_entry(info, &domain->devices, link)
1111                 if (info->bus == bus && info->devfn == devfn) {
1112                         found = 1;
1113                         break;
1114                 }
1115         spin_unlock_irqrestore(&device_domain_lock, flags);
1116
1117         if (!found || !info->dev)
1118                 return NULL;
1119
1120         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1121                 return NULL;
1122
1123         if (!dmar_find_matched_atsr_unit(info->dev))
1124                 return NULL;
1125
1126         info->iommu = iommu;
1127
1128         return info;
1129 }
1130
1131 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1132 {
1133         if (!info)
1134                 return;
1135
1136         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1137 }
1138
1139 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1140 {
1141         if (!info->dev || !pci_ats_enabled(info->dev))
1142                 return;
1143
1144         pci_disable_ats(info->dev);
1145 }
1146
1147 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1148                                   u64 addr, unsigned mask)
1149 {
1150         u16 sid, qdep;
1151         unsigned long flags;
1152         struct device_domain_info *info;
1153
1154         spin_lock_irqsave(&device_domain_lock, flags);
1155         list_for_each_entry(info, &domain->devices, link) {
1156                 if (!info->dev || !pci_ats_enabled(info->dev))
1157                         continue;
1158
1159                 sid = info->bus << 8 | info->devfn;
1160                 qdep = pci_ats_queue_depth(info->dev);
1161                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1162         }
1163         spin_unlock_irqrestore(&device_domain_lock, flags);
1164 }
1165
1166 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1167                                   unsigned long pfn, unsigned int pages, int map)
1168 {
1169         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1170         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1171
1172         BUG_ON(pages == 0);
1173
1174         /*
1175          * Fall back to domain-selective flush if there is no PSI support or the
1176          * size is too big.
1177          * PSI requires the page size to be 2^x and the base address to be
1178          * naturally aligned to that size.
1179          */
1180         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1181                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1182                                                 DMA_TLB_DSI_FLUSH);
1183         else
1184                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1185                                                 DMA_TLB_PSI_FLUSH);
1186
1187         /*
1188          * In caching mode, changes of pages from non-present to present require a
1189          * flush. However, the device IOTLB doesn't need to be flushed in this case.
1190          */
1191         if (!cap_caching_mode(iommu->cap) || !map)
1192                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1193 }
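/*
 * Example for iommu_flush_iotlb_psi() above: 'pages' is rounded up to a power
 * of two and turned into an address mask, e.g. pages == 3 gives
 * ilog2(roundup_pow_of_two(3)) == 2, i.e. a naturally aligned 4-page (16KiB)
 * invalidation.  If the mask exceeds cap_max_amask_val() or PSI is not
 * supported, the code falls back to a domain-selective flush instead.
 */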
1194
1195 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1196 {
1197         u32 pmen;
1198         unsigned long flags;
1199
1200         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1201         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1202         pmen &= ~DMA_PMEN_EPM;
1203         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1204
1205         /* wait for the protected region status bit to clear */
1206         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1207                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1208
1209         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1210 }
1211
1212 static int iommu_enable_translation(struct intel_iommu *iommu)
1213 {
1214         u32 sts;
1215         unsigned long flags;
1216
1217         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218         iommu->gcmd |= DMA_GCMD_TE;
1219         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1220
1221         /* Make sure hardware completes it */
1222         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                       readl, (sts & DMA_GSTS_TES), sts);
1224
1225         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1226         return 0;
1227 }
1228
1229 static int iommu_disable_translation(struct intel_iommu *iommu)
1230 {
1231         u32 sts;
1232         unsigned long flag;
1233
1234         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1235         iommu->gcmd &= ~DMA_GCMD_TE;
1236         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237
1238         /* Make sure hardware completes it */
1239         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240                       readl, (!(sts & DMA_GSTS_TES)), sts);
1241
1242         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243         return 0;
1244 }
1245
1246
1247 static int iommu_init_domains(struct intel_iommu *iommu)
1248 {
1249         unsigned long ndomains;
1250         unsigned long nlongs;
1251
1252         ndomains = cap_ndoms(iommu->cap);
1253         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1254                         ndomains);
1255         nlongs = BITS_TO_LONGS(ndomains);
1256
1257         spin_lock_init(&iommu->lock);
1258
1259         /* TBD: there might be 64K domains,
1260          * consider other allocation for future chip
1261          */
1262         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1263         if (!iommu->domain_ids) {
1264                 printk(KERN_ERR "Allocating domain id array failed\n");
1265                 return -ENOMEM;
1266         }
1267         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1268                         GFP_KERNEL);
1269         if (!iommu->domains) {
1270                 printk(KERN_ERR "Allocating domain array failed\n");
1271                 return -ENOMEM;
1272         }
1273
1274         /*
1275          * if Caching mode is set, then invalid translations are tagged
1276          * with domain id 0. Hence we need to pre-allocate it.
1277          */
1278         if (cap_caching_mode(iommu->cap))
1279                 set_bit(0, iommu->domain_ids);
1280         return 0;
1281 }
1282
1283
1284 static void domain_exit(struct dmar_domain *domain);
1285 static void vm_domain_exit(struct dmar_domain *domain);
1286
1287 void free_dmar_iommu(struct intel_iommu *iommu)
1288 {
1289         struct dmar_domain *domain;
1290         int i;
1291         unsigned long flags;
1292
1293         if ((iommu->domains) && (iommu->domain_ids)) {
1294                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1295                         domain = iommu->domains[i];
1296                         clear_bit(i, iommu->domain_ids);
1297
1298                         spin_lock_irqsave(&domain->iommu_lock, flags);
1299                         if (--domain->iommu_count == 0) {
1300                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1301                                         vm_domain_exit(domain);
1302                                 else
1303                                         domain_exit(domain);
1304                         }
1305                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1306                 }
1307         }
1308
1309         if (iommu->gcmd & DMA_GCMD_TE)
1310                 iommu_disable_translation(iommu);
1311
1312         if (iommu->irq) {
1313                 irq_set_handler_data(iommu->irq, NULL);
1314                 /* This will mask the irq */
1315                 free_irq(iommu->irq, iommu);
1316                 destroy_irq(iommu->irq);
1317         }
1318
1319         kfree(iommu->domains);
1320         kfree(iommu->domain_ids);
1321
1322         g_iommus[iommu->seq_id] = NULL;
1323
1324         /* if all iommus are freed, free g_iommus */
1325         for (i = 0; i < g_num_of_iommus; i++) {
1326                 if (g_iommus[i])
1327                         break;
1328         }
1329
1330         if (i == g_num_of_iommus)
1331                 kfree(g_iommus);
1332
1333         /* free context mapping */
1334         free_context_table(iommu);
1335 }
1336
1337 static struct dmar_domain *alloc_domain(void)
1338 {
1339         struct dmar_domain *domain;
1340
1341         domain = alloc_domain_mem();
1342         if (!domain)
1343                 return NULL;
1344
1345         domain->nid = -1;
1346         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1347         domain->flags = 0;
1348
1349         return domain;
1350 }
1351
1352 static int iommu_attach_domain(struct dmar_domain *domain,
1353                                struct intel_iommu *iommu)
1354 {
1355         int num;
1356         unsigned long ndomains;
1357         unsigned long flags;
1358
1359         ndomains = cap_ndoms(iommu->cap);
1360
1361         spin_lock_irqsave(&iommu->lock, flags);
1362
1363         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1364         if (num >= ndomains) {
1365                 spin_unlock_irqrestore(&iommu->lock, flags);
1366                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1367                 return -ENOMEM;
1368         }
1369
1370         domain->id = num;
1371         set_bit(num, iommu->domain_ids);
1372         set_bit(iommu->seq_id, domain->iommu_bmp);
1373         iommu->domains[num] = domain;
1374         spin_unlock_irqrestore(&iommu->lock, flags);
1375
1376         return 0;
1377 }
1378
1379 static void iommu_detach_domain(struct dmar_domain *domain,
1380                                 struct intel_iommu *iommu)
1381 {
1382         unsigned long flags;
1383         int num, ndomains;
1384         int found = 0;
1385
1386         spin_lock_irqsave(&iommu->lock, flags);
1387         ndomains = cap_ndoms(iommu->cap);
1388         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1389                 if (iommu->domains[num] == domain) {
1390                         found = 1;
1391                         break;
1392                 }
1393         }
1394
1395         if (found) {
1396                 clear_bit(num, iommu->domain_ids);
1397                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1398                 iommu->domains[num] = NULL;
1399         }
1400         spin_unlock_irqrestore(&iommu->lock, flags);
1401 }
1402
1403 static struct iova_domain reserved_iova_list;
1404 static struct lock_class_key reserved_rbtree_key;
1405
1406 static int dmar_init_reserved_ranges(void)
1407 {
1408         struct pci_dev *pdev = NULL;
1409         struct iova *iova;
1410         int i;
1411
1412         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1413
1414         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1415                 &reserved_rbtree_key);
1416
1417         /* IOAPIC ranges shouldn't be accessed by DMA */
1418         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1419                 IOVA_PFN(IOAPIC_RANGE_END));
1420         if (!iova) {
1421                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1422                 return -ENODEV;
1423         }
1424
1425         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1426         for_each_pci_dev(pdev) {
1427                 struct resource *r;
1428
1429                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1430                         r = &pdev->resource[i];
1431                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1432                                 continue;
1433                         iova = reserve_iova(&reserved_iova_list,
1434                                             IOVA_PFN(r->start),
1435                                             IOVA_PFN(r->end));
1436                         if (!iova) {
1437                                 printk(KERN_ERR "Reserve iova failed\n");
1438                                 return -ENODEV;
1439                         }
1440                 }
1441         }
1442         return 0;
1443 }
1444
1445 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1446 {
1447         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1448 }
1449
1450 static inline int guestwidth_to_adjustwidth(int gaw)
1451 {
1452         int agaw;
1453         int r = (gaw - 12) % 9;
1454
1455         if (r == 0)
1456                 agaw = gaw;
1457         else
1458                 agaw = gaw + 9 - r;
1459         if (agaw > 64)
1460                 agaw = 64;
1461         return agaw;
1462 }
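/*
 * Worked example for guestwidth_to_adjustwidth() above: the page offset uses
 * 12 bits and each table level resolves 9 more, so the guest width is rounded
 * up to the next 12 + 9*n value.  gaw 39 and 48 are already on a level
 * boundary and stay unchanged, while gaw 36 gives r == (36 - 12) % 9 == 6 and
 * therefore agaw == 36 + 9 - 6 == 39.
 */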
1463
1464 static int domain_init(struct dmar_domain *domain, int guest_width)
1465 {
1466         struct intel_iommu *iommu;
1467         int adjust_width, agaw;
1468         unsigned long sagaw;
1469
1470         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1471         spin_lock_init(&domain->iommu_lock);
1472
1473         domain_reserve_special_ranges(domain);
1474
1475         /* calculate AGAW */
1476         iommu = domain_get_iommu(domain);
1477         if (guest_width > cap_mgaw(iommu->cap))
1478                 guest_width = cap_mgaw(iommu->cap);
1479         domain->gaw = guest_width;
1480         adjust_width = guestwidth_to_adjustwidth(guest_width);
1481         agaw = width_to_agaw(adjust_width);
1482         sagaw = cap_sagaw(iommu->cap);
1483         if (!test_bit(agaw, &sagaw)) {
1484                 /* hardware doesn't support it, choose a bigger one */
1485                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1486                 agaw = find_next_bit(&sagaw, 5, agaw);
1487                 if (agaw >= 5)
1488                         return -ENODEV;
1489         }
1490         domain->agaw = agaw;
1491         INIT_LIST_HEAD(&domain->devices);
1492
1493         if (ecap_coherent(iommu->ecap))
1494                 domain->iommu_coherency = 1;
1495         else
1496                 domain->iommu_coherency = 0;
1497
1498         if (ecap_sc_support(iommu->ecap))
1499                 domain->iommu_snooping = 1;
1500         else
1501                 domain->iommu_snooping = 0;
1502
1503         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1504         domain->iommu_count = 1;
1505         domain->nid = iommu->node;
1506
1507         /* always allocate the top pgd */
1508         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1509         if (!domain->pgd)
1510                 return -ENOMEM;
1511         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1512         return 0;
1513 }
1514
1515 static void domain_exit(struct dmar_domain *domain)
1516 {
1517         struct dmar_drhd_unit *drhd;
1518         struct intel_iommu *iommu;
1519
1520         /* Domain 0 is reserved, so don't process it */
1521         if (!domain)
1522                 return;
1523
1524         /* Flush any lazy unmaps that may reference this domain */
1525         if (!intel_iommu_strict)
1526                 flush_unmaps_timeout(0);
1527
1528         domain_remove_dev_info(domain);
1529         /* destroy iovas */
1530         put_iova_domain(&domain->iovad);
1531
1532         /* clear ptes */
1533         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1534
1535         /* free page tables */
1536         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1537
1538         for_each_active_iommu(iommu, drhd)
1539                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1540                         iommu_detach_domain(domain, iommu);
1541
1542         free_domain_mem(domain);
1543 }
1544
1545 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1546                                  u8 bus, u8 devfn, int translation)
1547 {
1548         struct context_entry *context;
1549         unsigned long flags;
1550         struct intel_iommu *iommu;
1551         struct dma_pte *pgd;
1552         unsigned long num;
1553         unsigned long ndomains;
1554         int id;
1555         int agaw;
1556         struct device_domain_info *info = NULL;
1557
1558         pr_debug("Set context mapping for %02x:%02x.%d\n",
1559                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1560
1561         BUG_ON(!domain->pgd);
1562         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1563                translation != CONTEXT_TT_MULTI_LEVEL);
1564
1565         iommu = device_to_iommu(segment, bus, devfn);
1566         if (!iommu)
1567                 return -ENODEV;
1568
1569         context = device_to_context_entry(iommu, bus, devfn);
1570         if (!context)
1571                 return -ENOMEM;
1572         spin_lock_irqsave(&iommu->lock, flags);
1573         if (context_present(context)) {
1574                 spin_unlock_irqrestore(&iommu->lock, flags);
1575                 return 0;
1576         }
1577
1578         id = domain->id;
1579         pgd = domain->pgd;
1580
1581         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1582             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1583                 int found = 0;
1584
1585                 /* find an available domain id for this device in iommu */
1586                 ndomains = cap_ndoms(iommu->cap);
1587                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1588                         if (iommu->domains[num] == domain) {
1589                                 id = num;
1590                                 found = 1;
1591                                 break;
1592                         }
1593                 }
1594
1595                 if (found == 0) {
1596                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1597                         if (num >= ndomains) {
1598                                 spin_unlock_irqrestore(&iommu->lock, flags);
1599                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1600                                 return -EFAULT;
1601                         }
1602
1603                         set_bit(num, iommu->domain_ids);
1604                         iommu->domains[num] = domain;
1605                         id = num;
1606                 }
1607
1608                 /* Skip top levels of page tables for an
1609                  * iommu which has a smaller agaw than the default.
1610                  * Unnecessary for PT mode.
1611                  */
1612                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1613                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1614                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1615                                 if (!dma_pte_present(pgd)) {
1616                                         spin_unlock_irqrestore(&iommu->lock, flags);
1617                                         return -ENOMEM;
1618                                 }
1619                         }
1620                 }
1621         }
1622
1623         context_set_domain_id(context, id);
1624
1625         if (translation != CONTEXT_TT_PASS_THROUGH) {
1626                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1627                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1628                                      CONTEXT_TT_MULTI_LEVEL;
1629         }
1630         /*
1631          * In pass through mode, AW must be programmed to indicate the largest
1632          * AGAW value supported by hardware. And ASR is ignored by hardware.
1633          */
1634         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1635                 context_set_address_width(context, iommu->msagaw);
1636         else {
1637                 context_set_address_root(context, virt_to_phys(pgd));
1638                 context_set_address_width(context, iommu->agaw);
1639         }
1640
1641         context_set_translation_type(context, translation);
1642         context_set_fault_enable(context);
1643         context_set_present(context);
1644         domain_flush_cache(domain, context, sizeof(*context));
1645
1646         /*
1647          * It's a non-present to present mapping. If hardware doesn't cache
1648          * non-present entries we only need to flush the write-buffer. If it
1649          * _does_ cache non-present entries, then it does so in the special
1650          * domain #0, which we have to flush:
1651          */
1652         if (cap_caching_mode(iommu->cap)) {
1653                 iommu->flush.flush_context(iommu, 0,
1654                                            (((u16)bus) << 8) | devfn,
1655                                            DMA_CCMD_MASK_NOBIT,
1656                                            DMA_CCMD_DEVICE_INVL);
1657                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1658         } else {
1659                 iommu_flush_write_buffer(iommu);
1660         }
1661         iommu_enable_dev_iotlb(info);
1662         spin_unlock_irqrestore(&iommu->lock, flags);
1663
1664         spin_lock_irqsave(&domain->iommu_lock, flags);
1665         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1666                 domain->iommu_count++;
1667                 if (domain->iommu_count == 1)
1668                         domain->nid = iommu->node;
1669                 domain_update_iommu_cap(domain);
1670         }
1671         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1672         return 0;
1673 }
1674
1675 static int
1676 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1677                         int translation)
1678 {
1679         int ret;
1680         struct pci_dev *tmp, *parent;
1681
1682         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1683                                          pdev->bus->number, pdev->devfn,
1684                                          translation);
1685         if (ret)
1686                 return ret;
1687
1688         /* dependent device mapping */
1689         tmp = pci_find_upstream_pcie_bridge(pdev);
1690         if (!tmp)
1691                 return 0;
1692         /* Secondary interface's bus number and devfn 0 */
1693         parent = pdev->bus->self;
1694         while (parent != tmp) {
1695                 ret = domain_context_mapping_one(domain,
1696                                                  pci_domain_nr(parent->bus),
1697                                                  parent->bus->number,
1698                                                  parent->devfn, translation);
1699                 if (ret)
1700                         return ret;
1701                 parent = parent->bus->self;
1702         }
1703         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1704                 return domain_context_mapping_one(domain,
1705                                         pci_domain_nr(tmp->subordinate),
1706                                         tmp->subordinate->number, 0,
1707                                         translation);
1708         else /* this is a legacy PCI bridge */
1709                 return domain_context_mapping_one(domain,
1710                                                   pci_domain_nr(tmp->bus),
1711                                                   tmp->bus->number,
1712                                                   tmp->devfn,
1713                                                   translation);
1714 }
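
/*
 * Worked example for domain_context_mapping(), assuming a hypothetical
 * topology: a conventional PCI device 0000:06:03.0 behind a PCIe-to-PCI
 * bridge 0000:00:1e.0 whose secondary bus is 06.  A call such as
 *
 *	domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
 *
 * then programs context entries for
 *   - the device 0000:06:03.0 itself,
 *   - any PCI-to-PCI bridges on the path below the upstream PCIe bridge
 *     (none in this simple topology), and
 *   - (bus 06, devfn 0) for the PCIe-to-PCI bridge, since that is the
 *     source-id the bridge may present when forwarding its children's DMA.
 * For a legacy PCI bridge the bridge's own bus/devfn is mapped instead.
 */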
1715
1716 static int domain_context_mapped(struct pci_dev *pdev)
1717 {
1718         int ret;
1719         struct pci_dev *tmp, *parent;
1720         struct intel_iommu *iommu;
1721
1722         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1723                                 pdev->devfn);
1724         if (!iommu)
1725                 return -ENODEV;
1726
1727         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1728         if (!ret)
1729                 return ret;
1730         /* dependent device mapping */
1731         tmp = pci_find_upstream_pcie_bridge(pdev);
1732         if (!tmp)
1733                 return ret;
1734         /* Secondary interface's bus number and devfn 0 */
1735         parent = pdev->bus->self;
1736         while (parent != tmp) {
1737                 ret = device_context_mapped(iommu, parent->bus->number,
1738                                             parent->devfn);
1739                 if (!ret)
1740                         return ret;
1741                 parent = parent->bus->self;
1742         }
1743         if (pci_is_pcie(tmp))
1744                 return device_context_mapped(iommu, tmp->subordinate->number,
1745                                              0);
1746         else
1747                 return device_context_mapped(iommu, tmp->bus->number,
1748                                              tmp->devfn);
1749 }
1750
1751 /* Returns a number of VTD pages, but aligned to MM page size */
1752 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1753                                             size_t size)
1754 {
1755         host_addr &= ~PAGE_MASK;
1756         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1757 }
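
/*
 * Worked example for aligned_nrpages(), assuming 4KiB pages on both the
 * MM and the VT-d side (PAGE_SHIFT == VTD_PAGE_SHIFT == 12):
 *
 *	aligned_nrpages(0x1234, 0x2000)
 *	  -> offset within the MM page: 0x1234 & ~PAGE_MASK == 0x234
 *	  -> PAGE_ALIGN(0x234 + 0x2000) == 0x3000
 *	  -> 0x3000 >> VTD_PAGE_SHIFT == 3 VT-d pages
 *
 * i.e. an 8KiB buffer that straddles a page boundary needs three VT-d
 * page mappings even though its length alone would only need two.
 */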
1758
1759 /* Return largest possible superpage level for a given mapping */
1760 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1761                                           unsigned long iov_pfn,
1762                                           unsigned long phy_pfn,
1763                                           unsigned long pages)
1764 {
1765         int support, level = 1;
1766         unsigned long pfnmerge;
1767
1768         support = domain->iommu_superpage;
1769
1770         /* To use a large page, the virtual *and* physical addresses
1771            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1772            of them will mean we have to use smaller pages. So just
1773            merge them and check both at once. */
1774         pfnmerge = iov_pfn | phy_pfn;
1775
1776         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1777                 pages >>= VTD_STRIDE_SHIFT;
1778                 if (!pages)
1779                         break;
1780                 pfnmerge >>= VTD_STRIDE_SHIFT;
1781                 level++;
1782                 support--;
1783         }
1784         return level;
1785 }
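
/*
 * Worked example for hardware_largepage_caps(), assuming VTD_STRIDE_SHIFT
 * is 9 (so ~VTD_STRIDE_MASK covers the low nine bits of a pfn) and an
 * IOMMU that advertises one superpage level (iommu_superpage == 1, 2MiB):
 *
 *	iov_pfn = 0x200, phy_pfn = 0x1400, pages = 0x400 (4MiB)
 *	  -> pfnmerge = 0x1600, low nine bits clear: both sides 2MiB aligned
 *	  -> one loop iteration: level becomes 2, support drops to 0
 *	  -> returns 2, so the caller may use 2MiB superpages
 *
 * If either pfn had any of its low nine bits set (say phy_pfn = 0x1401),
 * the loop never runs and level 1 (4KiB pages) is returned.
 */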
1786
1787 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1788                             struct scatterlist *sg, unsigned long phys_pfn,
1789                             unsigned long nr_pages, int prot)
1790 {
1791         struct dma_pte *first_pte = NULL, *pte = NULL;
1792         phys_addr_t uninitialized_var(pteval);
1793         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1794         unsigned long sg_res;
1795         unsigned int largepage_lvl = 0;
1796         unsigned long lvl_pages = 0;
1797
1798         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1799
1800         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1801                 return -EINVAL;
1802
1803         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1804
1805         if (sg)
1806                 sg_res = 0;
1807         else {
1808                 sg_res = nr_pages + 1;
1809                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1810         }
1811
1812         while (nr_pages > 0) {
1813                 uint64_t tmp;
1814
1815                 if (!sg_res) {
1816                         sg_res = aligned_nrpages(sg->offset, sg->length);
1817                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1818                         sg->dma_length = sg->length;
1819                         pteval = page_to_phys(sg_page(sg)) | prot;
1820                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1821                 }
1822
1823                 if (!pte) {
1824                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1825
1826                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1827                         if (!pte)
1828                                 return -ENOMEM;
1829                         /* It is a large page */
1830                         if (largepage_lvl > 1)
1831                                 pteval |= DMA_PTE_LARGE_PAGE;
1832                         else
1833                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1834
1835                 }
1836                 /* We don't need a lock here; nobody else
1837                  * touches this iova range.
1838                  */
1839                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1840                 if (tmp) {
1841                         static int dumps = 5;
1842                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1843                                iov_pfn, tmp, (unsigned long long)pteval);
1844                         if (dumps) {
1845                                 dumps--;
1846                                 debug_dma_dump_mappings(NULL);
1847                         }
1848                         WARN_ON(1);
1849                 }
1850
1851                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1852
1853                 BUG_ON(nr_pages < lvl_pages);
1854                 BUG_ON(sg_res < lvl_pages);
1855
1856                 nr_pages -= lvl_pages;
1857                 iov_pfn += lvl_pages;
1858                 phys_pfn += lvl_pages;
1859                 pteval += lvl_pages * VTD_PAGE_SIZE;
1860                 sg_res -= lvl_pages;
1861
1862                 /* If the next PTE would be the first in a new page, then we
1863                    need to flush the cache on the entries we've just written.
1864                    And then we'll need to recalculate 'pte', so clear it and
1865                    let it get set again in the if (!pte) block above.
1866
1867                    If we're done (!nr_pages) we need to flush the cache too.
1868
1869                    Also if we've been setting superpages, we may need to
1870                    recalculate 'pte' and switch back to smaller pages for the
1871                    end of the mapping, if the trailing size is not enough to
1872                    use another superpage (i.e. sg_res < lvl_pages). */
1873                 pte++;
1874                 if (!nr_pages || first_pte_in_page(pte) ||
1875                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1876                         domain_flush_cache(domain, first_pte,
1877                                            (void *)pte - (void *)first_pte);
1878                         pte = NULL;
1879                 }
1880
1881                 if (!sg_res && nr_pages)
1882                         sg = sg_next(sg);
1883         }
1884         return 0;
1885 }
1886
1887 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1888                                     struct scatterlist *sg, unsigned long nr_pages,
1889                                     int prot)
1890 {
1891         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1892 }
1893
1894 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1895                                      unsigned long phys_pfn, unsigned long nr_pages,
1896                                      int prot)
1897 {
1898         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1899 }
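
/*
 * Minimal usage sketch for the two wrappers above.  It is compiled out and
 * its function and variable names are hypothetical; real callers (the
 * identity-map path below and the DMA map paths later in this file) also
 * handle iova allocation, locking and cache/IOTLB flushing.
 */
#if 0
static int example_map_range(struct dmar_domain *domain,
			     struct scatterlist *sglist,
			     unsigned long iov_pfn, unsigned long nr_pages)
{
	int ret;

	/* 1:1 map 256 VT-d pages starting at physical pfn 0xd0000 */
	ret = domain_pfn_mapping(domain, 0xd0000, 0xd0000, 0x100,
				 DMA_PTE_READ | DMA_PTE_WRITE);
	if (ret)
		return ret;

	/* map a scatterlist covering nr_pages VT-d pages at iov_pfn */
	return domain_sg_mapping(domain, iov_pfn, sglist, nr_pages,
				 DMA_PTE_READ | DMA_PTE_WRITE);
}
#endif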
1900
1901 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1902 {
1903         if (!iommu)
1904                 return;
1905
1906         clear_context_table(iommu, bus, devfn);
1907         iommu->flush.flush_context(iommu, 0, 0, 0,
1908                                            DMA_CCMD_GLOBAL_INVL);
1909         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1910 }
1911
1912 static inline void unlink_domain_info(struct device_domain_info *info)
1913 {
1914         assert_spin_locked(&device_domain_lock);
1915         list_del(&info->link);
1916         list_del(&info->global);
1917         if (info->dev)
1918                 info->dev->dev.archdata.iommu = NULL;
1919 }
1920
1921 static void domain_remove_dev_info(struct dmar_domain *domain)
1922 {
1923         struct device_domain_info *info;
1924         unsigned long flags;
1925         struct intel_iommu *iommu;
1926
1927         spin_lock_irqsave(&device_domain_lock, flags);
1928         while (!list_empty(&domain->devices)) {
1929                 info = list_entry(domain->devices.next,
1930                         struct device_domain_info, link);
1931                 unlink_domain_info(info);
1932                 spin_unlock_irqrestore(&device_domain_lock, flags);
1933
1934                 iommu_disable_dev_iotlb(info);
1935                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1936                 iommu_detach_dev(iommu, info->bus, info->devfn);
1937                 free_devinfo_mem(info);
1938
1939                 spin_lock_irqsave(&device_domain_lock, flags);
1940         }
1941         spin_unlock_irqrestore(&device_domain_lock, flags);
1942 }
1943
1944 /*
1945  * find_domain
1946  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1947  */
1948 static struct dmar_domain *
1949 find_domain(struct pci_dev *pdev)
1950 {
1951         struct device_domain_info *info;
1952
1953         /* No lock here, assumes no domain exit in normal case */
1954         info = pdev->dev.archdata.iommu;
1955         if (info)
1956                 return info->domain;
1957         return NULL;
1958 }
1959
1960 /* domain is initialized */
1961 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1962 {
1963         struct dmar_domain *domain, *found = NULL;
1964         struct intel_iommu *iommu;
1965         struct dmar_drhd_unit *drhd;
1966         struct device_domain_info *info, *tmp;
1967         struct pci_dev *dev_tmp;
1968         unsigned long flags;
1969         int bus = 0, devfn = 0;
1970         int segment;
1971         int ret;
1972
1973         domain = find_domain(pdev);
1974         if (domain)
1975                 return domain;
1976
1977         segment = pci_domain_nr(pdev->bus);
1978
1979         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1980         if (dev_tmp) {
1981                 if (pci_is_pcie(dev_tmp)) {
1982                         bus = dev_tmp->subordinate->number;
1983                         devfn = 0;
1984                 } else {
1985                         bus = dev_tmp->bus->number;
1986                         devfn = dev_tmp->devfn;
1987                 }
1988                 spin_lock_irqsave(&device_domain_lock, flags);
1989                 list_for_each_entry(info, &device_domain_list, global) {
1990                         if (info->segment == segment &&
1991                             info->bus == bus && info->devfn == devfn) {
1992                                 found = info->domain;
1993                                 break;
1994                         }
1995                 }
1996                 spin_unlock_irqrestore(&device_domain_lock, flags);
1997                 /* pcie-pci bridge already has a domain, use it */
1998                 if (found) {
1999                         domain = found;
2000                         goto found_domain;
2001                 }
2002         }
2003
2004         domain = alloc_domain();
2005         if (!domain)
2006                 goto error;
2007
2008         /* Allocate new domain for the device */
2009         drhd = dmar_find_matched_drhd_unit(pdev);
2010         if (!drhd) {
2011                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2012                         pci_name(pdev));
2013                 free_domain_mem(domain);
2014                 return NULL;
2015         }
2016         iommu = drhd->iommu;
2017
2018         ret = iommu_attach_domain(domain, iommu);
2019         if (ret) {
2020                 free_domain_mem(domain);
2021                 goto error;
2022         }
2023
2024         if (domain_init(domain, gaw)) {
2025                 domain_exit(domain);
2026                 goto error;
2027         }
2028
2029         /* register pcie-to-pci device */
2030         if (dev_tmp) {
2031                 info = alloc_devinfo_mem();
2032                 if (!info) {
2033                         domain_exit(domain);
2034                         goto error;
2035                 }
2036                 info->segment = segment;
2037                 info->bus = bus;
2038                 info->devfn = devfn;
2039                 info->dev = NULL;
2040                 info->domain = domain;
2041                 /* This domain is shared by devices under p2p bridge */
2042                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2043
2044                 /* pcie-to-pci bridge already has a domain, use it */
2045                 found = NULL;
2046                 spin_lock_irqsave(&device_domain_lock, flags);
2047                 list_for_each_entry(tmp, &device_domain_list, global) {
2048                         if (tmp->segment == segment &&
2049                             tmp->bus == bus && tmp->devfn == devfn) {
2050                                 found = tmp->domain;
2051                                 break;
2052                         }
2053                 }
2054                 if (found) {
2055                         spin_unlock_irqrestore(&device_domain_lock, flags);
2056                         free_devinfo_mem(info);
2057                         domain_exit(domain);
2058                         domain = found;
2059                 } else {
2060                         list_add(&info->link, &domain->devices);
2061                         list_add(&info->global, &device_domain_list);
2062                         spin_unlock_irqrestore(&device_domain_lock, flags);
2063                 }
2064         }
2065
2066 found_domain:
2067         info = alloc_devinfo_mem();
2068         if (!info)
2069                 goto error;
2070         info->segment = segment;
2071         info->bus = pdev->bus->number;
2072         info->devfn = pdev->devfn;
2073         info->dev = pdev;
2074         info->domain = domain;
2075         spin_lock_irqsave(&device_domain_lock, flags);
2076         /* somebody is fast */
2077         found = find_domain(pdev);
2078         if (found != NULL) {
2079                 spin_unlock_irqrestore(&device_domain_lock, flags);
2080                 if (found != domain) {
2081                         domain_exit(domain);
2082                         domain = found;
2083                 }
2084                 free_devinfo_mem(info);
2085                 return domain;
2086         }
2087         list_add(&info->link, &domain->devices);
2088         list_add(&info->global, &device_domain_list);
2089         pdev->dev.archdata.iommu = info;
2090         spin_unlock_irqrestore(&device_domain_lock, flags);
2091         return domain;
2092 error:
2093         /* recheck it here, maybe others set it */
2094         return find_domain(pdev);
2095 }
2096
2097 static int iommu_identity_mapping;
2098 #define IDENTMAP_ALL            1
2099 #define IDENTMAP_GFX            2
2100 #define IDENTMAP_AZALIA         4
2101
2102 static int iommu_domain_identity_map(struct dmar_domain *domain,
2103                                      unsigned long long start,
2104                                      unsigned long long end)
2105 {
2106         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2107         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2108
2109         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2110                           dma_to_mm_pfn(last_vpfn))) {
2111                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2112                 return -ENOMEM;
2113         }
2114
2115         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2116                  start, end, domain->id);
2117         /*
2118          * RMRR range might have overlap with physical memory range,
2119          * clear it first
2120          */
2121         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2122
2123         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2124                                   last_vpfn - first_vpfn + 1,
2125                                   DMA_PTE_READ|DMA_PTE_WRITE);
2126 }
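
/*
 * Worked example for iommu_domain_identity_map(), assuming
 * VTD_PAGE_SHIFT == 12 and a hypothetical RMRR of
 * start = 0xd0000000, end = 0xd00fffff:
 *
 *	first_vpfn = 0xd0000000 >> 12 = 0xd0000
 *	last_vpfn  = 0xd00fffff >> 12 = 0xd00ff
 *
 * so 0x100 (256) pages are reserved in the iova allocator and then mapped
 * 1:1 (virtual pfn == physical pfn) with read/write permission.
 */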
2127
2128 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2129                                       unsigned long long start,
2130                                       unsigned long long end)
2131 {
2132         struct dmar_domain *domain;
2133         int ret;
2134
2135         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2136         if (!domain)
2137                 return -ENOMEM;
2138
2139         /* For _hardware_ passthrough, don't bother. But for software
2140            passthrough, we do it anyway -- it may indicate a memory
2141            range which is reserved in E820 and so didn't get set
2142            up to start with in si_domain */
2143         if (domain == si_domain && hw_pass_through) {
2144                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2145                        pci_name(pdev), start, end);
2146                 return 0;
2147         }
2148
2149         printk(KERN_INFO
2150                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2151                pci_name(pdev), start, end);
2152
2153         if (end < start) {
2154                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2155                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2156                         dmi_get_system_info(DMI_BIOS_VENDOR),
2157                         dmi_get_system_info(DMI_BIOS_VERSION),
2158                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2159                 ret = -EIO;
2160                 goto error;
2161         }
2162
2163         if (end >> agaw_to_width(domain->agaw)) {
2164                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2165                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2166                      agaw_to_width(domain->agaw),
2167                      dmi_get_system_info(DMI_BIOS_VENDOR),
2168                      dmi_get_system_info(DMI_BIOS_VERSION),
2169                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2170                 ret = -EIO;
2171                 goto error;
2172         }
2173
2174         ret = iommu_domain_identity_map(domain, start, end);
2175         if (ret)
2176                 goto error;
2177
2178         /* context entry init */
2179         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2180         if (ret)
2181                 goto error;
2182
2183         return 0;
2184
2185  error:
2186         domain_exit(domain);
2187         return ret;
2188 }
2189
2190 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2191         struct pci_dev *pdev)
2192 {
2193         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2194                 return 0;
2195         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2196                 rmrr->end_address);
2197 }
2198
2199 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2200 static inline void iommu_prepare_isa(void)
2201 {
2202         struct pci_dev *pdev;
2203         int ret;
2204
2205         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2206         if (!pdev)
2207                 return;
2208
2209         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2210         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2211
2212         if (ret)
2213                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2214                        "floppy might not work\n");
2215
2216 }
2217 #else
2218 static inline void iommu_prepare_isa(void)
2219 {
2220         return;
2221 }
2222 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2223
2224 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2225
2226 static int __init si_domain_init(int hw)
2227 {
2228         struct dmar_drhd_unit *drhd;
2229         struct intel_iommu *iommu;
2230         int nid, ret = 0;
2231
2232         si_domain = alloc_domain();
2233         if (!si_domain)
2234                 return -EFAULT;
2235
2236         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2237
2238         for_each_active_iommu(iommu, drhd) {
2239                 ret = iommu_attach_domain(si_domain, iommu);
2240                 if (ret) {
2241                         domain_exit(si_domain);
2242                         return -EFAULT;
2243                 }
2244         }
2245
2246         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2247                 domain_exit(si_domain);
2248                 return -EFAULT;
2249         }
2250
2251         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2252
2253         if (hw)
2254                 return 0;
2255
2256         for_each_online_node(nid) {
2257                 unsigned long start_pfn, end_pfn;
2258                 int i;
2259
2260                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2261                         ret = iommu_domain_identity_map(si_domain,
2262                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2263                         if (ret)
2264                                 return ret;
2265                 }
2266         }
2267
2268         return 0;
2269 }
2270
2271 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2272                                           struct pci_dev *pdev);
2273 static int identity_mapping(struct pci_dev *pdev)
2274 {
2275         struct device_domain_info *info;
2276
2277         if (likely(!iommu_identity_mapping))
2278                 return 0;
2279
2280         info = pdev->dev.archdata.iommu;
2281         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2282                 return (info->domain == si_domain);
2283
2284         return 0;
2285 }
2286
2287 static int domain_add_dev_info(struct dmar_domain *domain,
2288                                struct pci_dev *pdev,
2289                                int translation)
2290 {
2291         struct device_domain_info *info;
2292         unsigned long flags;
2293         int ret;
2294
2295         info = alloc_devinfo_mem();
2296         if (!info)
2297                 return -ENOMEM;
2298
2299         info->segment = pci_domain_nr(pdev->bus);
2300         info->bus = pdev->bus->number;
2301         info->devfn = pdev->devfn;
2302         info->dev = pdev;
2303         info->domain = domain;
2304
2305         spin_lock_irqsave(&device_domain_lock, flags);
2306         list_add(&info->link, &domain->devices);
2307         list_add(&info->global, &device_domain_list);
2308         pdev->dev.archdata.iommu = info;
2309         spin_unlock_irqrestore(&device_domain_lock, flags);
2310
2311         ret = domain_context_mapping(domain, pdev, translation);
2312         if (ret) {
2313                 spin_lock_irqsave(&device_domain_lock, flags);
2314                 unlink_domain_info(info);
2315                 spin_unlock_irqrestore(&device_domain_lock, flags);
2316                 free_devinfo_mem(info);
2317                 return ret;
2318         }
2319
2320         return 0;
2321 }
2322
2323 static bool device_has_rmrr(struct pci_dev *dev)
2324 {
2325         struct dmar_rmrr_unit *rmrr;
2326         int i;
2327
2328         for_each_rmrr_units(rmrr) {
2329                 for (i = 0; i < rmrr->devices_cnt; i++) {
2330                         /*
2331                          * Return TRUE if this RMRR contains the device that
2332                          * is passed in.
2333                          */
2334                         if (rmrr->devices[i] == dev)
2335                                 return true;
2336                 }
2337         }
2338         return false;
2339 }
2340
2341 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2342 {
2343
2344         /*
2345          * We want to prevent any device associated with an RMRR from
2346          * getting placed into the SI Domain. This is done because
2347          * problems exist when devices are moved in and out of domains
2348          * and their respective RMRR info is lost. We exempt USB devices
2349          * from this process due to their usage of RMRRs that are known
2350          * to not be needed after BIOS hand-off to OS.
2351          */
2352         if (device_has_rmrr(pdev) &&
2353             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2354                 return 0;
2355
2356         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2357                 return 1;
2358
2359         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2360                 return 1;
2361
2362         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2363                 return 0;
2364
2365         /*
2366          * We want to start off with all devices in the 1:1 domain, and
2367          * take them out later if we find they can't access all of memory.
2368          *
2369          * However, we can't do this for PCI devices behind bridges,
2370          * because all PCI devices behind the same bridge will end up
2371          * with the same source-id on their transactions.
2372          *
2373          * Practically speaking, we can't change things around for these
2374          * devices at run-time, because we can't be sure there'll be no
2375          * DMA transactions in flight for any of their siblings.
2376          * 
2377          * So PCI devices (unless they're on the root bus) as well as
2378          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2379          * the 1:1 domain, just in _case_ one of their siblings turns out
2380          * not to be able to map all of memory.
2381          */
2382         if (!pci_is_pcie(pdev)) {
2383                 if (!pci_is_root_bus(pdev->bus))
2384                         return 0;
2385                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2386                         return 0;
2387         } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2388                 return 0;
2389
2390         /* 
2391          * At boot time, we don't yet know if devices will be 64-bit capable.
2392          * Assume that they will -- if they turn out not to be, then we can 
2393          * take them out of the 1:1 domain later.
2394          */
2395         if (!startup) {
2396                 /*
2397                  * If the device's dma_mask is less than the system's memory
2398                  * size then this is not a candidate for identity mapping.
2399                  */
2400                 u64 dma_mask = pdev->dma_mask;
2401
2402                 if (pdev->dev.coherent_dma_mask &&
2403                     pdev->dev.coherent_dma_mask < dma_mask)
2404                         dma_mask = pdev->dev.coherent_dma_mask;
2405
2406                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2407         }
2408
2409         return 1;
2410 }
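
/*
 * Illustrative numbers for the run-time (!startup) check above.  On a
 * machine with 8GiB of RAM, dma_get_required_mask() is roughly
 * DMA_BIT_MASK(33).  A device whose dma_mask (and coherent_dma_mask) is
 * only DMA_BIT_MASK(32) fails the comparison and is not a candidate for
 * the 1:1 domain, since identity-mapped DMA would have to be able to
 * reach all of memory.  A device with a 64-bit mask passes and may stay
 * in (or be moved into) si_domain.
 */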
2411
2412 static int __init iommu_prepare_static_identity_mapping(int hw)
2413 {
2414         struct pci_dev *pdev = NULL;
2415         int ret;
2416
2417         ret = si_domain_init(hw);
2418         if (ret)
2419                 return -EFAULT;
2420
2421         for_each_pci_dev(pdev) {
2422                 if (iommu_should_identity_map(pdev, 1)) {
2423                         ret = domain_add_dev_info(si_domain, pdev,
2424                                              hw ? CONTEXT_TT_PASS_THROUGH :
2425                                                   CONTEXT_TT_MULTI_LEVEL);
2426                         if (ret) {
2427                                 /* device not associated with an iommu */
2428                                 if (ret == -ENODEV)
2429                                         continue;
2430                                 return ret;
2431                         }
2432                         pr_info("IOMMU: %s identity mapping for device %s\n",
2433                                 hw ? "hardware" : "software", pci_name(pdev));
2434                 }
2435         }
2436
2437         return 0;
2438 }
2439
2440 static int __init init_dmars(void)
2441 {
2442         struct dmar_drhd_unit *drhd;
2443         struct dmar_rmrr_unit *rmrr;
2444         struct pci_dev *pdev;
2445         struct intel_iommu *iommu;
2446         int i, ret;
2447
2448         /*
2449          * for each drhd
2450          *    allocate root
2451          *    initialize and program root entry to not present
2452          * endfor
2453          */
2454         for_each_drhd_unit(drhd) {
2455                 /*
2456                  * lock not needed as this is only incremented in the
2457                  * single-threaded kernel __init code path; all other
2458                  * accesses are read only
2459                  */
2460                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2461                         g_num_of_iommus++;
2462                         continue;
2463                 }
2464                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2465                           IOMMU_UNITS_SUPPORTED);
2466         }
2467
2468         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2469                         GFP_KERNEL);
2470         if (!g_iommus) {
2471                 printk(KERN_ERR "Allocating global iommu array failed\n");
2472                 ret = -ENOMEM;
2473                 goto error;
2474         }
2475
2476         deferred_flush = kzalloc(g_num_of_iommus *
2477                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2478         if (!deferred_flush) {
2479                 ret = -ENOMEM;
2480                 goto error;
2481         }
2482
2483         for_each_drhd_unit(drhd) {
2484                 if (drhd->ignored)
2485                         continue;
2486
2487                 iommu = drhd->iommu;
2488                 g_iommus[iommu->seq_id] = iommu;
2489
2490                 ret = iommu_init_domains(iommu);
2491                 if (ret)
2492                         goto error;
2493
2494                 /*
2495                  * TBD:
2496                  * we could share the same root & context tables
2497                  * among all IOMMUs. This needs to be split later.
2498                  */
2499                 ret = iommu_alloc_root_entry(iommu);
2500                 if (ret) {
2501                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2502                         goto error;
2503                 }
2504                 if (!ecap_pass_through(iommu->ecap))
2505                         hw_pass_through = 0;
2506         }
2507
2508         /*
2509          * Start from a sane iommu hardware state.
2510          */
2511         for_each_drhd_unit(drhd) {
2512                 if (drhd->ignored)
2513                         continue;
2514
2515                 iommu = drhd->iommu;
2516
2517                 /*
2518                  * If the queued invalidation is already initialized by us
2519                  * (for example, while enabling interrupt-remapping) then
2520                  * things are already rolling from a sane state.
2521                  */
2522                 if (iommu->qi)
2523                         continue;
2524
2525                 /*
2526                  * Clear any previous faults.
2527                  */
2528                 dmar_fault(-1, iommu);
2529                 /*
2530                  * Disable queued invalidation if supported and already enabled
2531                  * before OS handover.
2532                  */
2533                 dmar_disable_qi(iommu);
2534         }
2535
2536         for_each_drhd_unit(drhd) {
2537                 if (drhd->ignored)
2538                         continue;
2539
2540                 iommu = drhd->iommu;
2541
2542                 if (dmar_enable_qi(iommu)) {
2543                         /*
2544                          * Queued Invalidation is not enabled; fall back to
2545                          * Register Based Invalidation
2546                          */
2547                         iommu->flush.flush_context = __iommu_flush_context;
2548                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2549                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2550                                "invalidation\n",
2551                                 iommu->seq_id,
2552                                (unsigned long long)drhd->reg_base_addr);
2553                 } else {
2554                         iommu->flush.flush_context = qi_flush_context;
2555                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2556                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2557                                "invalidation\n",
2558                                 iommu->seq_id,
2559                                (unsigned long long)drhd->reg_base_addr);
2560                 }
2561         }
2562
2563         if (iommu_pass_through)
2564                 iommu_identity_mapping |= IDENTMAP_ALL;
2565
2566 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2567         iommu_identity_mapping |= IDENTMAP_GFX;
2568 #endif
2569
2570         check_tylersburg_isoch();
2571
2572         /*
2573          * If pass through is not set or not enabled, set up context entries
2574          * for identity mappings for rmrr, gfx and isa, and possibly fall back
2575          * to static identity mapping if iommu_identity_mapping is set.
2576          */
2577         if (iommu_identity_mapping) {
2578                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2579                 if (ret) {
2580                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2581                         goto error;
2582                 }
2583         }
2584         /*
2585          * For each rmrr
2586          *   for each dev attached to rmrr
2587          *   do
2588          *     locate drhd for dev, alloc domain for dev
2589          *     allocate free domain
2590          *     allocate page table entries for rmrr
2591          *     if context not allocated for bus
2592          *           allocate and init context
2593          *           set present in root table for this bus
2594          *     init context with domain, translation etc
2595          *    endfor
2596          * endfor
2597          */
2598         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2599         for_each_rmrr_units(rmrr) {
2600                 for (i = 0; i < rmrr->devices_cnt; i++) {
2601                         pdev = rmrr->devices[i];
2602                         /*
2603                          * some BIOSes list non-existent devices in the
2604                          * DMAR table.
2605                          */
2606                         if (!pdev)
2607                                 continue;
2608                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2609                         if (ret)
2610                                 printk(KERN_ERR
2611                                        "IOMMU: mapping reserved region failed\n");
2612                 }
2613         }
2614
2615         iommu_prepare_isa();
2616
2617         /*
2618          * for each drhd
2619          *   enable fault log
2620          *   global invalidate context cache
2621          *   global invalidate iotlb
2622          *   enable translation
2623          */
2624         for_each_drhd_unit(drhd) {
2625                 if (drhd->ignored) {
2626                         /*
2627                          * we always have to disable PMRs or DMA may fail on
2628                          * this device
2629                          */
2630                         if (force_on)
2631                                 iommu_disable_protect_mem_regions(drhd->iommu);
2632                         continue;
2633                 }
2634                 iommu = drhd->iommu;
2635
2636                 iommu_flush_write_buffer(iommu);
2637
2638                 ret = dmar_set_interrupt(iommu);
2639                 if (ret)
2640                         goto error;
2641
2642                 iommu_set_root_entry(iommu);
2643
2644                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2645                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2646
2647                 ret = iommu_enable_translation(iommu);
2648                 if (ret)
2649                         goto error;
2650
2651                 iommu_disable_protect_mem_regions(iommu);
2652         }
2653
2654         return 0;
2655 error:
2656         for_each_drhd_unit(drhd) {
2657                 if (drhd->ignored)
2658                         continue;
2659                 iommu = drhd->iommu;
2660                 free_iommu(iommu);
2661         }
2662         kfree(g_iommus);
2663         return ret;
2664 }
2665
2666 /* This takes a number of _MM_ pages, not VTD pages */
2667 static struct iova *intel_alloc_iova(struct device *dev,
2668                                      struct dmar_domain *domain,
2669                                      unsigned long nrpages, uint64_t dma_mask)
2670 {
2671         struct pci_dev *pdev = to_pci_dev(dev);
2672         struct iova *iova = NULL;
2673
2674         /* Restrict dma_mask to the width that the iommu can handle */
2675         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2676
2677         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2678                 /*
2679                  * First try to allocate an io virtual address in
2680                  * DMA_BIT_MASK(32), and if that fails then try allocating
2681                  * from the higher range
2682                  */
2683                 iova = alloc_iova(&domain->iovad, nrpages,
2684                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2685                 if (iova)
2686                         return iova;
2687         }
2688         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2689         if (unlikely(!iova)) {
2690                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2691                        nrpages, pci_name(pdev));
2692                 return NULL;
2693         }
2694
2695         return iova;
2696 }
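
/*
 * Illustrative example for intel_alloc_iova(), with assumed values: a
 * device with dma_mask == DMA_BIT_MASK(64) in a domain with gaw == 48
 * first has its mask clamped to DOMAIN_MAX_ADDR(48).  Unless
 * dmar_forcedac is set, the allocator then tries to place the range below
 * 4GiB so that plain 32-bit (non-DAC) addresses can be used, and only
 * falls back to the full clamped range when the low space is exhausted.
 */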
2697
2698 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2699 {
2700         struct dmar_domain *domain;
2701         int ret;
2702
2703         domain = get_domain_for_dev(pdev,
2704                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2705         if (!domain) {
2706                 printk(KERN_ERR
2707                         "Allocating domain for %s failed\n", pci_name(pdev));
2708                 return NULL;
2709         }
2710
2711         /* make sure context mapping is ok */
2712         if (unlikely(!domain_context_mapped(pdev))) {
2713                 ret = domain_context_mapping(domain, pdev,
2714                                              CONTEXT_TT_MULTI_LEVEL);
2715                 if (ret) {
2716                         printk(KERN_ERR
2717                                 "Domain context map for %s failed\n",
2718                                 pci_name(pdev));
2719                         return NULL;
2720                 }
2721         }
2722
2723         return domain;
2724 }
2725
2726 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2727 {
2728         struct device_domain_info *info;
2729
2730         /* No lock here, assumes no domain exit in normal case */
2731         info = dev->dev.archdata.iommu;
2732         if (likely(info))
2733                 return info->domain;
2734
2735         return __get_valid_domain_for_dev(dev);
2736 }
2737
2738 static int iommu_dummy(struct pci_dev *pdev)
2739 {
2740         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2741 }
2742
2743 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2744 static int iommu_no_mapping(struct device *dev)
2745 {
2746         struct pci_dev *pdev;
2747         int found;
2748
2749         if (unlikely(dev->bus != &pci_bus_type))
2750                 return 1;
2751
2752         pdev = to_pci_dev(dev);
2753         if (iommu_dummy(pdev))
2754                 return 1;
2755
2756         if (!iommu_identity_mapping)
2757                 return 0;
2758
2759         found = identity_mapping(pdev);
2760         if (found) {
2761                 if (iommu_should_identity_map(pdev, 0))
2762                         return 1;
2763                 else {
2764                         /*
2765                          * A 32 bit DMA device is removed from si_domain and
2766                          * falls back to non-identity mapping.
2767                          */
2768                         domain_remove_one_dev_info(si_domain, pdev);
2769                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2770                                pci_name(pdev));
2771                         return 0;
2772                 }
2773         } else {
2774                 /*
2775                  * If a 64 bit DMA device has been detached from a vm, the device
2776                  * is put into si_domain for identity mapping.
2777                  */
2778                 if (iommu_should_identity_map(pdev, 0)) {
2779                         int ret;
2780                         ret = domain_add_dev_info(si_domain, pdev,
2781                                                   hw_pass_through ?
2782                                                   CONTEXT_TT_PASS_THROUGH :
2783                                                   CONTEXT_TT_MULTI_LEVEL);
2784                         if (!ret) {
2785                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2786                                        pci_name(pdev));
2787                                 return 1;
2788                         }
2789                 }
2790         }
2791
2792         return 0;
2793 }
2794
2795 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2796                                      size_t size, int dir, u64 dma_mask)
2797 {
2798         struct pci_dev *pdev = to_pci_dev(hwdev);
2799         struct dmar_domain *domain;
2800         phys_addr_t start_paddr;
2801         struct iova *iova;
2802         int prot = 0;
2803         int ret;
2804         struct intel_iommu *iommu;
2805         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2806
2807         BUG_ON(dir == DMA_NONE);
2808
2809         if (iommu_no_mapping(hwdev))
2810                 return paddr;
2811
2812         domain = get_valid_domain_for_dev(pdev);
2813         if (!domain)
2814                 return 0;
2815
2816         iommu = domain_get_iommu(domain);
2817         size = aligned_nrpages(paddr, size);
2818
2819         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2820         if (!iova)
2821                 goto error;
2822
2823         /*
2824          * Check if DMAR supports zero-length reads on write only
2825          * mappings.
2826          */
2827         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2828                         !cap_zlr(iommu->cap))
2829                 prot |= DMA_PTE_READ;
2830         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2831                 prot |= DMA_PTE_WRITE;
2832         /*
2833          * paddr to (paddr + size) might cover only part of a page, so we map
2834          * the whole page.  Note: if two parts of one page are mapped
2835          * separately, we might have two guest addresses mapping to the same
2836          * host paddr, but this is not a big problem
2837          */
2838         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2839                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2840         if (ret)
2841                 goto error;
2842
2843         /* it's a non-present to present mapping. Only flush if caching mode */
2844         if (cap_caching_mode(iommu->cap))
2845                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2846         else
2847                 iommu_flush_write_buffer(iommu);
2848
2849         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2850         start_paddr += paddr & ~PAGE_MASK;
2851         return start_paddr;
2852
2853 error:
2854         if (iova)
2855                 __free_iova(&domain->iovad, iova);
2856         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2857                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2858         return 0;
2859 }
2860
2861 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2862                                  unsigned long offset, size_t size,
2863                                  enum dma_data_direction dir,
2864                                  struct dma_attrs *attrs)
2865 {
2866         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2867                                   dir, to_pci_dev(dev)->dma_mask);
2868 }
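
/*
 * Minimal usage sketch, compiled out and with hypothetical names: a
 * driver does not call intel_map_page() directly, it uses the generic DMA
 * API, which dispatches here through this file's dma_map_ops.
 */
#if 0
static int example_dma_map_one_page(struct pci_dev *pdev, struct page *page)
{
	dma_addr_t handle;

	handle = dma_map_page(&pdev->dev, page, 0, PAGE_SIZE, DMA_TO_DEVICE);
	if (dma_mapping_error(&pdev->dev, handle))
		return -ENOMEM;		/* no IOVA/PTEs were set up */

	/* ... let the device DMA to/from "handle" here ... */

	dma_unmap_page(&pdev->dev, handle, PAGE_SIZE, DMA_TO_DEVICE);
	return 0;
}
#endif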
2869
2870 static void flush_unmaps(void)
2871 {
2872         int i, j;
2873
2874         timer_on = 0;
2875
2876         /* just flush them all */
2877         for (i = 0; i < g_num_of_iommus; i++) {
2878                 struct intel_iommu *iommu = g_iommus[i];
2879                 if (!iommu)
2880                         continue;
2881
2882                 if (!deferred_flush[i].next)
2883                         continue;
2884
2885                 /* In caching mode, global flushes make emulation expensive */
2886                 if (!cap_caching_mode(iommu->cap))
2887                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2888                                          DMA_TLB_GLOBAL_FLUSH);
2889                 for (j = 0; j < deferred_flush[i].next; j++) {
2890                         unsigned long mask;
2891                         struct iova *iova = deferred_flush[i].iova[j];
2892                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2893
2894                         /* On real hardware multiple invalidations are expensive */
2895                         if (cap_caching_mode(iommu->cap))
2896                                 iommu_flush_iotlb_psi(iommu, domain->id,
2897                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2898                         else {
2899                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2900                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2901                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2902                         }
2903                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2904                 }
2905                 deferred_flush[i].next = 0;
2906         }
2907
2908         list_size = 0;
2909 }
2910
2911 static void flush_unmaps_timeout(unsigned long data)
2912 {
2913         unsigned long flags;
2914
2915         spin_lock_irqsave(&async_umap_flush_lock, flags);
2916         flush_unmaps();
2917         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2918 }
2919
2920 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2921 {
2922         unsigned long flags;
2923         int next, iommu_id;
2924         struct intel_iommu *iommu;
2925
2926         spin_lock_irqsave(&async_umap_flush_lock, flags);
2927         if (list_size == HIGH_WATER_MARK)
2928                 flush_unmaps();
2929
2930         iommu = domain_get_iommu(dom);
2931         iommu_id = iommu->seq_id;
2932
2933         next = deferred_flush[iommu_id].next;
2934         deferred_flush[iommu_id].domain[next] = dom;
2935         deferred_flush[iommu_id].iova[next] = iova;
2936         deferred_flush[iommu_id].next++;
2937
2938         if (!timer_on) {
2939                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2940                 timer_on = 1;
2941         }
2942         list_size++;
2943         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2944 }
2945
2946 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2947                              size_t size, enum dma_data_direction dir,
2948                              struct dma_attrs *attrs)
2949 {
2950         struct pci_dev *pdev = to_pci_dev(dev);
2951         struct dmar_domain *domain;
2952         unsigned long start_pfn, last_pfn;
2953         struct iova *iova;
2954         struct intel_iommu *iommu;
2955
2956         if (iommu_no_mapping(dev))
2957                 return;
2958
2959         domain = find_domain(pdev);
2960         BUG_ON(!domain);
2961
2962         iommu = domain_get_iommu(domain);
2963
2964         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2965         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2966                       (unsigned long long)dev_addr))
2967                 return;
2968
2969         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2970         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2971
2972         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2973                  pci_name(pdev), start_pfn, last_pfn);
2974
2975         /* clear the whole range */
2976         dma_pte_clear_range(domain, start_pfn, last_pfn);
2977
2978         /* free page tables */
2979         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2980
2981         if (intel_iommu_strict) {
2982                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2983                                       last_pfn - start_pfn + 1, 0);
2984                 /* free iova */
2985                 __free_iova(&domain->iovad, iova);
2986         } else {
2987                 add_unmap(domain, iova);
2988                 /*
2989                  * Queue up the release of the unmap to avoid the iotlb
2990                  * flush, which otherwise eats roughly 1/6th of a CPU...
2991                  */
2992         }
2993 }
2994
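     /*
      * DMA API coherent allocation: allocate and zero pages (falling back to
      * GFP_DMA/GFP_DMA32 only for identity-mapped devices with a narrow
      * coherent mask) and map them bidirectionally via __intel_map_single().
      */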
2995 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2996                                   dma_addr_t *dma_handle, gfp_t flags,
2997                                   struct dma_attrs *attrs)
2998 {
2999         void *vaddr;
3000         int order;
3001
3002         size = PAGE_ALIGN(size);
3003         order = get_order(size);
3004
3005         if (!iommu_no_mapping(hwdev))
3006                 flags &= ~(GFP_DMA | GFP_DMA32);
3007         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3008                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3009                         flags |= GFP_DMA;
3010                 else
3011                         flags |= GFP_DMA32;
3012         }
3013
3014         vaddr = (void *)__get_free_pages(flags, order);
3015         if (!vaddr)
3016                 return NULL;
3017         memset(vaddr, 0, size);
3018
3019         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3020                                          DMA_BIDIRECTIONAL,
3021                                          hwdev->coherent_dma_mask);
3022         if (*dma_handle)
3023                 return vaddr;
3024         free_pages((unsigned long)vaddr, order);
3025         return NULL;
3026 }
3027
3028 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3029                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3030 {
3031         int order;
3032
3033         size = PAGE_ALIGN(size);
3034         order = get_order(size);
3035
3036         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3037         free_pages((unsigned long)vaddr, order);
3038 }
3039
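     /*
      * Scatterlist counterpart of intel_unmap_page(): the whole list shares
      * one IOVA allocation, so tear down the range starting at the first
      * element's dma_address.
      */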
3040 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3041                            int nelems, enum dma_data_direction dir,
3042                            struct dma_attrs *attrs)
3043 {
3044         struct pci_dev *pdev = to_pci_dev(hwdev);
3045         struct dmar_domain *domain;
3046         unsigned long start_pfn, last_pfn;
3047         struct iova *iova;
3048         struct intel_iommu *iommu;
3049
3050         if (iommu_no_mapping(hwdev))
3051                 return;
3052
3053         domain = find_domain(pdev);
3054         BUG_ON(!domain);
3055
3056         iommu = domain_get_iommu(domain);
3057
3058         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3059         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3060                       (unsigned long long)sglist[0].dma_address))
3061                 return;
3062
3063         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3064         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3065
3066         /* clear the PTEs covering the whole range */
3067         dma_pte_clear_range(domain, start_pfn, last_pfn);
3068
3069         /* free page tables */
3070         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3071
3072         if (intel_iommu_strict) {
3073                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3074                                       last_pfn - start_pfn + 1, 0);
3075                 /* free iova */
3076                 __free_iova(&domain->iovad, iova);
3077         } else {
3078                 add_unmap(domain, iova);
3079                 /*
3080                  * Queue up the release of the unmap to avoid the iotlb
3081                  * flush, which otherwise eats roughly 1/6th of a CPU...
3082                  */
3083         }
3084 }
3085
3086 static int intel_nontranslate_map_sg(struct device *hwdev,
3087         struct scatterlist *sglist, int nelems, int dir)
3088 {
3089         int i;
3090         struct scatterlist *sg;
3091
3092         for_each_sg(sglist, sg, nelems, i) {
3093                 BUG_ON(!sg_page(sg));
3094                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3095                 sg->dma_length = sg->length;
3096         }
3097         return nelems;
3098 }
3099
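     /*
      * DMA API map_sg: allocate one IOVA range large enough for the whole
      * scatterlist, map it with domain_sg_mapping(), then flush the IOTLB
      * when caching mode is set (otherwise just the write buffer).  Returns
      * the number of mapped elements, or 0 on failure.
      */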
3100 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3101                         enum dma_data_direction dir, struct dma_attrs *attrs)
3102 {
3103         int i;
3104         struct pci_dev *pdev = to_pci_dev(hwdev);
3105         struct dmar_domain *domain;
3106         size_t size = 0;
3107         int prot = 0;
3108         struct iova *iova = NULL;
3109         int ret;
3110         struct scatterlist *sg;
3111         unsigned long start_vpfn;
3112         struct intel_iommu *iommu;
3113
3114         BUG_ON(dir == DMA_NONE);
3115         if (iommu_no_mapping(hwdev))
3116                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3117
3118         domain = get_valid_domain_for_dev(pdev);
3119         if (!domain)
3120                 return 0;
3121
3122         iommu = domain_get_iommu(domain);
3123
3124         for_each_sg(sglist, sg, nelems, i)
3125                 size += aligned_nrpages(sg->offset, sg->length);
3126
3127         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3128                                 pdev->dma_mask);
3129         if (!iova) {
3130                 sglist->dma_length = 0;
3131                 return 0;
3132         }
3133
3134         /*
3135          * Check if DMAR supports zero-length reads on write only
3136          * mappings.
3137          */
3138         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3139                         !cap_zlr(iommu->cap))
3140                 prot |= DMA_PTE_READ;
3141         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3142                 prot |= DMA_PTE_WRITE;
3143
3144         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3145
3146         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3147         if (unlikely(ret)) {
3148                 /*  clear the page */
3149                 dma_pte_clear_range(domain, start_vpfn,
3150                                     start_vpfn + size - 1);
3151                 /* free page tables */
3152                 dma_pte_free_pagetable(domain, start_vpfn,
3153                                        start_vpfn + size - 1);
3154                 /* free iova */
3155                 __free_iova(&domain->iovad, iova);
3156                 return 0;
3157         }
3158
3159         /* it's a non-present to present mapping. Only flush if caching mode */
3160         if (cap_caching_mode(iommu->cap))
3161                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3162         else
3163                 iommu_flush_write_buffer(iommu);
3164
3165         return nelems;
3166 }
3167
3168 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3169 {
3170         return !dma_addr;
3171 }
3172
3173 struct dma_map_ops intel_dma_ops = {
3174         .alloc = intel_alloc_coherent,
3175         .free = intel_free_coherent,
3176         .map_sg = intel_map_sg,
3177         .unmap_sg = intel_unmap_sg,
3178         .map_page = intel_map_page,
3179         .unmap_page = intel_unmap_page,
3180         .mapping_error = intel_mapping_error,
3181 };
3182
3183 static inline int iommu_domain_cache_init(void)
3184 {
3185         int ret = 0;
3186
3187         iommu_domain_cache = kmem_cache_create("iommu_domain",
3188                                          sizeof(struct dmar_domain),
3189                                          0,
3190                                          SLAB_HWCACHE_ALIGN,
3191                                          NULL);
3193         if (!iommu_domain_cache) {
3194                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3195                 ret = -ENOMEM;
3196         }
3197
3198         return ret;
3199 }
3200
3201 static inline int iommu_devinfo_cache_init(void)
3202 {
3203         int ret = 0;
3204
3205         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3206                                          sizeof(struct device_domain_info),
3207                                          0,
3208                                          SLAB_HWCACHE_ALIGN,
3209                                          NULL);
3210         if (!iommu_devinfo_cache) {
3211                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3212                 ret = -ENOMEM;
3213         }
3214
3215         return ret;
3216 }
3217
3218 static inline int iommu_iova_cache_init(void)
3219 {
3220         int ret = 0;
3221
3222         iommu_iova_cache = kmem_cache_create("iommu_iova",
3223                                          sizeof(struct iova),
3224                                          0,
3225                                          SLAB_HWCACHE_ALIGN,
3226                                          NULL);
3227         if (!iommu_iova_cache) {
3228                 printk(KERN_ERR "Couldn't create iova cache\n");
3229                 ret = -ENOMEM;
3230         }
3231
3232         return ret;
3233 }
3234
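     /*
      * Create the iova, domain and devinfo slab caches, tearing down the
      * caches already created if a later one fails.
      */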
3235 static int __init iommu_init_mempool(void)
3236 {
3237         int ret;
3238         ret = iommu_iova_cache_init();
3239         if (ret)
3240                 return ret;
3241
3242         ret = iommu_domain_cache_init();
3243         if (ret)
3244                 goto domain_error;
3245
3246         ret = iommu_devinfo_cache_init();
3247         if (!ret)
3248                 return ret;
3249
3250         kmem_cache_destroy(iommu_domain_cache);
3251 domain_error:
3252         kmem_cache_destroy(iommu_iova_cache);
3253
3254         return -ENOMEM;
3255 }
3256
3257 static void __init iommu_exit_mempool(void)
3258 {
3259         kmem_cache_destroy(iommu_devinfo_cache);
3260         kmem_cache_destroy(iommu_domain_cache);
3261         kmem_cache_destroy(iommu_iova_cache);
3262
3263 }
3264
3265 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3266 {
3267         struct dmar_drhd_unit *drhd;
3268         u32 vtbar;
3269         int rc;
3270
3271         /* We know that this device on this chipset has its own IOMMU.
3272          * If we find it under a different IOMMU, then the BIOS is lying
3273          * to us. Hope that the IOMMU for this device is actually
3274          * disabled, and it needs no translation...
3275          */
3276         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3277         if (rc) {
3278                 /* "can't" happen */
3279                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3280                 return;
3281         }
3282         vtbar &= 0xffff0000;
3283
3284         /* we know that this iommu should be at offset 0xa000 from vtbar */
3285         drhd = dmar_find_matched_drhd_unit(pdev);
3286         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3287                             TAINT_FIRMWARE_WORKAROUND,
3288                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3289                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3290 }
3291 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3292
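     /*
      * Mark DRHD units that cover no PCI devices as ignored.  Units that
      * cover *only* graphics devices are either left active (dmar_map_gfx)
      * or ignored, with their devices flagged DUMMY_DEVICE_DOMAIN_INFO so
      * they bypass translation.
      */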
3293 static void __init init_no_remapping_devices(void)
3294 {
3295         struct dmar_drhd_unit *drhd;
3296
3297         for_each_drhd_unit(drhd) {
3298                 if (!drhd->include_all) {
3299                         int i;
3300                         for (i = 0; i < drhd->devices_cnt; i++)
3301                                 if (drhd->devices[i] != NULL)
3302                                         break;
3303                         /* ignore DMAR unit if no pci devices exist */
3304                         if (i == drhd->devices_cnt)
3305                                 drhd->ignored = 1;
3306                 }
3307         }
3308
3309         for_each_drhd_unit(drhd) {
3310                 int i;
3311                 if (drhd->ignored || drhd->include_all)
3312                         continue;
3313
3314                 for (i = 0; i < drhd->devices_cnt; i++)
3315                         if (drhd->devices[i] &&
3316                             !IS_GFX_DEVICE(drhd->devices[i]))
3317                                 break;
3318
3319                 if (i < drhd->devices_cnt)
3320                         continue;
3321
3322                 /* This IOMMU has *only* gfx devices. Either bypass it or
3323                    set the gfx_mapped flag, as appropriate */
3324                 if (dmar_map_gfx) {
3325                         intel_iommu_gfx_mapped = 1;
3326                 } else {
3327                         drhd->ignored = 1;
3328                         for (i = 0; i < drhd->devices_cnt; i++) {
3329                                 if (!drhd->devices[i])
3330                                         continue;
3331                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3332                         }
3333                 }
3334         }
3335 }
3336
3337 #ifdef CONFIG_SUSPEND
3338 static int init_iommu_hw(void)
3339 {
3340         struct dmar_drhd_unit *drhd;
3341         struct intel_iommu *iommu = NULL;
3342
3343         for_each_active_iommu(iommu, drhd)
3344                 if (iommu->qi)
3345                         dmar_reenable_qi(iommu);
3346
3347         for_each_iommu(iommu, drhd) {
3348                 if (drhd->ignored) {
3349                         /*
3350                          * we always have to disable PMRs or DMA may fail on
3351                          * this device
3352                          */
3353                         if (force_on)
3354                                 iommu_disable_protect_mem_regions(iommu);
3355                         continue;
3356                 }
3357
3358                 iommu_flush_write_buffer(iommu);
3359
3360                 iommu_set_root_entry(iommu);
3361
3362                 iommu->flush.flush_context(iommu, 0, 0, 0,
3363                                            DMA_CCMD_GLOBAL_INVL);
3364                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3365                                          DMA_TLB_GLOBAL_FLUSH);
3366                 if (iommu_enable_translation(iommu))
3367                         return 1;
3368                 iommu_disable_protect_mem_regions(iommu);
3369         }
3370
3371         return 0;
3372 }
3373
3374 static void iommu_flush_all(void)
3375 {
3376         struct dmar_drhd_unit *drhd;
3377         struct intel_iommu *iommu;
3378
3379         for_each_active_iommu(iommu, drhd) {
3380                 iommu->flush.flush_context(iommu, 0, 0, 0,
3381                                            DMA_CCMD_GLOBAL_INVL);
3382                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3383                                          DMA_TLB_GLOBAL_FLUSH);
3384         }
3385 }
3386
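     /*
      * System suspend: flush everything, disable translation, and save the
      * fault-event control/data/address registers of every active IOMMU so
      * that iommu_resume() can restore them.
      */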
3387 static int iommu_suspend(void)
3388 {
3389         struct dmar_drhd_unit *drhd;
3390         struct intel_iommu *iommu = NULL;
3391         unsigned long flag;
3392
3393         for_each_active_iommu(iommu, drhd) {
3394                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3395                                                  GFP_ATOMIC);
3396                 if (!iommu->iommu_state)
3397                         goto nomem;
3398         }
3399
3400         iommu_flush_all();
3401
3402         for_each_active_iommu(iommu, drhd) {
3403                 iommu_disable_translation(iommu);
3404
3405                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3406
3407                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3408                         readl(iommu->reg + DMAR_FECTL_REG);
3409                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3410                         readl(iommu->reg + DMAR_FEDATA_REG);
3411                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3412                         readl(iommu->reg + DMAR_FEADDR_REG);
3413                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3414                         readl(iommu->reg + DMAR_FEUADDR_REG);
3415
3416                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3417         }
3418         return 0;
3419
3420 nomem:
3421         for_each_active_iommu(iommu, drhd)
3422                 kfree(iommu->iommu_state);
3423
3424         return -ENOMEM;
3425 }
3426
3427 static void iommu_resume(void)
3428 {
3429         struct dmar_drhd_unit *drhd;
3430         struct intel_iommu *iommu = NULL;
3431         unsigned long flag;
3432
3433         if (init_iommu_hw()) {
3434                 if (force_on)
3435                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3436                 else
3437                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3438                 return;
3439         }
3440
3441         for_each_active_iommu(iommu, drhd) {
3442
3443                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3444
3445                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3446                         iommu->reg + DMAR_FECTL_REG);
3447                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3448                         iommu->reg + DMAR_FEDATA_REG);
3449                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3450                         iommu->reg + DMAR_FEADDR_REG);
3451                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3452                         iommu->reg + DMAR_FEUADDR_REG);
3453
3454                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3455         }
3456
3457         for_each_active_iommu(iommu, drhd)
3458                 kfree(iommu->iommu_state);
3459 }
3460
3461 static struct syscore_ops iommu_syscore_ops = {
3462         .resume         = iommu_resume,
3463         .suspend        = iommu_suspend,
3464 };
3465
3466 static void __init init_iommu_pm_ops(void)
3467 {
3468         register_syscore_ops(&iommu_syscore_ops);
3469 }
3470
3471 #else
3472 static inline void init_iommu_pm_ops(void) {}
3473 #endif  /* CONFIG_SUSPEND */
3474
3475 LIST_HEAD(dmar_rmrr_units);
3476
3477 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3478 {
3479         list_add(&rmrr->list, &dmar_rmrr_units);
3480 }
3481
3482
3483 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3484 {
3485         struct acpi_dmar_reserved_memory *rmrr;
3486         struct dmar_rmrr_unit *rmrru;
3487
3488         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3489         if (!rmrru)
3490                 return -ENOMEM;
3491
3492         rmrru->hdr = header;
3493         rmrr = (struct acpi_dmar_reserved_memory *)header;
3494         rmrru->base_address = rmrr->base_address;
3495         rmrru->end_address = rmrr->end_address;
3496
3497         dmar_register_rmrr_unit(rmrru);
3498         return 0;
3499 }
3500
3501 static int __init
3502 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3503 {
3504         struct acpi_dmar_reserved_memory *rmrr;
3505         int ret;
3506
3507         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3508         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3509                 ((void *)rmrr) + rmrr->header.length,
3510                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3511
3512         if (ret || (rmrru->devices_cnt == 0)) {
3513                 list_del(&rmrru->list);
3514                 kfree(rmrru);
3515         }
3516         return ret;
3517 }
3518
3519 static LIST_HEAD(dmar_atsr_units);
3520
3521 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3522 {
3523         struct acpi_dmar_atsr *atsr;
3524         struct dmar_atsr_unit *atsru;
3525
3526         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3527         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3528         if (!atsru)
3529                 return -ENOMEM;
3530
3531         atsru->hdr = hdr;
3532         atsru->include_all = atsr->flags & 0x1;
3533
3534         list_add(&atsru->list, &dmar_atsr_units);
3535
3536         return 0;
3537 }
3538
3539 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3540 {
3541         int rc;
3542         struct acpi_dmar_atsr *atsr;
3543
3544         if (atsru->include_all)
3545                 return 0;
3546
3547         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3548         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3549                                 (void *)atsr + atsr->header.length,
3550                                 &atsru->devices_cnt, &atsru->devices,
3551                                 atsr->segment);
3552         if (rc || !atsru->devices_cnt) {
3553                 list_del(&atsru->list);
3554                 kfree(atsru);
3555         }
3556
3557         return rc;
3558 }
3559
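     /*
      * Return non-zero if @dev sits below a root port listed in an ATSR unit
      * for its PCI segment, or if that unit is INCLUDE_ALL; i.e. whether ATS
      * may be used for the device.
      */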
3560 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3561 {
3562         int i;
3563         struct pci_bus *bus;
3564         struct acpi_dmar_atsr *atsr;
3565         struct dmar_atsr_unit *atsru;
3566
3567         dev = pci_physfn(dev);
3568
3569         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3570                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3571                 if (atsr->segment == pci_domain_nr(dev->bus))
3572                         goto found;
3573         }
3574
3575         return 0;
3576
3577 found:
3578         for (bus = dev->bus; bus; bus = bus->parent) {
3579                 struct pci_dev *bridge = bus->self;
3580
3581                 if (!bridge || !pci_is_pcie(bridge) ||
3582                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3583                         return 0;
3584
3585                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3586                         for (i = 0; i < atsru->devices_cnt; i++)
3587                                 if (atsru->devices[i] == bridge)
3588                                         return 1;
3589                         break;
3590                 }
3591         }
3592
3593         if (atsru->include_all)
3594                 return 1;
3595
3596         return 0;
3597 }
3598
3599 int __init dmar_parse_rmrr_atsr_dev(void)
3600 {
3601         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3602         struct dmar_atsr_unit *atsr, *atsr_n;
3603         int ret = 0;
3604
3605         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3606                 ret = rmrr_parse_dev(rmrr);
3607                 if (ret)
3608                         return ret;
3609         }
3610
3611         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3612                 ret = atsr_parse_dev(atsr);
3613                 if (ret)
3614                         return ret;
3615         }
3616
3617         return ret;
3618 }
3619
3620 /*
3621  * We only respond here to a device being unbound from its driver.
3622  *
3623  * A newly added device is not attached to its DMAR domain here yet; that
3624  * happens when the device is first mapped to an iova.
3625  */
3626 static int device_notifier(struct notifier_block *nb,
3627                                   unsigned long action, void *data)
3628 {
3629         struct device *dev = data;
3630         struct pci_dev *pdev = to_pci_dev(dev);
3631         struct dmar_domain *domain;
3632
3633         if (iommu_no_mapping(dev))
3634                 return 0;
3635
3636         domain = find_domain(pdev);
3637         if (!domain)
3638                 return 0;
3639
3640         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3641                 domain_remove_one_dev_info(domain, pdev);
3642
3643                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3644                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3645                     list_empty(&domain->devices))
3646                         domain_exit(domain);
3647         }
3648
3649         return 0;
3650 }
3651
3652 static struct notifier_block device_nb = {
3653         .notifier_call = device_notifier,
3654 };
3655
3656 int __init intel_iommu_init(void)
3657 {
3658         int ret = 0;
3659
3660         /* VT-d is required for a TXT/tboot launch, so enforce that */
3661         force_on = tboot_force_iommu();
3662
3663         if (dmar_table_init()) {
3664                 if (force_on)
3665                         panic("tboot: Failed to initialize DMAR table\n");
3666                 return  -ENODEV;
3667         }
3668
3669         if (dmar_dev_scope_init() < 0) {
3670                 if (force_on)
3671                         panic("tboot: Failed to initialize DMAR device scope\n");
3672                 return  -ENODEV;
3673         }
3674
3675         if (no_iommu || dmar_disabled)
3676                 return -ENODEV;
3677
3678         if (iommu_init_mempool()) {
3679                 if (force_on)
3680                         panic("tboot: Failed to initialize iommu memory\n");
3681                 return  -ENODEV;
3682         }
3683
3684         if (list_empty(&dmar_rmrr_units))
3685                 printk(KERN_INFO "DMAR: No RMRR found\n");
3686
3687         if (list_empty(&dmar_atsr_units))
3688                 printk(KERN_INFO "DMAR: No ATSR found\n");
3689
3690         if (dmar_init_reserved_ranges()) {
3691                 if (force_on)
3692                         panic("tboot: Failed to reserve iommu ranges\n");
3693                 return  -ENODEV;
3694         }
3695
3696         init_no_remapping_devices();
3697
3698         ret = init_dmars();
3699         if (ret) {
3700                 if (force_on)
3701                         panic("tboot: Failed to initialize DMARs\n");
3702                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3703                 put_iova_domain(&reserved_iova_list);
3704                 iommu_exit_mempool();
3705                 return ret;
3706         }
3707         printk(KERN_INFO
3708         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3709
3710         init_timer(&unmap_timer);
3711 #ifdef CONFIG_SWIOTLB
3712         swiotlb = 0;
3713 #endif
3714         dma_ops = &intel_dma_ops;
3715
3716         init_iommu_pm_ops();
3717
3718         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3719
3720         bus_register_notifier(&pci_bus_type, &device_nb);
3721
3722         intel_iommu_enabled = 1;
3723
3724         return 0;
3725 }
3726
3727 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3728                                            struct pci_dev *pdev)
3729 {
3730         struct pci_dev *tmp, *parent;
3731
3732         if (!iommu || !pdev)
3733                 return;
3734
3735         /* dependent device detach */
3736         tmp = pci_find_upstream_pcie_bridge(pdev);
3737         /* Secondary interface's bus number and devfn 0 */
3738         if (tmp) {
3739                 parent = pdev->bus->self;
3740                 while (parent != tmp) {
3741                         iommu_detach_dev(iommu, parent->bus->number,
3742                                          parent->devfn);
3743                         parent = parent->bus->self;
3744                 }
3745                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3746                         iommu_detach_dev(iommu,
3747                                 tmp->subordinate->number, 0);
3748                 else /* this is a legacy PCI bridge */
3749                         iommu_detach_dev(iommu, tmp->bus->number,
3750                                          tmp->devfn);
3751         }
3752 }
3753
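     /*
      * Detach @pdev from @domain: drop its device_domain_info, tear down the
      * context mapping (including any upstream PCIe-to-PCI bridge path), and
      * if no other device on the same IOMMU remains in the domain, clear
      * that IOMMU from the domain's bitmap and recompute its capabilities.
      */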
3754 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3755                                           struct pci_dev *pdev)
3756 {
3757         struct device_domain_info *info;
3758         struct intel_iommu *iommu;
3759         unsigned long flags;
3760         int found = 0;
3761         struct list_head *entry, *tmp;
3762
3763         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3764                                 pdev->devfn);
3765         if (!iommu)
3766                 return;
3767
3768         spin_lock_irqsave(&device_domain_lock, flags);
3769         list_for_each_safe(entry, tmp, &domain->devices) {
3770                 info = list_entry(entry, struct device_domain_info, link);
3771                 if (info->segment == pci_domain_nr(pdev->bus) &&
3772                     info->bus == pdev->bus->number &&
3773                     info->devfn == pdev->devfn) {
3774                         unlink_domain_info(info);
3775                         spin_unlock_irqrestore(&device_domain_lock, flags);
3776
3777                         iommu_disable_dev_iotlb(info);
3778                         iommu_detach_dev(iommu, info->bus, info->devfn);
3779                         iommu_detach_dependent_devices(iommu, pdev);
3780                         free_devinfo_mem(info);
3781
3782                         spin_lock_irqsave(&device_domain_lock, flags);
3783
3784                         if (found)
3785                                 break;
3786                         else
3787                                 continue;
3788                 }
3789
3790                 /* If no other device under the same iommu is owned by
3791                  * this domain, clear this iommu from iommu_bmp and
3792                  * update the iommu count and coherency.
3793                  */
3794                 if (iommu == device_to_iommu(info->segment, info->bus,
3795                                             info->devfn))
3796                         found = 1;
3797         }
3798
3799         spin_unlock_irqrestore(&device_domain_lock, flags);
3800
3801         if (found == 0) {
3802                 unsigned long tmp_flags;
3803                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3804                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3805                 domain->iommu_count--;
3806                 domain_update_iommu_cap(domain);
3807                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3808
3809                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3810                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3811                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3812                         clear_bit(domain->id, iommu->domain_ids);
3813                         iommu->domains[domain->id] = NULL;
3814                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3815                 }
3816         }
3817 }
3818
3819 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3820 {
3821         struct device_domain_info *info;
3822         struct intel_iommu *iommu;
3823         unsigned long flags1, flags2;
3824
3825         spin_lock_irqsave(&device_domain_lock, flags1);
3826         while (!list_empty(&domain->devices)) {
3827                 info = list_entry(domain->devices.next,
3828                         struct device_domain_info, link);
3829                 unlink_domain_info(info);
3830                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3831
3832                 iommu_disable_dev_iotlb(info);
3833                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3834                 iommu_detach_dev(iommu, info->bus, info->devfn);
3835                 iommu_detach_dependent_devices(iommu, info->dev);
3836
3837                 /* clear this iommu in iommu_bmp, update iommu count
3838                  * and capabilities
3839                  */
3840                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3841                 if (test_and_clear_bit(iommu->seq_id,
3842                                        domain->iommu_bmp)) {
3843                         domain->iommu_count--;
3844                         domain_update_iommu_cap(domain);
3845                 }
3846                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3847
3848                 free_devinfo_mem(info);
3849                 spin_lock_irqsave(&device_domain_lock, flags1);
3850         }
3851         spin_unlock_irqrestore(&device_domain_lock, flags1);
3852 }
3853
3854 /* Domain id for virtual machines; it won't be set in context entries. */
3855 static unsigned long vm_domid;
3856
3857 static struct dmar_domain *iommu_alloc_vm_domain(void)
3858 {
3859         struct dmar_domain *domain;
3860
3861         domain = alloc_domain_mem();
3862         if (!domain)
3863                 return NULL;
3864
3865         domain->id = vm_domid++;
3866         domain->nid = -1;
3867         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3868         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3869
3870         return domain;
3871 }
3872
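     /*
      * Initialise a freshly allocated VM domain: set up its iova allocator,
      * reserve the special ranges, derive agaw from @guest_width, and
      * allocate the top-level page directory.
      */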
3873 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3874 {
3875         int adjust_width;
3876
3877         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3878         spin_lock_init(&domain->iommu_lock);
3879
3880         domain_reserve_special_ranges(domain);
3881
3882         /* calculate AGAW */
3883         domain->gaw = guest_width;
3884         adjust_width = guestwidth_to_adjustwidth(guest_width);
3885         domain->agaw = width_to_agaw(adjust_width);
3886
3887         INIT_LIST_HEAD(&domain->devices);
3888
3889         domain->iommu_count = 0;
3890         domain->iommu_coherency = 0;
3891         domain->iommu_snooping = 0;
3892         domain->iommu_superpage = 0;
3893         domain->max_addr = 0;
3894         domain->nid = -1;
3895
3896         /* always allocate the top pgd */
3897         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3898         if (!domain->pgd)
3899                 return -ENOMEM;
3900         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3901         return 0;
3902 }
3903
3904 static void iommu_free_vm_domain(struct dmar_domain *domain)
3905 {
3906         unsigned long flags;
3907         struct dmar_drhd_unit *drhd;
3908         struct intel_iommu *iommu;
3909         unsigned long i;
3910         unsigned long ndomains;
3911
3912         for_each_drhd_unit(drhd) {
3913                 if (drhd->ignored)
3914                         continue;
3915                 iommu = drhd->iommu;
3916
3917                 ndomains = cap_ndoms(iommu->cap);
3918                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3919                         if (iommu->domains[i] == domain) {
3920                                 spin_lock_irqsave(&iommu->lock, flags);
3921                                 clear_bit(i, iommu->domain_ids);
3922                                 iommu->domains[i] = NULL;
3923                                 spin_unlock_irqrestore(&iommu->lock, flags);
3924                                 break;
3925                         }
3926                 }
3927         }
3928 }
3929
3930 static void vm_domain_exit(struct dmar_domain *domain)
3931 {
3932         /* Domain 0 is reserved, so don't process it */
3933         if (!domain)
3934                 return;
3935
3936         vm_domain_remove_all_dev_info(domain);
3937         /* destroy iovas */
3938         put_iova_domain(&domain->iovad);
3939
3940         /* clear ptes */
3941         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3942
3943         /* free page tables */
3944         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3945
3946         iommu_free_vm_domain(domain);
3947         free_domain_mem(domain);
3948 }
3949
3950 static int intel_iommu_domain_init(struct iommu_domain *domain)
3951 {
3952         struct dmar_domain *dmar_domain;
3953
3954         dmar_domain = iommu_alloc_vm_domain();
3955         if (!dmar_domain) {
3956                 printk(KERN_ERR
3957                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3958                 return -ENOMEM;
3959         }
3960         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3961                 printk(KERN_ERR
3962                         "intel_iommu_domain_init() failed\n");
3963                 vm_domain_exit(dmar_domain);
3964                 return -ENOMEM;
3965         }
3966         domain_update_iommu_cap(dmar_domain);
3967         domain->priv = dmar_domain;
3968
3969         domain->geometry.aperture_start = 0;
3970         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3971         domain->geometry.force_aperture = true;
3972
3973         return 0;
3974 }
3975
3976 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3977 {
3978         struct dmar_domain *dmar_domain = domain->priv;
3979
3980         domain->priv = NULL;
3981         vm_domain_exit(dmar_domain);
3982 }
3983
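     /*
      * IOMMU API attach: detach the device from any previous domain, check
      * that the target domain's mapped addresses fit within this IOMMU's
      * address width (trimming extra page-table levels if needed), then add
      * the device with multi-level translation.
      */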
3984 static int intel_iommu_attach_device(struct iommu_domain *domain,
3985                                      struct device *dev)
3986 {
3987         struct dmar_domain *dmar_domain = domain->priv;
3988         struct pci_dev *pdev = to_pci_dev(dev);
3989         struct intel_iommu *iommu;
3990         int addr_width;
3991
3992         /* normally pdev is not mapped */
3993         if (unlikely(domain_context_mapped(pdev))) {
3994                 struct dmar_domain *old_domain;
3995
3996                 old_domain = find_domain(pdev);
3997                 if (old_domain) {
3998                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3999                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4000                                 domain_remove_one_dev_info(old_domain, pdev);
4001                         else
4002                                 domain_remove_dev_info(old_domain);
4003                 }
4004         }
4005
4006         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4007                                 pdev->devfn);
4008         if (!iommu)
4009                 return -ENODEV;
4010
4011         /* check if this iommu agaw is sufficient for max mapped address */
4012         addr_width = agaw_to_width(iommu->agaw);
4013         if (addr_width > cap_mgaw(iommu->cap))
4014                 addr_width = cap_mgaw(iommu->cap);
4015
4016         if (dmar_domain->max_addr > (1LL << addr_width)) {
4017                 printk(KERN_ERR "%s: iommu width (%d) is not "
4018                        "sufficient for the mapped address (%llx)\n",
4019                        __func__, addr_width, dmar_domain->max_addr);
4020                 return -EFAULT;
4021         }
4022         dmar_domain->gaw = addr_width;
4023
4024         /*
4025          * Knock out extra levels of page tables if necessary
4026          */
4027         while (iommu->agaw < dmar_domain->agaw) {
4028                 struct dma_pte *pte;
4029
4030                 pte = dmar_domain->pgd;
4031                 if (dma_pte_present(pte)) {
4032                         dmar_domain->pgd = (struct dma_pte *)
4033                                 phys_to_virt(dma_pte_addr(pte));
4034                         free_pgtable_page(pte);
4035                 }
4036                 dmar_domain->agaw--;
4037         }
4038
4039         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4040 }
4041
4042 static void intel_iommu_detach_device(struct iommu_domain *domain,
4043                                       struct device *dev)
4044 {
4045         struct dmar_domain *dmar_domain = domain->priv;
4046         struct pci_dev *pdev = to_pci_dev(dev);
4047
4048         domain_remove_one_dev_info(dmar_domain, pdev);
4049 }
4050
4051 static int intel_iommu_map(struct iommu_domain *domain,
4052                            unsigned long iova, phys_addr_t hpa,
4053                            size_t size, int iommu_prot)
4054 {
4055         struct dmar_domain *dmar_domain = domain->priv;
4056         u64 max_addr;
4057         int prot = 0;
4058         int ret;
4059
4060         if (iommu_prot & IOMMU_READ)
4061                 prot |= DMA_PTE_READ;
4062         if (iommu_prot & IOMMU_WRITE)
4063                 prot |= DMA_PTE_WRITE;
4064         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4065                 prot |= DMA_PTE_SNP;
4066
4067         max_addr = iova + size;
4068         if (dmar_domain->max_addr < max_addr) {
4069                 u64 end;
4070
4071                 /* check if minimum agaw is sufficient for mapped address */
4072                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4073                 if (end < max_addr) {
4074                         printk(KERN_ERR "%s: iommu width (%d) is not "
4075                                "sufficient for the mapped address (%llx)\n",
4076                                __func__, dmar_domain->gaw, max_addr);
4077                         return -EFAULT;
4078                 }
4079                 dmar_domain->max_addr = max_addr;
4080         }
4081         /* Round up size to next multiple of PAGE_SIZE, if it and
4082            the low bits of hpa would take us onto the next page */
4083         size = aligned_nrpages(hpa, size);
4084         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4085                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4086         return ret;
4087 }
4088
4089 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4090                              unsigned long iova, size_t size)
4091 {
4092         struct dmar_domain *dmar_domain = domain->priv;
4093         int order;
4094
4095         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4096                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4097
4098         if (dmar_domain->max_addr == iova + size)
4099                 dmar_domain->max_addr = iova;
4100
4101         return PAGE_SIZE << order;
4102 }
4103
4104 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4105                                             unsigned long iova)
4106 {
4107         struct dmar_domain *dmar_domain = domain->priv;
4108         struct dma_pte *pte;
4109         u64 phys = 0;
4110
4111         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4112         if (pte)
4113                 phys = dma_pte_addr(pte);
4114
4115         return phys;
4116 }
4117
4118 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4119                                       unsigned long cap)
4120 {
4121         struct dmar_domain *dmar_domain = domain->priv;
4122
4123         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4124                 return dmar_domain->iommu_snooping;
4125         if (cap == IOMMU_CAP_INTR_REMAP)
4126                 return irq_remapping_enabled;
4127
4128         return 0;
4129 }
4130
4131 static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4132 {
4133         pci_dev_put(*from);
4134         *from = to;
4135 }
4136
4137 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4138
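     /*
      * IOMMU API add_device: find the device that actually issues DMA on this
      * one's behalf (legacy bridges, DMA source quirks, non-ACS multifunction
      * siblings, non-isolated upstream buses) and put @dev in that device's
      * IOMMU group, allocating a new group if necessary.
      */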
4139 static int intel_iommu_add_device(struct device *dev)
4140 {
4141         struct pci_dev *pdev = to_pci_dev(dev);
4142         struct pci_dev *bridge, *dma_pdev = NULL;
4143         struct iommu_group *group;
4144         int ret;
4145
4146         if (!device_to_iommu(pci_domain_nr(pdev->bus),
4147                              pdev->bus->number, pdev->devfn))
4148                 return -ENODEV;
4149
4150         bridge = pci_find_upstream_pcie_bridge(pdev);
4151         if (bridge) {
4152                 if (pci_is_pcie(bridge))
4153                         dma_pdev = pci_get_domain_bus_and_slot(
4154                                                 pci_domain_nr(pdev->bus),
4155                                                 bridge->subordinate->number, 0);
4156                 if (!dma_pdev)
4157                         dma_pdev = pci_dev_get(bridge);
4158         } else
4159                 dma_pdev = pci_dev_get(pdev);
4160
4161         /* Account for quirked devices */
4162         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4163
4164         /*
4165          * If it's a multifunction device that does not support our
4166          * required ACS flags, add to the same group as function 0.
4167          */
4168         if (dma_pdev->multifunction &&
4169             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4170                 swap_pci_ref(&dma_pdev,
4171                              pci_get_slot(dma_pdev->bus,
4172                                           PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4173                                           0)));
4174
4175         /*
4176          * Devices on the root bus go through the iommu.  If that's not us,
4177          * find the next upstream device and test ACS up to the root bus.
4178          * Finding the next device may require skipping virtual buses.
4179          */
4180         while (!pci_is_root_bus(dma_pdev->bus)) {
4181                 struct pci_bus *bus = dma_pdev->bus;
4182
4183                 while (!bus->self) {
4184                         if (!pci_is_root_bus(bus))
4185                                 bus = bus->parent;
4186                         else
4187                                 goto root_bus;
4188                 }
4189
4190                 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4191                         break;
4192
4193                 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4194         }
4195
4196 root_bus:
4197         group = iommu_group_get(&dma_pdev->dev);
4198         pci_dev_put(dma_pdev);
4199         if (!group) {
4200                 group = iommu_group_alloc();
4201                 if (IS_ERR(group))
4202                         return PTR_ERR(group);
4203         }
4204
4205         ret = iommu_group_add_device(group, dev);
4206
4207         iommu_group_put(group);
4208         return ret;
4209 }
4210
4211 static void intel_iommu_remove_device(struct device *dev)
4212 {
4213         iommu_group_remove_device(dev);
4214 }
4215
4216 static struct iommu_ops intel_iommu_ops = {
4217         .domain_init    = intel_iommu_domain_init,
4218         .domain_destroy = intel_iommu_domain_destroy,
4219         .attach_dev     = intel_iommu_attach_device,
4220         .detach_dev     = intel_iommu_detach_device,
4221         .map            = intel_iommu_map,
4222         .unmap          = intel_iommu_unmap,
4223         .iova_to_phys   = intel_iommu_iova_to_phys,
4224         .domain_has_cap = intel_iommu_domain_has_cap,
4225         .add_device     = intel_iommu_add_device,
4226         .remove_device  = intel_iommu_remove_device,
4227         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4228 };
4229
4230 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4231 {
4232         /*
4233          * Mobile 4 Series Chipset neglects to set RWBF capability,
4234          * but needs it:
4235          */
4236         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4237         rwbf_quirk = 1;
4238
4239         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4240         if (dev->revision == 0x07) {
4241                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4242                 dmar_map_gfx = 0;
4243         }
4244 }
4245
4246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4247
4248 #define GGC 0x52
4249 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4250 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4251 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4252 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4253 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4254 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4255 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4256 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4257
4258 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4259 {
4260         unsigned short ggc;
4261
4262         if (pci_read_config_word(dev, GGC, &ggc))
4263                 return;
4264
4265         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4266                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4267                 dmar_map_gfx = 0;
4268         } else if (dmar_map_gfx) {
4269                 /* we have to ensure the gfx device is idle before we flush */
4270                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4271                 intel_iommu_strict = 1;
4272         }
4273 }
4274 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4275 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4276 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4277 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4278
4279 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4280    ISOCH DMAR unit for the Azalia sound device, but not give it any
4281    TLB entries, which causes it to deadlock. Check for that.  We do
4282    this in a function called from init_dmars(), instead of in a PCI
4283    quirk, because we don't want to print the obnoxious "BIOS broken"
4284    message if VT-d is actually disabled.
4285 */
4286 static void __init check_tylersburg_isoch(void)
4287 {
4288         struct pci_dev *pdev;
4289         uint32_t vtisochctrl;
4290
4291         /* If there's no Azalia in the system anyway, forget it. */
4292         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4293         if (!pdev)
4294                 return;
4295         pci_dev_put(pdev);
4296
4297         /* System Management Registers. Might be hidden, in which case
4298            we can't do the sanity check. But that's OK, because the
4299            known-broken BIOSes _don't_ actually hide it, so far. */
4300         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4301         if (!pdev)
4302                 return;
4303
4304         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4305                 pci_dev_put(pdev);
4306                 return;
4307         }
4308
4309         pci_dev_put(pdev);
4310
4311         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4312         if (vtisochctrl & 1)
4313                 return;
4314
4315         /* Drop all bits other than the number of TLB entries */
4316         vtisochctrl &= 0x1c;
4317
4318         /* If we have the recommended number of TLB entries (16), fine. */
4319         if (vtisochctrl == 0x10)
4320                 return;
4321
4322         /* Zero TLB entries? You get to ride the short bus to school. */
4323         if (!vtisochctrl) {
4324                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4325                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4326                      dmi_get_system_info(DMI_BIOS_VENDOR),
4327                      dmi_get_system_info(DMI_BIOS_VERSION),
4328                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4329                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4330                 return;
4331         }
4332
4333         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4334                vtisochctrl);
4335 }