1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 /*
82  * This bitmap is used to advertise the page sizes our hardware supports
83  * to the IOMMU core, which will then use this information to split
84  * physically contiguous memory regions it is mapping into page sizes
85  * that we support.
86  *
87  * Traditionally the IOMMU core just handed us the mappings directly,
88  * after making sure the size is a power-of-two multiple of 4KiB and that the
89  * mapping has natural alignment.
90  *
91  * To retain this behavior, we currently advertise that we support
92  * all page sizes that are a power-of-two multiple of 4KiB.
93  *
94  * If at some point we'd like to utilize the IOMMU core's new behavior,
95  * we could change this to advertise the real page sizes we support.
96  */
97 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
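/* ~0xFFFUL sets every bit from bit 12 up: every power-of-two size from 4KiB is advertised */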
98
99 static inline int agaw_to_level(int agaw)
100 {
101         return agaw + 2;
102 }
103
104 static inline int agaw_to_width(int agaw)
105 {
106         return 30 + agaw * LEVEL_STRIDE;
107 }
108
109 static inline int width_to_agaw(int width)
110 {
111         return (width - 30) / LEVEL_STRIDE;
112 }
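/* Example: agaw 2 gives a 48-bit address width (30 + 2 * 9) and a 4-level page table (2 + 2) */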
113
114 static inline unsigned int level_to_offset_bits(int level)
115 {
116         return (level - 1) * LEVEL_STRIDE;
117 }
118
119 static inline int pfn_level_offset(unsigned long pfn, int level)
120 {
121         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
122 }
123
124 static inline unsigned long level_mask(int level)
125 {
126         return -1UL << level_to_offset_bits(level);
127 }
128
129 static inline unsigned long level_size(int level)
130 {
131         return 1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long align_to_level(unsigned long pfn, int level)
135 {
136         return (pfn + level_size(level) - 1) & level_mask(level);
137 }
138
139 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 {
141         return  1 << ((lvl - 1) * LEVEL_STRIDE);
142 }
143
144 /* VT-d pages must never be _larger_ than MM pages. Otherwise these
145    pfn conversions are never going to work. */
146 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
147 {
148         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
149 }
150
151 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
152 {
153         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155 static inline unsigned long page_to_dma_pfn(struct page *pg)
156 {
157         return mm_to_dma_pfn(page_to_pfn(pg));
158 }
159 static inline unsigned long virt_to_dma_pfn(void *p)
160 {
161         return page_to_dma_pfn(virt_to_page(p));
162 }
163
164 /* global iommu list, set NULL for ignored DMAR units */
165 static struct intel_iommu **g_iommus;
166
167 static void __init check_tylersburg_isoch(void);
168 static int rwbf_quirk;
169
170 /*
171  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
172  * (used when kernel is launched w/ TXT)
173  */
174 static int force_on = 0;
175
176 /*
177  * 0: Present
178  * 1-11: Reserved
179  * 12-63: Context Ptr (12 - (haw-1))
180  * 64-127: Reserved
181  */
182 struct root_entry {
183         u64     val;
184         u64     rsvd1;
185 };
186 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
187 static inline bool root_present(struct root_entry *root)
188 {
189         return (root->val & 1);
190 }
191 static inline void set_root_present(struct root_entry *root)
192 {
193         root->val |= 1;
194 }
195 static inline void set_root_value(struct root_entry *root, unsigned long value)
196 {
197         root->val |= value & VTD_PAGE_MASK;
198 }
199
200 static inline struct context_entry *
201 get_context_addr_from_root(struct root_entry *root)
202 {
203         return (struct context_entry *)
204                 (root_present(root)?phys_to_virt(
205                 root->val & VTD_PAGE_MASK) :
206                 NULL);
207 }
208
209 /*
210  * low 64 bits:
211  * 0: present
212  * 1: fault processing disable
213  * 2-3: translation type
214  * 12-63: address space root
215  * high 64 bits:
216  * 0-2: address width
217  * 3-6: aval
218  * 8-23: domain id
219  */
220 struct context_entry {
221         u64 lo;
222         u64 hi;
223 };
224
225 static inline bool context_present(struct context_entry *context)
226 {
227         return (context->lo & 1);
228 }
229 static inline void context_set_present(struct context_entry *context)
230 {
231         context->lo |= 1;
232 }
233
234 static inline void context_set_fault_enable(struct context_entry *context)
235 {
236         context->lo &= (((u64)-1) << 2) | 1;
237 }
238
239 static inline void context_set_translation_type(struct context_entry *context,
240                                                 unsigned long value)
241 {
242         context->lo &= (((u64)-1) << 4) | 3;
243         context->lo |= (value & 3) << 2;
244 }
245
246 static inline void context_set_address_root(struct context_entry *context,
247                                             unsigned long value)
248 {
249         context->lo |= value & VTD_PAGE_MASK;
250 }
251
252 static inline void context_set_address_width(struct context_entry *context,
253                                              unsigned long value)
254 {
255         context->hi |= value & 7;
256 }
257
258 static inline void context_set_domain_id(struct context_entry *context,
259                                          unsigned long value)
260 {
261         context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263
264 static inline void context_clear_entry(struct context_entry *context)
265 {
266         context->lo = 0;
267         context->hi = 0;
268 }
269
270 /*
271  * 0: readable
272  * 1: writable
273  * 2-6: reserved
274  * 7: super page
275  * 8-10: available
276  * 11: snoop behavior
277  * 12-63: Host physical address
278  */
279 struct dma_pte {
280         u64 val;
281 };
282
283 static inline void dma_clear_pte(struct dma_pte *pte)
284 {
285         pte->val = 0;
286 }
287
288 static inline void dma_set_pte_readable(struct dma_pte *pte)
289 {
290         pte->val |= DMA_PTE_READ;
291 }
292
293 static inline void dma_set_pte_writable(struct dma_pte *pte)
294 {
295         pte->val |= DMA_PTE_WRITE;
296 }
297
298 static inline void dma_set_pte_snp(struct dma_pte *pte)
299 {
300         pte->val |= DMA_PTE_SNP;
301 }
302
303 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304 {
305         pte->val = (pte->val & ~3) | (prot & 3);
306 }
307
308 static inline u64 dma_pte_addr(struct dma_pte *pte)
309 {
310 #ifdef CONFIG_64BIT
311         return pte->val & VTD_PAGE_MASK;
312 #else
313         /* Must have a full atomic 64-bit read */
314         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
315 #endif
316 }
317
318 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319 {
320         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
321 }
322
323 static inline bool dma_pte_present(struct dma_pte *pte)
324 {
325         return (pte->val & 3) != 0;
326 }
327
328 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 {
330         return (pte->val & (1 << 7));
331 }
332
333 static inline int first_pte_in_page(struct dma_pte *pte)
334 {
335         return !((unsigned long)pte & ~VTD_PAGE_MASK);
336 }
337
338 /*
339  * This domain is a static identity mapping domain.
340  *      1. This domain creates a static 1:1 mapping to all usable memory.
341  *      2. It maps to each iommu if successful.
342  *      3. Each iommu maps to this domain if successful.
343  */
344 static struct dmar_domain *si_domain;
345 static int hw_pass_through = 1;
346
347 /* devices under the same p2p bridge are owned in one domain */
348 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349
350 /* domain represents a virtual machine; more than one device
351  * across iommus may be owned by one domain, e.g. a kvm guest.
352  */
353 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
354
355 /* si_domain contains multiple devices */
356 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
357
358 /* define the limit of IOMMUs supported in each domain */
359 #ifdef  CONFIG_X86
360 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
361 #else
362 # define        IOMMU_UNITS_SUPPORTED   64
363 #endif
364
365 struct dmar_domain {
366         int     id;                     /* domain id */
367         int     nid;                    /* node id */
368         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
369                                         /* bitmap of iommus this domain uses */
370
371         struct list_head devices;       /* all devices' list */
372         struct iova_domain iovad;       /* iova's that belong to this domain */
373
374         struct dma_pte  *pgd;           /* virtual address */
375         int             gaw;            /* max guest address width */
376
377         /* adjusted guest address width, 0 is level 2 30-bit */
378         int             agaw;
379
380         int             flags;          /* flags to find out type of domain */
381
382         int             iommu_coherency;/* indicate coherency of iommu access */
383         int             iommu_snooping; /* indicate snooping control feature */
384         int             iommu_count;    /* reference count of iommu */
385         int             iommu_superpage;/* Level of superpages supported:
386                                            0 == 4KiB (no superpages), 1 == 2MiB,
387                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
388         spinlock_t      iommu_lock;     /* protect iommu set in domain */
389         u64             max_addr;       /* maximum mapped address */
390 };
391
392 /* PCI domain-device relationship */
393 struct device_domain_info {
394         struct list_head link;  /* link to domain siblings */
395         struct list_head global; /* link to global list */
396         int segment;            /* PCI domain */
397         u8 bus;                 /* PCI bus number */
398         u8 devfn;               /* PCI devfn number */
399         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
400         struct intel_iommu *iommu; /* IOMMU used by this device */
401         struct dmar_domain *domain; /* pointer to domain */
402 };
403
404 static void flush_unmaps_timeout(unsigned long data);
405
406 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
407
408 #define HIGH_WATER_MARK 250
409 struct deferred_flush_tables {
410         int next;
411         struct iova *iova[HIGH_WATER_MARK];
412         struct dmar_domain *domain[HIGH_WATER_MARK];
413 };
414
415 static struct deferred_flush_tables *deferred_flush;
416
417 /* number of registered iommus, used to size g_iommus and per-domain iommu bitmaps */
418 static int g_num_of_iommus;
419
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
422
423 static int timer_on;
424 static long list_size;
425
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427
428 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
429 int dmar_disabled = 0;
430 #else
431 int dmar_disabled = 1;
432 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
433
434 int intel_iommu_enabled = 0;
435 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
436
437 static int dmar_map_gfx = 1;
438 static int dmar_forcedac;
439 static int intel_iommu_strict;
440 static int intel_iommu_superpage = 1;
441
442 int intel_iommu_gfx_mapped;
443 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444
445 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446 static DEFINE_SPINLOCK(device_domain_lock);
447 static LIST_HEAD(device_domain_list);
448
449 static struct iommu_ops intel_iommu_ops;
450
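/* Parse the comma-separated intel_iommu= boot options: on, off, igfx_off,
 * forcedac, strict and sp_off; e.g. intel_iommu=on,strict enables translation
 * and disables batched IOTLB flushing. */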
451 static int __init intel_iommu_setup(char *str)
452 {
453         if (!str)
454                 return -EINVAL;
455         while (*str) {
456                 if (!strncmp(str, "on", 2)) {
457                         dmar_disabled = 0;
458                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
459                 } else if (!strncmp(str, "off", 3)) {
460                         dmar_disabled = 1;
461                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
462                 } else if (!strncmp(str, "igfx_off", 8)) {
463                         dmar_map_gfx = 0;
464                         printk(KERN_INFO
465                                 "Intel-IOMMU: disable GFX device mapping\n");
466                 } else if (!strncmp(str, "forcedac", 8)) {
467                         printk(KERN_INFO
468                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
469                         dmar_forcedac = 1;
470                 } else if (!strncmp(str, "strict", 6)) {
471                         printk(KERN_INFO
472                                 "Intel-IOMMU: disable batched IOTLB flush\n");
473                         intel_iommu_strict = 1;
474                 } else if (!strncmp(str, "sp_off", 6)) {
475                         printk(KERN_INFO
476                                 "Intel-IOMMU: disable supported super page\n");
477                         intel_iommu_superpage = 0;
478                 }
479
480                 str += strcspn(str, ",");
481                 while (*str == ',')
482                         str++;
483         }
484         return 0;
485 }
486 __setup("intel_iommu=", intel_iommu_setup);
487
488 static struct kmem_cache *iommu_domain_cache;
489 static struct kmem_cache *iommu_devinfo_cache;
490 static struct kmem_cache *iommu_iova_cache;
491
492 static inline void *alloc_pgtable_page(int node)
493 {
494         struct page *page;
495         void *vaddr = NULL;
496
497         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
498         if (page)
499                 vaddr = page_address(page);
500         return vaddr;
501 }
502
503 static inline void free_pgtable_page(void *vaddr)
504 {
505         free_page((unsigned long)vaddr);
506 }
507
508 static inline void *alloc_domain_mem(void)
509 {
510         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 }
512
513 static void free_domain_mem(void *vaddr)
514 {
515         kmem_cache_free(iommu_domain_cache, vaddr);
516 }
517
518 static inline void * alloc_devinfo_mem(void)
519 {
520         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 }
522
523 static inline void free_devinfo_mem(void *vaddr)
524 {
525         kmem_cache_free(iommu_devinfo_cache, vaddr);
526 }
527
528 struct iova *alloc_iova_mem(void)
529 {
530         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531 }
532
533 void free_iova_mem(struct iova *iova)
534 {
535         kmem_cache_free(iommu_iova_cache, iova);
536 }
537
538
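/* Pick the largest agaw, no wider than max_gaw, that this iommu's SAGAW
 * capability supports; returns -1 if none matches. */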
539 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
540 {
541         unsigned long sagaw;
542         int agaw = -1;
543
544         sagaw = cap_sagaw(iommu->cap);
545         for (agaw = width_to_agaw(max_gaw);
546              agaw >= 0; agaw--) {
547                 if (test_bit(agaw, &sagaw))
548                         break;
549         }
550
551         return agaw;
552 }
553
554 /*
555  * Calculate max SAGAW for each iommu.
556  */
557 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
558 {
559         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
560 }
561
562 /*
563  * calculate agaw for each iommu.
564  * "SAGAW" may be different across iommus, use a default agaw, and
565  * get a supported less agaw for iommus that don't support the default agaw.
566  */
567 int iommu_calculate_agaw(struct intel_iommu *iommu)
568 {
569         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
570 }
571
572 /* This function only returns a single iommu in a domain */
573 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
574 {
575         int iommu_id;
576
577         /* si_domain and vm domain should not get here. */
578         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
579         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
580
581         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
582         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
583                 return NULL;
584
585         return g_iommus[iommu_id];
586 }
587
588 static void domain_update_iommu_coherency(struct dmar_domain *domain)
589 {
590         int i;
591
592         i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
593
594         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
595
596         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
597                 if (!ecap_coherent(g_iommus[i]->ecap)) {
598                         domain->iommu_coherency = 0;
599                         break;
600                 }
601         }
602 }
603
604 static void domain_update_iommu_snooping(struct dmar_domain *domain)
605 {
606         int i;
607
608         domain->iommu_snooping = 1;
609
610         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
611                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
612                         domain->iommu_snooping = 0;
613                         break;
614                 }
615         }
616 }
617
618 static void domain_update_iommu_superpage(struct dmar_domain *domain)
619 {
620         struct dmar_drhd_unit *drhd;
621         struct intel_iommu *iommu = NULL;
622         int mask = 0xf;
623
624         if (!intel_iommu_superpage) {
625                 domain->iommu_superpage = 0;
626                 return;
627         }
628
629         /* set iommu_superpage to the smallest common denominator */
630         for_each_active_iommu(iommu, drhd) {
631                 mask &= cap_super_page_val(iommu->cap);
632                 if (!mask) {
633                         break;
634                 }
635         }
636         domain->iommu_superpage = fls(mask);
637 }
638
639 /* Some capabilities may be different across iommus */
640 static void domain_update_iommu_cap(struct dmar_domain *domain)
641 {
642         domain_update_iommu_coherency(domain);
643         domain_update_iommu_snooping(domain);
644         domain_update_iommu_superpage(domain);
645 }
646
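/* Find the iommu covering (segment, bus, devfn): the device may be listed
 * under a DRHD unit directly, sit behind a listed bridge, or fall under the
 * segment's include-all unit. */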
647 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
648 {
649         struct dmar_drhd_unit *drhd = NULL;
650         int i;
651
652         for_each_drhd_unit(drhd) {
653                 if (drhd->ignored)
654                         continue;
655                 if (segment != drhd->segment)
656                         continue;
657
658                 for (i = 0; i < drhd->devices_cnt; i++) {
659                         if (drhd->devices[i] &&
660                             drhd->devices[i]->bus->number == bus &&
661                             drhd->devices[i]->devfn == devfn)
662                                 return drhd->iommu;
663                         if (drhd->devices[i] &&
664                             drhd->devices[i]->subordinate &&
665                             drhd->devices[i]->subordinate->number <= bus &&
666                             drhd->devices[i]->subordinate->busn_res.end >= bus)
667                                 return drhd->iommu;
668                 }
669
670                 if (drhd->include_all)
671                         return drhd->iommu;
672         }
673
674         return NULL;
675 }
676
677 static void domain_flush_cache(struct dmar_domain *domain,
678                                void *addr, int size)
679 {
680         if (!domain->iommu_coherency)
681                 clflush_cache_range(addr, size);
682 }
683
684 /* Gets context entry for a given bus and devfn */
685 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
686                 u8 bus, u8 devfn)
687 {
688         struct root_entry *root;
689         struct context_entry *context;
690         unsigned long phy_addr;
691         unsigned long flags;
692
693         spin_lock_irqsave(&iommu->lock, flags);
694         root = &iommu->root_entry[bus];
695         context = get_context_addr_from_root(root);
696         if (!context) {
697                 context = (struct context_entry *)
698                                 alloc_pgtable_page(iommu->node);
699                 if (!context) {
700                         spin_unlock_irqrestore(&iommu->lock, flags);
701                         return NULL;
702                 }
703                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
704                 phy_addr = virt_to_phys((void *)context);
705                 set_root_value(root, phy_addr);
706                 set_root_present(root);
707                 __iommu_flush_cache(iommu, root, sizeof(*root));
708         }
709         spin_unlock_irqrestore(&iommu->lock, flags);
710         return &context[devfn];
711 }
712
713 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
714 {
715         struct root_entry *root;
716         struct context_entry *context;
717         int ret;
718         unsigned long flags;
719
720         spin_lock_irqsave(&iommu->lock, flags);
721         root = &iommu->root_entry[bus];
722         context = get_context_addr_from_root(root);
723         if (!context) {
724                 ret = 0;
725                 goto out;
726         }
727         ret = context_present(&context[devfn]);
728 out:
729         spin_unlock_irqrestore(&iommu->lock, flags);
730         return ret;
731 }
732
733 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
734 {
735         struct root_entry *root;
736         struct context_entry *context;
737         unsigned long flags;
738
739         spin_lock_irqsave(&iommu->lock, flags);
740         root = &iommu->root_entry[bus];
741         context = get_context_addr_from_root(root);
742         if (context) {
743                 context_clear_entry(&context[devfn]);
744                 __iommu_flush_cache(iommu, &context[devfn], \
745                         sizeof(*context));
746         }
747         spin_unlock_irqrestore(&iommu->lock, flags);
748 }
749
750 static void free_context_table(struct intel_iommu *iommu)
751 {
752         struct root_entry *root;
753         int i;
754         unsigned long flags;
755         struct context_entry *context;
756
757         spin_lock_irqsave(&iommu->lock, flags);
758         if (!iommu->root_entry) {
759                 goto out;
760         }
761         for (i = 0; i < ROOT_ENTRY_NR; i++) {
762                 root = &iommu->root_entry[i];
763                 context = get_context_addr_from_root(root);
764                 if (context)
765                         free_pgtable_page(context);
766         }
767         free_pgtable_page(iommu->root_entry);
768         iommu->root_entry = NULL;
769 out:
770         spin_unlock_irqrestore(&iommu->lock, flags);
771 }
772
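/* Walk the domain's page table to the pte covering @pfn at @target_level,
 * allocating intermediate page-table pages as needed; a target_level of 0
 * stops at the first superpage or non-present entry. */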
773 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
774                                       unsigned long pfn, int target_level)
775 {
776         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
777         struct dma_pte *parent, *pte = NULL;
778         int level = agaw_to_level(domain->agaw);
779         int offset;
780
781         BUG_ON(!domain->pgd);
782         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
783         parent = domain->pgd;
784
785         while (level > 0) {
786                 void *tmp_page;
787
788                 offset = pfn_level_offset(pfn, level);
789                 pte = &parent[offset];
790                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
791                         break;
792                 if (level == target_level)
793                         break;
794
795                 if (!dma_pte_present(pte)) {
796                         uint64_t pteval;
797
798                         tmp_page = alloc_pgtable_page(domain->nid);
799
800                         if (!tmp_page)
801                                 return NULL;
802
803                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
804                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
805                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
806                                 /* Someone else set it while we were thinking; use theirs. */
807                                 free_pgtable_page(tmp_page);
808                         } else {
809                                 dma_pte_addr(pte);
810                                 domain_flush_cache(domain, pte, sizeof(*pte));
811                         }
812                 }
813                 parent = phys_to_virt(dma_pte_addr(pte));
814                 level--;
815         }
816
817         return pte;
818 }
819
820
821 /* return address's pte at specific level */
822 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
823                                          unsigned long pfn,
824                                          int level, int *large_page)
825 {
826         struct dma_pte *parent, *pte = NULL;
827         int total = agaw_to_level(domain->agaw);
828         int offset;
829
830         parent = domain->pgd;
831         while (level <= total) {
832                 offset = pfn_level_offset(pfn, total);
833                 pte = &parent[offset];
834                 if (level == total)
835                         return pte;
836
837                 if (!dma_pte_present(pte)) {
838                         *large_page = total;
839                         break;
840                 }
841
842                 if (pte->val & DMA_PTE_LARGE_PAGE) {
843                         *large_page = total;
844                         return pte;
845                 }
846
847                 parent = phys_to_virt(dma_pte_addr(pte));
848                 total--;
849         }
850         return NULL;
851 }
852
853 /* clear last level pte; a tlb flush should follow */
854 static int dma_pte_clear_range(struct dmar_domain *domain,
855                                 unsigned long start_pfn,
856                                 unsigned long last_pfn)
857 {
858         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
859         unsigned int large_page = 1;
860         struct dma_pte *first_pte, *pte;
861         int order;
862
863         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
864         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
865         BUG_ON(start_pfn > last_pfn);
866
867         /* we don't need lock here; nobody else touches the iova range */
868         do {
869                 large_page = 1;
870                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
871                 if (!pte) {
872                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
873                         continue;
874                 }
875                 do {
876                         dma_clear_pte(pte);
877                         start_pfn += lvl_to_nr_pages(large_page);
878                         pte++;
879                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
880
881                 domain_flush_cache(domain, first_pte,
882                                    (void *)pte - (void *)first_pte);
883
884         } while (start_pfn && start_pfn <= last_pfn);
885
886         order = (large_page - 1) * 9;
887         return order;
888 }
889
890 /* free page table pages. last level pte should already be cleared */
891 static void dma_pte_free_pagetable(struct dmar_domain *domain,
892                                    unsigned long start_pfn,
893                                    unsigned long last_pfn)
894 {
895         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
896         struct dma_pte *first_pte, *pte;
897         int total = agaw_to_level(domain->agaw);
898         int level;
899         unsigned long tmp;
900         int large_page = 2;
901
902         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
903         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
904         BUG_ON(start_pfn > last_pfn);
905
906         /* We don't need lock here; nobody else touches the iova range */
907         level = 2;
908         while (level <= total) {
909                 tmp = align_to_level(start_pfn, level);
910
911                 /* If we can't even clear one PTE at this level, we're done */
912                 if (tmp + level_size(level) - 1 > last_pfn)
913                         return;
914
915                 do {
916                         large_page = level;
917                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
918                         if (large_page > level)
919                                 level = large_page + 1;
920                         if (!pte) {
921                                 tmp = align_to_level(tmp + 1, level + 1);
922                                 continue;
923                         }
924                         do {
925                                 if (dma_pte_present(pte)) {
926                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
927                                         dma_clear_pte(pte);
928                                 }
929                                 pte++;
930                                 tmp += level_size(level);
931                         } while (!first_pte_in_page(pte) &&
932                                  tmp + level_size(level) - 1 <= last_pfn);
933
934                         domain_flush_cache(domain, first_pte,
935                                            (void *)pte - (void *)first_pte);
936                         
937                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
938                 level++;
939         }
940         /* free pgd */
941         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
942                 free_pgtable_page(domain->pgd);
943                 domain->pgd = NULL;
944         }
945 }
946
947 /* iommu handling */
948 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
949 {
950         struct root_entry *root;
951         unsigned long flags;
952
953         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
954         if (!root)
955                 return -ENOMEM;
956
957         __iommu_flush_cache(iommu, root, ROOT_SIZE);
958
959         spin_lock_irqsave(&iommu->lock, flags);
960         iommu->root_entry = root;
961         spin_unlock_irqrestore(&iommu->lock, flags);
962
963         return 0;
964 }
965
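/* Program the root table address into DMAR_RTADDR_REG, issue the Set Root
 * Table Pointer command and wait for the RTPS status bit. */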
966 static void iommu_set_root_entry(struct intel_iommu *iommu)
967 {
968         void *addr;
969         u32 sts;
970         unsigned long flag;
971
972         addr = iommu->root_entry;
973
974         raw_spin_lock_irqsave(&iommu->register_lock, flag);
975         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
976
977         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
978
979         /* Make sure hardware completes it */
980         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
981                       readl, (sts & DMA_GSTS_RTPS), sts);
982
983         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
984 }
985
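/* Flush the write buffer when required (RWBF capability or quirk) and wait
 * for the WBFS status bit to clear. */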
986 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
987 {
988         u32 val;
989         unsigned long flag;
990
991         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
992                 return;
993
994         raw_spin_lock_irqsave(&iommu->register_lock, flag);
995         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
996
997         /* Make sure hardware completes it */
998         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
999                       readl, (!(val & DMA_GSTS_WBFS)), val);
1000
1001         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1002 }
1003
1004 /* return value determines if we need a write buffer flush */
1005 static void __iommu_flush_context(struct intel_iommu *iommu,
1006                                   u16 did, u16 source_id, u8 function_mask,
1007                                   u64 type)
1008 {
1009         u64 val = 0;
1010         unsigned long flag;
1011
1012         switch (type) {
1013         case DMA_CCMD_GLOBAL_INVL:
1014                 val = DMA_CCMD_GLOBAL_INVL;
1015                 break;
1016         case DMA_CCMD_DOMAIN_INVL:
1017                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1018                 break;
1019         case DMA_CCMD_DEVICE_INVL:
1020                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1021                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1022                 break;
1023         default:
1024                 BUG();
1025         }
1026         val |= DMA_CCMD_ICC;
1027
1028         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1029         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1030
1031         /* Make sure hardware completes it */
1032         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1033                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1034
1035         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1036 }
1037
1038 /* return value determines if we need a write buffer flush */
1039 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1040                                 u64 addr, unsigned int size_order, u64 type)
1041 {
1042         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1043         u64 val = 0, val_iva = 0;
1044         unsigned long flag;
1045
1046         switch (type) {
1047         case DMA_TLB_GLOBAL_FLUSH:
1048                 /* global flush doesn't need to set IVA_REG */
1049                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1050                 break;
1051         case DMA_TLB_DSI_FLUSH:
1052                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1053                 break;
1054         case DMA_TLB_PSI_FLUSH:
1055                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1056                 /* Note: always flush non-leaf currently */
1057                 val_iva = size_order | addr;
1058                 break;
1059         default:
1060                 BUG();
1061         }
1062         /* Note: set drain read/write */
1063 #if 0
1064         /*
1065          * This is probably only there to be extra safe; it looks like we can
1066          * ignore it without any impact.
1067          */
1068         if (cap_read_drain(iommu->cap))
1069                 val |= DMA_TLB_READ_DRAIN;
1070 #endif
1071         if (cap_write_drain(iommu->cap))
1072                 val |= DMA_TLB_WRITE_DRAIN;
1073
1074         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1075         /* Note: Only uses first TLB reg currently */
1076         if (val_iva)
1077                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1078         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1079
1080         /* Make sure hardware completes it */
1081         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1082                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1083
1084         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1085
1086         /* check IOTLB invalidation granularity */
1087         if (DMA_TLB_IAIG(val) == 0)
1088                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1089         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1090                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1091                         (unsigned long long)DMA_TLB_IIRG(type),
1092                         (unsigned long long)DMA_TLB_IAIG(val));
1093 }
1094
1095 static struct device_domain_info *iommu_support_dev_iotlb(
1096         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1097 {
1098         int found = 0;
1099         unsigned long flags;
1100         struct device_domain_info *info;
1101         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1102
1103         if (!ecap_dev_iotlb_support(iommu->ecap))
1104                 return NULL;
1105
1106         if (!iommu->qi)
1107                 return NULL;
1108
1109         spin_lock_irqsave(&device_domain_lock, flags);
1110         list_for_each_entry(info, &domain->devices, link)
1111                 if (info->bus == bus && info->devfn == devfn) {
1112                         found = 1;
1113                         break;
1114                 }
1115         spin_unlock_irqrestore(&device_domain_lock, flags);
1116
1117         if (!found || !info->dev)
1118                 return NULL;
1119
1120         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1121                 return NULL;
1122
1123         if (!dmar_find_matched_atsr_unit(info->dev))
1124                 return NULL;
1125
1126         info->iommu = iommu;
1127
1128         return info;
1129 }
1130
1131 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1132 {
1133         if (!info)
1134                 return;
1135
1136         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1137 }
1138
1139 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1140 {
1141         if (!info->dev || !pci_ats_enabled(info->dev))
1142                 return;
1143
1144         pci_disable_ats(info->dev);
1145 }
1146
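/* Issue a device-IOTLB (ATS) invalidation for every ATS-enabled device
 * attached to the domain. */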
1147 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1148                                   u64 addr, unsigned mask)
1149 {
1150         u16 sid, qdep;
1151         unsigned long flags;
1152         struct device_domain_info *info;
1153
1154         spin_lock_irqsave(&device_domain_lock, flags);
1155         list_for_each_entry(info, &domain->devices, link) {
1156                 if (!info->dev || !pci_ats_enabled(info->dev))
1157                         continue;
1158
1159                 sid = info->bus << 8 | info->devfn;
1160                 qdep = pci_ats_queue_depth(info->dev);
1161                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1162         }
1163         spin_unlock_irqrestore(&device_domain_lock, flags);
1164 }
1165
1166 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1167                                   unsigned long pfn, unsigned int pages, int map)
1168 {
1169         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1170         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1171
1172         BUG_ON(pages == 0);
1173
1174         /*
1175          * Fall back to a domain-selective flush if there is no PSI support or the
1176          * size is too big.
1177          * PSI requires the size to be a power of two (in pages), and the base
1178          * address to be naturally aligned to that size.
1179          */
1180         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1181                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1182                                                 DMA_TLB_DSI_FLUSH);
1183         else
1184                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1185                                                 DMA_TLB_PSI_FLUSH);
1186
1187         /*
1188          * In caching mode, changes of pages from non-present to present require
1189          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1190          */
1191         if (!cap_caching_mode(iommu->cap) || !map)
1192                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1193 }
1194
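/* Clear the Enable Protected Memory bit and wait until the PRS status bit
 * reports the protected regions are disabled. */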
1195 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1196 {
1197         u32 pmen;
1198         unsigned long flags;
1199
1200         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1201         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1202         pmen &= ~DMA_PMEN_EPM;
1203         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1204
1205         /* wait for the protected region status bit to clear */
1206         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1207                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1208
1209         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1210 }
1211
1212 static int iommu_enable_translation(struct intel_iommu *iommu)
1213 {
1214         u32 sts;
1215         unsigned long flags;
1216
1217         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218         iommu->gcmd |= DMA_GCMD_TE;
1219         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1220
1221         /* Make sure hardware completes it */
1222         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                       readl, (sts & DMA_GSTS_TES), sts);
1224
1225         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1226         return 0;
1227 }
1228
1229 static int iommu_disable_translation(struct intel_iommu *iommu)
1230 {
1231         u32 sts;
1232         unsigned long flag;
1233
1234         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1235         iommu->gcmd &= ~DMA_GCMD_TE;
1236         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237
1238         /* Make sure hardware completes it */
1239         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240                       readl, (!(sts & DMA_GSTS_TES)), sts);
1241
1242         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243         return 0;
1244 }
1245
1246
1247 static int iommu_init_domains(struct intel_iommu *iommu)
1248 {
1249         unsigned long ndomains;
1250         unsigned long nlongs;
1251
1252         ndomains = cap_ndoms(iommu->cap);
1253         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1254                         ndomains);
1255         nlongs = BITS_TO_LONGS(ndomains);
1256
1257         spin_lock_init(&iommu->lock);
1258
1259         /* TBD: there might be 64K domains,
1260          * consider other allocation for future chips
1261          */
1262         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1263         if (!iommu->domain_ids) {
1264                 printk(KERN_ERR "Allocating domain id array failed\n");
1265                 return -ENOMEM;
1266         }
1267         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1268                         GFP_KERNEL);
1269         if (!iommu->domains) {
1270                 printk(KERN_ERR "Allocating domain array failed\n");
1271                 return -ENOMEM;
1272         }
1273
1274         /*
1275          * if Caching mode is set, then invalid translations are tagged
1276          * with domain id 0. Hence we need to pre-allocate it.
1277          */
1278         if (cap_caching_mode(iommu->cap))
1279                 set_bit(0, iommu->domain_ids);
1280         return 0;
1281 }
1282
1283
1284 static void domain_exit(struct dmar_domain *domain);
1285 static void vm_domain_exit(struct dmar_domain *domain);
1286
1287 void free_dmar_iommu(struct intel_iommu *iommu)
1288 {
1289         struct dmar_domain *domain;
1290         int i;
1291         unsigned long flags;
1292
1293         if ((iommu->domains) && (iommu->domain_ids)) {
1294                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1295                         domain = iommu->domains[i];
1296                         clear_bit(i, iommu->domain_ids);
1297
1298                         spin_lock_irqsave(&domain->iommu_lock, flags);
1299                         if (--domain->iommu_count == 0) {
1300                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1301                                         vm_domain_exit(domain);
1302                                 else
1303                                         domain_exit(domain);
1304                         }
1305                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1306                 }
1307         }
1308
1309         if (iommu->gcmd & DMA_GCMD_TE)
1310                 iommu_disable_translation(iommu);
1311
1312         if (iommu->irq) {
1313                 irq_set_handler_data(iommu->irq, NULL);
1314                 /* This will mask the irq */
1315                 free_irq(iommu->irq, iommu);
1316                 destroy_irq(iommu->irq);
1317         }
1318
1319         kfree(iommu->domains);
1320         kfree(iommu->domain_ids);
1321
1322         g_iommus[iommu->seq_id] = NULL;
1323
1324         /* if all iommus are freed, free g_iommus */
1325         for (i = 0; i < g_num_of_iommus; i++) {
1326                 if (g_iommus[i])
1327                         break;
1328         }
1329
1330         if (i == g_num_of_iommus)
1331                 kfree(g_iommus);
1332
1333         /* free context mapping */
1334         free_context_table(iommu);
1335 }
1336
1337 static struct dmar_domain *alloc_domain(void)
1338 {
1339         struct dmar_domain *domain;
1340
1341         domain = alloc_domain_mem();
1342         if (!domain)
1343                 return NULL;
1344
1345         domain->nid = -1;
1346         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1347         domain->flags = 0;
1348
1349         return domain;
1350 }
1351
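/* Allocate a free domain id on @iommu for @domain, record the domain in
 * iommu->domains[] and mark this iommu in the domain's iommu bitmap. */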
1352 static int iommu_attach_domain(struct dmar_domain *domain,
1353                                struct intel_iommu *iommu)
1354 {
1355         int num;
1356         unsigned long ndomains;
1357         unsigned long flags;
1358
1359         ndomains = cap_ndoms(iommu->cap);
1360
1361         spin_lock_irqsave(&iommu->lock, flags);
1362
1363         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1364         if (num >= ndomains) {
1365                 spin_unlock_irqrestore(&iommu->lock, flags);
1366                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1367                 return -ENOMEM;
1368         }
1369
1370         domain->id = num;
1371         set_bit(num, iommu->domain_ids);
1372         set_bit(iommu->seq_id, domain->iommu_bmp);
1373         iommu->domains[num] = domain;
1374         spin_unlock_irqrestore(&iommu->lock, flags);
1375
1376         return 0;
1377 }
1378
1379 static void iommu_detach_domain(struct dmar_domain *domain,
1380                                 struct intel_iommu *iommu)
1381 {
1382         unsigned long flags;
1383         int num, ndomains;
1384         int found = 0;
1385
1386         spin_lock_irqsave(&iommu->lock, flags);
1387         ndomains = cap_ndoms(iommu->cap);
1388         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1389                 if (iommu->domains[num] == domain) {
1390                         found = 1;
1391                         break;
1392                 }
1393         }
1394
1395         if (found) {
1396                 clear_bit(num, iommu->domain_ids);
1397                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1398                 iommu->domains[num] = NULL;
1399         }
1400         spin_unlock_irqrestore(&iommu->lock, flags);
1401 }
1402
1403 static struct iova_domain reserved_iova_list;
1404 static struct lock_class_key reserved_rbtree_key;
1405
1406 static int dmar_init_reserved_ranges(void)
1407 {
1408         struct pci_dev *pdev = NULL;
1409         struct iova *iova;
1410         int i;
1411
1412         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1413
1414         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1415                 &reserved_rbtree_key);
1416
1417         /* IOAPIC ranges shouldn't be accessed by DMA */
1418         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1419                 IOVA_PFN(IOAPIC_RANGE_END));
1420         if (!iova) {
1421                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1422                 return -ENODEV;
1423         }
1424
1425         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1426         for_each_pci_dev(pdev) {
1427                 struct resource *r;
1428
1429                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1430                         r = &pdev->resource[i];
1431                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1432                                 continue;
1433                         iova = reserve_iova(&reserved_iova_list,
1434                                             IOVA_PFN(r->start),
1435                                             IOVA_PFN(r->end));
1436                         if (!iova) {
1437                                 printk(KERN_ERR "Reserve iova failed\n");
1438                                 return -ENODEV;
1439                         }
1440                 }
1441         }
1442         return 0;
1443 }
1444
1445 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1446 {
1447         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1448 }
1449
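/* Round the guest address width up so that (gaw - 12) is a multiple of the
 * 9-bit level stride, capped at 64 bits; e.g. a 40-bit guest width is
 * adjusted to 48 bits. */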
1450 static inline int guestwidth_to_adjustwidth(int gaw)
1451 {
1452         int agaw;
1453         int r = (gaw - 12) % 9;
1454
1455         if (r == 0)
1456                 agaw = gaw;
1457         else
1458                 agaw = gaw + 9 - r;
1459         if (agaw > 64)
1460                 agaw = 64;
1461         return agaw;
1462 }
1463
1464 static int domain_init(struct dmar_domain *domain, int guest_width)
1465 {
1466         struct intel_iommu *iommu;
1467         int adjust_width, agaw;
1468         unsigned long sagaw;
1469
1470         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1471         spin_lock_init(&domain->iommu_lock);
1472
1473         domain_reserve_special_ranges(domain);
1474
1475         /* calculate AGAW */
1476         iommu = domain_get_iommu(domain);
1477         if (guest_width > cap_mgaw(iommu->cap))
1478                 guest_width = cap_mgaw(iommu->cap);
1479         domain->gaw = guest_width;
1480         adjust_width = guestwidth_to_adjustwidth(guest_width);
1481         agaw = width_to_agaw(adjust_width);
1482         sagaw = cap_sagaw(iommu->cap);
1483         if (!test_bit(agaw, &sagaw)) {
1484                 /* hardware doesn't support it, choose a bigger one */
1485                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1486                 agaw = find_next_bit(&sagaw, 5, agaw);
1487                 if (agaw >= 5)
1488                         return -ENODEV;
1489         }
1490         domain->agaw = agaw;
1491         INIT_LIST_HEAD(&domain->devices);
1492
1493         if (ecap_coherent(iommu->ecap))
1494                 domain->iommu_coherency = 1;
1495         else
1496                 domain->iommu_coherency = 0;
1497
1498         if (ecap_sc_support(iommu->ecap))
1499                 domain->iommu_snooping = 1;
1500         else
1501                 domain->iommu_snooping = 0;
1502
1503         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1504         domain->iommu_count = 1;
1505         domain->nid = iommu->node;
1506
1507         /* always allocate the top pgd */
1508         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1509         if (!domain->pgd)
1510                 return -ENOMEM;
1511         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1512         return 0;
1513 }
1514
1515 static void domain_exit(struct dmar_domain *domain)
1516 {
1517         struct dmar_drhd_unit *drhd;
1518         struct intel_iommu *iommu;
1519
1520         /* Domain 0 is reserved, so don't process it */
1521         if (!domain)
1522                 return;
1523
1524         /* Flush any lazy unmaps that may reference this domain */
1525         if (!intel_iommu_strict)
1526                 flush_unmaps_timeout(0);
1527
1528         domain_remove_dev_info(domain);
1529         /* destroy iovas */
1530         put_iova_domain(&domain->iovad);
1531
1532         /* clear ptes */
1533         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1534
1535         /* free page tables */
1536         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1537
1538         for_each_active_iommu(iommu, drhd)
1539                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1540                         iommu_detach_domain(domain, iommu);
1541
1542         free_domain_mem(domain);
1543 }
1544
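/* Install the context entry for (segment, bus, devfn) pointing at @domain's
 * page tables (or pass-through), allocating a domain id on this iommu for
 * VM and static-identity domains. */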
1545 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1546                                  u8 bus, u8 devfn, int translation)
1547 {
1548         struct context_entry *context;
1549         unsigned long flags;
1550         struct intel_iommu *iommu;
1551         struct dma_pte *pgd;
1552         unsigned long num;
1553         unsigned long ndomains;
1554         int id;
1555         int agaw;
1556         struct device_domain_info *info = NULL;
1557
1558         pr_debug("Set context mapping for %02x:%02x.%d\n",
1559                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1560
1561         BUG_ON(!domain->pgd);
1562         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1563                translation != CONTEXT_TT_MULTI_LEVEL);
1564
1565         iommu = device_to_iommu(segment, bus, devfn);
1566         if (!iommu)
1567                 return -ENODEV;
1568
1569         context = device_to_context_entry(iommu, bus, devfn);
1570         if (!context)
1571                 return -ENOMEM;
1572         spin_lock_irqsave(&iommu->lock, flags);
1573         if (context_present(context)) {
1574                 spin_unlock_irqrestore(&iommu->lock, flags);
1575                 return 0;
1576         }
1577
1578         id = domain->id;
1579         pgd = domain->pgd;
1580
1581         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1582             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1583                 int found = 0;
1584
1585                 /* find an available domain id for this device in iommu */
1586                 ndomains = cap_ndoms(iommu->cap);
1587                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1588                         if (iommu->domains[num] == domain) {
1589                                 id = num;
1590                                 found = 1;
1591                                 break;
1592                         }
1593                 }
1594
1595                 if (found == 0) {
1596                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1597                         if (num >= ndomains) {
1598                                 spin_unlock_irqrestore(&iommu->lock, flags);
1599                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1600                                 return -EFAULT;
1601                         }
1602
1603                         set_bit(num, iommu->domain_ids);
1604                         iommu->domains[num] = domain;
1605                         id = num;
1606                 }
1607
1608                 /* Skip top levels of page tables for
1609                  * IOMMUs which have a smaller agaw than the domain.
1610                  * Unnecessary for PT mode.
1611                  */
1612                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1613                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1614                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1615                                 if (!dma_pte_present(pgd)) {
1616                                         spin_unlock_irqrestore(&iommu->lock, flags);
1617                                         return -ENOMEM;
1618                                 }
1619                         }
1620                 }
1621         }
1622
1623         context_set_domain_id(context, id);
1624
1625         if (translation != CONTEXT_TT_PASS_THROUGH) {
1626                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1627                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1628                                      CONTEXT_TT_MULTI_LEVEL;
1629         }
1630         /*
1631          * In pass through mode, AW must be programmed to indicate the largest
1632          * AGAW value supported by hardware. And ASR is ignored by hardware.
1633          */
1634         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1635                 context_set_address_width(context, iommu->msagaw);
1636         else {
1637                 context_set_address_root(context, virt_to_phys(pgd));
1638                 context_set_address_width(context, iommu->agaw);
1639         }
1640
1641         context_set_translation_type(context, translation);
1642         context_set_fault_enable(context);
1643         context_set_present(context);
1644         domain_flush_cache(domain, context, sizeof(*context));
1645
1646         /*
1647          * It's a non-present to present mapping. If hardware doesn't cache
1648          * non-present entries we only need to flush the write-buffer. If it
1649          * _does_ cache non-present entries, then it does so in the special
1650          * domain #0, which we have to flush:
1651          */
1652         if (cap_caching_mode(iommu->cap)) {
1653                 iommu->flush.flush_context(iommu, 0,
1654                                            (((u16)bus) << 8) | devfn,
1655                                            DMA_CCMD_MASK_NOBIT,
1656                                            DMA_CCMD_DEVICE_INVL);
1657                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1658         } else {
1659                 iommu_flush_write_buffer(iommu);
1660         }
1661         iommu_enable_dev_iotlb(info);
1662         spin_unlock_irqrestore(&iommu->lock, flags);
1663
1664         spin_lock_irqsave(&domain->iommu_lock, flags);
1665         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1666                 domain->iommu_count++;
1667                 if (domain->iommu_count == 1)
1668                         domain->nid = iommu->node;
1669                 domain_update_iommu_cap(domain);
1670         }
1671         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1672         return 0;
1673 }
1674
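/*
 * Map the device itself and then every bridge on the path up to its
 * upstream PCIe-to-PCI bridge, if any.  DMA requests from devices behind
 * a conventional PCI bridge carry the bridge's source-id, so those
 * bridges need context entries for this domain too.  A PCIe-to-PCI
 * bridge is mapped with (secondary bus, devfn 0), a legacy PCI bridge
 * with its own bus/devfn.
 */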
1675 static int
1676 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1677                         int translation)
1678 {
1679         int ret;
1680         struct pci_dev *tmp, *parent;
1681
1682         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1683                                          pdev->bus->number, pdev->devfn,
1684                                          translation);
1685         if (ret)
1686                 return ret;
1687
1688         /* dependent device mapping */
1689         tmp = pci_find_upstream_pcie_bridge(pdev);
1690         if (!tmp)
1691                 return 0;
1692         /* Secondary interface's bus number and devfn 0 */
1693         parent = pdev->bus->self;
1694         while (parent != tmp) {
1695                 ret = domain_context_mapping_one(domain,
1696                                                  pci_domain_nr(parent->bus),
1697                                                  parent->bus->number,
1698                                                  parent->devfn, translation);
1699                 if (ret)
1700                         return ret;
1701                 parent = parent->bus->self;
1702         }
1703         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1704                 return domain_context_mapping_one(domain,
1705                                         pci_domain_nr(tmp->subordinate),
1706                                         tmp->subordinate->number, 0,
1707                                         translation);
1708         else /* this is a legacy PCI bridge */
1709                 return domain_context_mapping_one(domain,
1710                                                   pci_domain_nr(tmp->bus),
1711                                                   tmp->bus->number,
1712                                                   tmp->devfn,
1713                                                   translation);
1714 }
1715
1716 static int domain_context_mapped(struct pci_dev *pdev)
1717 {
1718         int ret;
1719         struct pci_dev *tmp, *parent;
1720         struct intel_iommu *iommu;
1721
1722         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1723                                 pdev->devfn);
1724         if (!iommu)
1725                 return -ENODEV;
1726
1727         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1728         if (!ret)
1729                 return ret;
1730         /* dependent device mapping */
1731         tmp = pci_find_upstream_pcie_bridge(pdev);
1732         if (!tmp)
1733                 return ret;
1734         /* Secondary interface's bus number and devfn 0 */
1735         parent = pdev->bus->self;
1736         while (parent != tmp) {
1737                 ret = device_context_mapped(iommu, parent->bus->number,
1738                                             parent->devfn);
1739                 if (!ret)
1740                         return ret;
1741                 parent = parent->bus->self;
1742         }
1743         if (pci_is_pcie(tmp))
1744                 return device_context_mapped(iommu, tmp->subordinate->number,
1745                                              0);
1746         else
1747                 return device_context_mapped(iommu, tmp->bus->number,
1748                                              tmp->devfn);
1749 }
1750
1751 /* Returns a number of VTD pages, but aligned to MM page size */
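/* For example, with 4KiB pages (x86, PAGE_SHIFT == VTD_PAGE_SHIFT): a
   0x1000-byte buffer starting at offset 0x200 into a page yields
   PAGE_ALIGN(0x200 + 0x1000) >> 12 == 2 pages, since it straddles a
   page boundary. */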
1752 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1753                                             size_t size)
1754 {
1755         host_addr &= ~PAGE_MASK;
1756         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1757 }
1758
1759 /* Return largest possible superpage level for a given mapping */
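/* Level 1 means ordinary 4KiB pages, level 2 means 2MiB superpages,
   level 3 means 1GiB, each further level adding 9 bits of stride.  A
   higher level is only returned when the IOVA and physical PFNs are
   both aligned to it, at least one full superpage worth of pages
   remains, and the hardware advertises support for that size. */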
1760 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1761                                           unsigned long iov_pfn,
1762                                           unsigned long phy_pfn,
1763                                           unsigned long pages)
1764 {
1765         int support, level = 1;
1766         unsigned long pfnmerge;
1767
1768         support = domain->iommu_superpage;
1769
1770         /* To use a large page, the virtual *and* physical addresses
1771            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1772            of them will mean we have to use smaller pages. So just
1773            merge them and check both at once. */
1774         pfnmerge = iov_pfn | phy_pfn;
1775
1776         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1777                 pages >>= VTD_STRIDE_SHIFT;
1778                 if (!pages)
1779                         break;
1780                 pfnmerge >>= VTD_STRIDE_SHIFT;
1781                 level++;
1782                 support--;
1783         }
1784         return level;
1785 }
1786
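/*
 * Install PTEs for nr_pages VT-d pages starting at iov_pfn.  The backing
 * memory is either a scatterlist (sg != NULL) or a contiguous physical
 * range starting at phys_pfn.  sg_res counts the VT-d pages left in the
 * current scatterlist entry; in the contiguous case it is primed to
 * nr_pages + 1 so it never runs out first.  Superpage PTEs are used
 * whenever hardware_largepage_caps() allows, and the CPU cache is
 * flushed for each page of PTEs once it is full, when the mapping ends,
 * or when a superpage run falls back to smaller pages.
 */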
1787 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1788                             struct scatterlist *sg, unsigned long phys_pfn,
1789                             unsigned long nr_pages, int prot)
1790 {
1791         struct dma_pte *first_pte = NULL, *pte = NULL;
1792         phys_addr_t uninitialized_var(pteval);
1793         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1794         unsigned long sg_res;
1795         unsigned int largepage_lvl = 0;
1796         unsigned long lvl_pages = 0;
1797
1798         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1799
1800         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1801                 return -EINVAL;
1802
1803         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1804
1805         if (sg)
1806                 sg_res = 0;
1807         else {
1808                 sg_res = nr_pages + 1;
1809                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1810         }
1811
1812         while (nr_pages > 0) {
1813                 uint64_t tmp;
1814
1815                 if (!sg_res) {
1816                         sg_res = aligned_nrpages(sg->offset, sg->length);
1817                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1818                         sg->dma_length = sg->length;
1819                         pteval = page_to_phys(sg_page(sg)) | prot;
1820                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1821                 }
1822
1823                 if (!pte) {
1824                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1825
1826                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1827                         if (!pte)
1828                                 return -ENOMEM;
1829                         /* It is a large page */
1830                         if (largepage_lvl > 1) {
1831                                 pteval |= DMA_PTE_LARGE_PAGE;
1832                                 /* Ensure that old small page tables are removed to make room
1833                                    for superpage, if they exist. */
1834                                 dma_pte_clear_range(domain, iov_pfn,
1835                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1836                                 dma_pte_free_pagetable(domain, iov_pfn,
1837                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1838                         } else {
1839                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1840                         }
1841
1842                 }
1843                 /* We don't need a lock here; nobody else
1844                  * touches the iova range
1845                  */
1846                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1847                 if (tmp) {
1848                         static int dumps = 5;
1849                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1850                                iov_pfn, tmp, (unsigned long long)pteval);
1851                         if (dumps) {
1852                                 dumps--;
1853                                 debug_dma_dump_mappings(NULL);
1854                         }
1855                         WARN_ON(1);
1856                 }
1857
1858                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1859
1860                 BUG_ON(nr_pages < lvl_pages);
1861                 BUG_ON(sg_res < lvl_pages);
1862
1863                 nr_pages -= lvl_pages;
1864                 iov_pfn += lvl_pages;
1865                 phys_pfn += lvl_pages;
1866                 pteval += lvl_pages * VTD_PAGE_SIZE;
1867                 sg_res -= lvl_pages;
1868
1869                 /* If the next PTE would be the first in a new page, then we
1870                    need to flush the cache on the entries we've just written.
1871                    And then we'll need to recalculate 'pte', so clear it and
1872                    let it get set again in the if (!pte) block above.
1873
1874                    If we're done (!nr_pages) we need to flush the cache too.
1875
1876                    Also if we've been setting superpages, we may need to
1877                    recalculate 'pte' and switch back to smaller pages for the
1878                    end of the mapping, if the trailing size is not enough to
1879                    use another superpage (i.e. sg_res < lvl_pages). */
1880                 pte++;
1881                 if (!nr_pages || first_pte_in_page(pte) ||
1882                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1883                         domain_flush_cache(domain, first_pte,
1884                                            (void *)pte - (void *)first_pte);
1885                         pte = NULL;
1886                 }
1887
1888                 if (!sg_res && nr_pages)
1889                         sg = sg_next(sg);
1890         }
1891         return 0;
1892 }
1893
1894 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1895                                     struct scatterlist *sg, unsigned long nr_pages,
1896                                     int prot)
1897 {
1898         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1899 }
1900
1901 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1902                                      unsigned long phys_pfn, unsigned long nr_pages,
1903                                      int prot)
1904 {
1905         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1906 }
1907
1908 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1909 {
1910         if (!iommu)
1911                 return;
1912
1913         clear_context_table(iommu, bus, devfn);
1914         iommu->flush.flush_context(iommu, 0, 0, 0,
1915                                            DMA_CCMD_GLOBAL_INVL);
1916         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1917 }
1918
1919 static inline void unlink_domain_info(struct device_domain_info *info)
1920 {
1921         assert_spin_locked(&device_domain_lock);
1922         list_del(&info->link);
1923         list_del(&info->global);
1924         if (info->dev)
1925                 info->dev->dev.archdata.iommu = NULL;
1926 }
1927
1928 static void domain_remove_dev_info(struct dmar_domain *domain)
1929 {
1930         struct device_domain_info *info;
1931         unsigned long flags;
1932         struct intel_iommu *iommu;
1933
1934         spin_lock_irqsave(&device_domain_lock, flags);
1935         while (!list_empty(&domain->devices)) {
1936                 info = list_entry(domain->devices.next,
1937                         struct device_domain_info, link);
1938                 unlink_domain_info(info);
1939                 spin_unlock_irqrestore(&device_domain_lock, flags);
1940
1941                 iommu_disable_dev_iotlb(info);
1942                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1943                 iommu_detach_dev(iommu, info->bus, info->devfn);
1944                 free_devinfo_mem(info);
1945
1946                 spin_lock_irqsave(&device_domain_lock, flags);
1947         }
1948         spin_unlock_irqrestore(&device_domain_lock, flags);
1949 }
1950
1951 /*
1952  * find_domain
1953  * Note: struct pci_dev->dev.archdata.iommu stores the info
1954  */
1955 static struct dmar_domain *
1956 find_domain(struct pci_dev *pdev)
1957 {
1958         struct device_domain_info *info;
1959
1960         /* No lock here, assumes no domain exit in normal case */
1961         info = pdev->dev.archdata.iommu;
1962         if (info)
1963                 return info->domain;
1964         return NULL;
1965 }
1966
1967 /* Find an existing domain for the device, or allocate and initialize one */
1968 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1969 {
1970         struct dmar_domain *domain, *found = NULL;
1971         struct intel_iommu *iommu;
1972         struct dmar_drhd_unit *drhd;
1973         struct device_domain_info *info, *tmp;
1974         struct pci_dev *dev_tmp;
1975         unsigned long flags;
1976         int bus = 0, devfn = 0;
1977         int segment;
1978         int ret;
1979
1980         domain = find_domain(pdev);
1981         if (domain)
1982                 return domain;
1983
1984         segment = pci_domain_nr(pdev->bus);
1985
1986         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1987         if (dev_tmp) {
1988                 if (pci_is_pcie(dev_tmp)) {
1989                         bus = dev_tmp->subordinate->number;
1990                         devfn = 0;
1991                 } else {
1992                         bus = dev_tmp->bus->number;
1993                         devfn = dev_tmp->devfn;
1994                 }
1995                 spin_lock_irqsave(&device_domain_lock, flags);
1996                 list_for_each_entry(info, &device_domain_list, global) {
1997                         if (info->segment == segment &&
1998                             info->bus == bus && info->devfn == devfn) {
1999                                 found = info->domain;
2000                                 break;
2001                         }
2002                 }
2003                 spin_unlock_irqrestore(&device_domain_lock, flags);
2004                 /* pcie-pci bridge already has a domain, use it */
2005                 if (found) {
2006                         domain = found;
2007                         goto found_domain;
2008                 }
2009         }
2010
2011         domain = alloc_domain();
2012         if (!domain)
2013                 goto error;
2014
2015         /* Allocate new domain for the device */
2016         drhd = dmar_find_matched_drhd_unit(pdev);
2017         if (!drhd) {
2018                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2019                         pci_name(pdev));
2020                 free_domain_mem(domain);
2021                 return NULL;
2022         }
2023         iommu = drhd->iommu;
2024
2025         ret = iommu_attach_domain(domain, iommu);
2026         if (ret) {
2027                 free_domain_mem(domain);
2028                 goto error;
2029         }
2030
2031         if (domain_init(domain, gaw)) {
2032                 domain_exit(domain);
2033                 goto error;
2034         }
2035
2036         /* register pcie-to-pci device */
2037         if (dev_tmp) {
2038                 info = alloc_devinfo_mem();
2039                 if (!info) {
2040                         domain_exit(domain);
2041                         goto error;
2042                 }
2043                 info->segment = segment;
2044                 info->bus = bus;
2045                 info->devfn = devfn;
2046                 info->dev = NULL;
2047                 info->domain = domain;
2048                 /* This domain is shared by devices under p2p bridge */
2049                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2050
2051                 /* pcie-to-pci bridge already has a domain, use it */
2052                 found = NULL;
2053                 spin_lock_irqsave(&device_domain_lock, flags);
2054                 list_for_each_entry(tmp, &device_domain_list, global) {
2055                         if (tmp->segment == segment &&
2056                             tmp->bus == bus && tmp->devfn == devfn) {
2057                                 found = tmp->domain;
2058                                 break;
2059                         }
2060                 }
2061                 if (found) {
2062                         spin_unlock_irqrestore(&device_domain_lock, flags);
2063                         free_devinfo_mem(info);
2064                         domain_exit(domain);
2065                         domain = found;
2066                 } else {
2067                         list_add(&info->link, &domain->devices);
2068                         list_add(&info->global, &device_domain_list);
2069                         spin_unlock_irqrestore(&device_domain_lock, flags);
2070                 }
2071         }
2072
2073 found_domain:
2074         info = alloc_devinfo_mem();
2075         if (!info)
2076                 goto error;
2077         info->segment = segment;
2078         info->bus = pdev->bus->number;
2079         info->devfn = pdev->devfn;
2080         info->dev = pdev;
2081         info->domain = domain;
2082         spin_lock_irqsave(&device_domain_lock, flags);
2083         /* somebody is fast */
2084         found = find_domain(pdev);
2085         if (found != NULL) {
2086                 spin_unlock_irqrestore(&device_domain_lock, flags);
2087                 if (found != domain) {
2088                         domain_exit(domain);
2089                         domain = found;
2090                 }
2091                 free_devinfo_mem(info);
2092                 return domain;
2093         }
2094         list_add(&info->link, &domain->devices);
2095         list_add(&info->global, &device_domain_list);
2096         pdev->dev.archdata.iommu = info;
2097         spin_unlock_irqrestore(&device_domain_lock, flags);
2098         return domain;
2099 error:
2100         /* recheck it here, maybe others set it */
2101         return find_domain(pdev);
2102 }
2103
2104 static int iommu_identity_mapping;
2105 #define IDENTMAP_ALL            1
2106 #define IDENTMAP_GFX            2
2107 #define IDENTMAP_AZALIA         4
2108
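/*
 * Identity-map the physical range [start, end] into the domain, i.e.
 * IOVA == physical address.  The IOVA range is reserved first so the
 * allocator never hands it out for ordinary DMA, and any existing PTEs
 * (an RMRR may overlap normal memory) are cleared before the 1:1
 * mapping is written.
 */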
2109 static int iommu_domain_identity_map(struct dmar_domain *domain,
2110                                      unsigned long long start,
2111                                      unsigned long long end)
2112 {
2113         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2114         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2115
2116         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2117                           dma_to_mm_pfn(last_vpfn))) {
2118                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2119                 return -ENOMEM;
2120         }
2121
2122         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2123                  start, end, domain->id);
2124         /*
2125          * RMRR range might have overlap with physical memory range,
2126          * clear it first
2127          */
2128         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2129
2130         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2131                                   last_vpfn - first_vpfn + 1,
2132                                   DMA_PTE_READ|DMA_PTE_WRITE);
2133 }
2134
2135 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2136                                       unsigned long long start,
2137                                       unsigned long long end)
2138 {
2139         struct dmar_domain *domain;
2140         int ret;
2141
2142         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2143         if (!domain)
2144                 return -ENOMEM;
2145
2146         /* For _hardware_ passthrough, don't bother. But for software
2147            passthrough, we do it anyway -- it may indicate a memory
2148            range which is reserved in E820 and therefore didn't get set
2149            up to start with in si_domain */
2150         if (domain == si_domain && hw_pass_through) {
2151                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2152                        pci_name(pdev), start, end);
2153                 return 0;
2154         }
2155
2156         printk(KERN_INFO
2157                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2158                pci_name(pdev), start, end);
2159         
2160         if (end < start) {
2161                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2162                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2163                         dmi_get_system_info(DMI_BIOS_VENDOR),
2164                         dmi_get_system_info(DMI_BIOS_VERSION),
2165                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2166                 ret = -EIO;
2167                 goto error;
2168         }
2169
2170         if (end >> agaw_to_width(domain->agaw)) {
2171                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2172                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2173                      agaw_to_width(domain->agaw),
2174                      dmi_get_system_info(DMI_BIOS_VENDOR),
2175                      dmi_get_system_info(DMI_BIOS_VERSION),
2176                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2177                 ret = -EIO;
2178                 goto error;
2179         }
2180
2181         ret = iommu_domain_identity_map(domain, start, end);
2182         if (ret)
2183                 goto error;
2184
2185         /* context entry init */
2186         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2187         if (ret)
2188                 goto error;
2189
2190         return 0;
2191
2192  error:
2193         domain_exit(domain);
2194         return ret;
2195 }
2196
2197 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2198         struct pci_dev *pdev)
2199 {
2200         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2201                 return 0;
2202         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2203                 rmrr->end_address);
2204 }
2205
2206 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2207 static inline void iommu_prepare_isa(void)
2208 {
2209         struct pci_dev *pdev;
2210         int ret;
2211
2212         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2213         if (!pdev)
2214                 return;
2215
2216         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2217         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2218
2219         if (ret)
2220                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2221                        "floppy might not work\n");
2222
2223 }
2224 #else
2225 static inline void iommu_prepare_isa(void)
2226 {
2227         return;
2228 }
2229 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2230
2231 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2232
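/*
 * Build the static-identity (si) domain used for 1:1 mappings.  It is
 * attached to every active IOMMU.  With hardware pass-through (hw != 0)
 * no page tables are needed; otherwise every usable RAM range known to
 * memblock is identity-mapped into it.
 */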
2233 static int __init si_domain_init(int hw)
2234 {
2235         struct dmar_drhd_unit *drhd;
2236         struct intel_iommu *iommu;
2237         int nid, ret = 0;
2238
2239         si_domain = alloc_domain();
2240         if (!si_domain)
2241                 return -EFAULT;
2242
2243         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2244
2245         for_each_active_iommu(iommu, drhd) {
2246                 ret = iommu_attach_domain(si_domain, iommu);
2247                 if (ret) {
2248                         domain_exit(si_domain);
2249                         return -EFAULT;
2250                 }
2251         }
2252
2253         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2254                 domain_exit(si_domain);
2255                 return -EFAULT;
2256         }
2257
2258         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2259
2260         if (hw)
2261                 return 0;
2262
2263         for_each_online_node(nid) {
2264                 unsigned long start_pfn, end_pfn;
2265                 int i;
2266
2267                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2268                         ret = iommu_domain_identity_map(si_domain,
2269                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2270                         if (ret)
2271                                 return ret;
2272                 }
2273         }
2274
2275         return 0;
2276 }
2277
2278 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2279                                           struct pci_dev *pdev);
2280 static int identity_mapping(struct pci_dev *pdev)
2281 {
2282         struct device_domain_info *info;
2283
2284         if (likely(!iommu_identity_mapping))
2285                 return 0;
2286
2287         info = pdev->dev.archdata.iommu;
2288         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2289                 return (info->domain == si_domain);
2290
2291         return 0;
2292 }
2293
2294 static int domain_add_dev_info(struct dmar_domain *domain,
2295                                struct pci_dev *pdev,
2296                                int translation)
2297 {
2298         struct device_domain_info *info;
2299         unsigned long flags;
2300         int ret;
2301
2302         info = alloc_devinfo_mem();
2303         if (!info)
2304                 return -ENOMEM;
2305
2306         info->segment = pci_domain_nr(pdev->bus);
2307         info->bus = pdev->bus->number;
2308         info->devfn = pdev->devfn;
2309         info->dev = pdev;
2310         info->domain = domain;
2311
2312         spin_lock_irqsave(&device_domain_lock, flags);
2313         list_add(&info->link, &domain->devices);
2314         list_add(&info->global, &device_domain_list);
2315         pdev->dev.archdata.iommu = info;
2316         spin_unlock_irqrestore(&device_domain_lock, flags);
2317
2318         ret = domain_context_mapping(domain, pdev, translation);
2319         if (ret) {
2320                 spin_lock_irqsave(&device_domain_lock, flags);
2321                 unlink_domain_info(info);
2322                 spin_unlock_irqrestore(&device_domain_lock, flags);
2323                 free_devinfo_mem(info);
2324                 return ret;
2325         }
2326
2327         return 0;
2328 }
2329
2330 static bool device_has_rmrr(struct pci_dev *dev)
2331 {
2332         struct dmar_rmrr_unit *rmrr;
2333         int i;
2334
2335         for_each_rmrr_units(rmrr) {
2336                 for (i = 0; i < rmrr->devices_cnt; i++) {
2337                         /*
2338                          * Return TRUE if this RMRR contains the device that
2339                          * is passed in.
2340                          */
2341                         if (rmrr->devices[i] == dev)
2342                                 return true;
2343                 }
2344         }
2345         return false;
2346 }
2347
2348 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2349 {
2350
2351         /*
2352          * We want to prevent any device associated with an RMRR from
2353          * getting placed into the SI Domain. This is done because
2354          * problems exist when devices are moved in and out of domains
2355          * and their respective RMRR info is lost. We exempt USB devices
2356          * from this process due to their usage of RMRRs that are known
2357          * to not be needed after BIOS hand-off to OS.
2358          */
2359         if (device_has_rmrr(pdev) &&
2360             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2361                 return 0;
2362
2363         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2364                 return 1;
2365
2366         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2367                 return 1;
2368
2369         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2370                 return 0;
2371
2372         /*
2373          * We want to start off with all devices in the 1:1 domain, and
2374          * take them out later if we find they can't access all of memory.
2375          *
2376          * However, we can't do this for PCI devices behind bridges,
2377          * because all PCI devices behind the same bridge will end up
2378          * with the same source-id on their transactions.
2379          *
2380          * Practically speaking, we can't change things around for these
2381          * devices at run-time, because we can't be sure there'll be no
2382          * DMA transactions in flight for any of their siblings.
2383          * 
2384          * So PCI devices (unless they're on the root bus) as well as
2385          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2386          * the 1:1 domain, just in _case_ one of their siblings turns out
2387          * not to be able to map all of memory.
2388          */
2389         if (!pci_is_pcie(pdev)) {
2390                 if (!pci_is_root_bus(pdev->bus))
2391                         return 0;
2392                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2393                         return 0;
2394         } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2395                 return 0;
2396
2397         /* 
2398          * At boot time, we don't yet know if devices will be 64-bit capable.
2399          * Assume that they will -- if they turn out not to be, then we can 
2400          * take them out of the 1:1 domain later.
2401          */
2402         if (!startup) {
2403                 /*
2404                  * If the device's dma_mask is less than the system's memory
2405                  * size then this is not a candidate for identity mapping.
2406                  */
2407                 u64 dma_mask = pdev->dma_mask;
2408
2409                 if (pdev->dev.coherent_dma_mask &&
2410                     pdev->dev.coherent_dma_mask < dma_mask)
2411                         dma_mask = pdev->dev.coherent_dma_mask;
2412
2413                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2414         }
2415
2416         return 1;
2417 }
2418
2419 static int __init iommu_prepare_static_identity_mapping(int hw)
2420 {
2421         struct pci_dev *pdev = NULL;
2422         int ret;
2423
2424         ret = si_domain_init(hw);
2425         if (ret)
2426                 return -EFAULT;
2427
2428         for_each_pci_dev(pdev) {
2429                 if (iommu_should_identity_map(pdev, 1)) {
2430                         ret = domain_add_dev_info(si_domain, pdev,
2431                                              hw ? CONTEXT_TT_PASS_THROUGH :
2432                                                   CONTEXT_TT_MULTI_LEVEL);
2433                         if (ret) {
2434                                 /* device not associated with an iommu */
2435                                 if (ret == -ENODEV)
2436                                         continue;
2437                                 return ret;
2438                         }
2439                         pr_info("IOMMU: %s identity mapping for device %s\n",
2440                                 hw ? "hardware" : "software", pci_name(pdev));
2441                 }
2442         }
2443
2444         return 0;
2445 }
2446
2447 static int __init init_dmars(void)
2448 {
2449         struct dmar_drhd_unit *drhd;
2450         struct dmar_rmrr_unit *rmrr;
2451         struct pci_dev *pdev;
2452         struct intel_iommu *iommu;
2453         int i, ret;
2454
2455         /*
2456          * for each drhd
2457          *    allocate root
2458          *    initialize and program root entry to not present
2459          * endfor
2460          */
2461         for_each_drhd_unit(drhd) {
2462                 /*
2463                  * lock not needed as this is only incremented in the
2464                  * single-threaded kernel __init code path; all other
2465                  * accesses are read-only
2466                  */
2467                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2468                         g_num_of_iommus++;
2469                         continue;
2470                 }
2471                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2472                           IOMMU_UNITS_SUPPORTED);
2473         }
2474
2475         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2476                         GFP_KERNEL);
2477         if (!g_iommus) {
2478                 printk(KERN_ERR "Allocating global iommu array failed\n");
2479                 ret = -ENOMEM;
2480                 goto error;
2481         }
2482
2483         deferred_flush = kzalloc(g_num_of_iommus *
2484                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2485         if (!deferred_flush) {
2486                 ret = -ENOMEM;
2487                 goto error;
2488         }
2489
2490         for_each_drhd_unit(drhd) {
2491                 if (drhd->ignored)
2492                         continue;
2493
2494                 iommu = drhd->iommu;
2495                 g_iommus[iommu->seq_id] = iommu;
2496
2497                 ret = iommu_init_domains(iommu);
2498                 if (ret)
2499                         goto error;
2500
2501                 /*
2502                  * TBD:
2503                  * we could share the same root & context tables
2504                  * among all IOMMUs. Needs to be split later.
2505                  */
2506                 ret = iommu_alloc_root_entry(iommu);
2507                 if (ret) {
2508                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2509                         goto error;
2510                 }
2511                 if (!ecap_pass_through(iommu->ecap))
2512                         hw_pass_through = 0;
2513         }
2514
2515         /*
2516          * Start from a sane iommu hardware state.
2517          */
2518         for_each_drhd_unit(drhd) {
2519                 if (drhd->ignored)
2520                         continue;
2521
2522                 iommu = drhd->iommu;
2523
2524                 /*
2525                  * If the queued invalidation is already initialized by us
2526                  * (for example, while enabling interrupt-remapping) then
2527                  * things are already rolling from a sane state.
2528                  */
2529                 if (iommu->qi)
2530                         continue;
2531
2532                 /*
2533                  * Clear any previous faults.
2534                  */
2535                 dmar_fault(-1, iommu);
2536                 /*
2537                  * Disable queued invalidation if supported and already enabled
2538                  * before OS handover.
2539                  */
2540                 dmar_disable_qi(iommu);
2541         }
2542
2543         for_each_drhd_unit(drhd) {
2544                 if (drhd->ignored)
2545                         continue;
2546
2547                 iommu = drhd->iommu;
2548
2549                 if (dmar_enable_qi(iommu)) {
2550                         /*
2551                          * Queued Invalidate not enabled, use Register Based
2552                          * Invalidate
2553                          */
2554                         iommu->flush.flush_context = __iommu_flush_context;
2555                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2556                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2557                                "invalidation\n",
2558                                 iommu->seq_id,
2559                                (unsigned long long)drhd->reg_base_addr);
2560                 } else {
2561                         iommu->flush.flush_context = qi_flush_context;
2562                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2563                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2564                                "invalidation\n",
2565                                 iommu->seq_id,
2566                                (unsigned long long)drhd->reg_base_addr);
2567                 }
2568         }
2569
2570         if (iommu_pass_through)
2571                 iommu_identity_mapping |= IDENTMAP_ALL;
2572
2573 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2574         iommu_identity_mapping |= IDENTMAP_GFX;
2575 #endif
2576
2577         check_tylersburg_isoch();
2578
2579         /*
2580          * If any form of identity mapping was requested (pass-through, gfx,
2581          * or all devices), set up the static-identity domain and context
2582          * entries for the devices that should use it.
2583          */
2584         if (iommu_identity_mapping) {
2585                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2586                 if (ret) {
2587                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2588                         goto error;
2589                 }
2590         }
2591         /*
2592          * For each rmrr
2593          *   for each dev attached to rmrr
2594          *   do
2595          *     locate drhd for dev, alloc domain for dev
2596          *     allocate free domain
2597          *     allocate page table entries for rmrr
2598          *     if context not allocated for bus
2599          *           allocate and init context
2600          *           set present in root table for this bus
2601          *     init context with domain, translation etc
2602          *    endfor
2603          * endfor
2604          */
2605         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2606         for_each_rmrr_units(rmrr) {
2607                 for (i = 0; i < rmrr->devices_cnt; i++) {
2608                         pdev = rmrr->devices[i];
2609                         /*
2610                          * some BIOSes list non-existent devices in the
2611                          * DMAR table.
2612                          */
2613                         if (!pdev)
2614                                 continue;
2615                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2616                         if (ret)
2617                                 printk(KERN_ERR
2618                                        "IOMMU: mapping reserved region failed\n");
2619                 }
2620         }
2621
2622         iommu_prepare_isa();
2623
2624         /*
2625          * for each drhd
2626          *   enable fault log
2627          *   global invalidate context cache
2628          *   global invalidate iotlb
2629          *   enable translation
2630          */
2631         for_each_drhd_unit(drhd) {
2632                 if (drhd->ignored) {
2633                         /*
2634                          * we always have to disable PMRs or DMA may fail on
2635                          * this device
2636                          */
2637                         if (force_on)
2638                                 iommu_disable_protect_mem_regions(drhd->iommu);
2639                         continue;
2640                 }
2641                 iommu = drhd->iommu;
2642
2643                 iommu_flush_write_buffer(iommu);
2644
2645                 ret = dmar_set_interrupt(iommu);
2646                 if (ret)
2647                         goto error;
2648
2649                 iommu_set_root_entry(iommu);
2650
2651                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2652                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2653
2654                 ret = iommu_enable_translation(iommu);
2655                 if (ret)
2656                         goto error;
2657
2658                 iommu_disable_protect_mem_regions(iommu);
2659         }
2660
2661         return 0;
2662 error:
2663         for_each_drhd_unit(drhd) {
2664                 if (drhd->ignored)
2665                         continue;
2666                 iommu = drhd->iommu;
2667                 free_iommu(iommu);
2668         }
2669         kfree(g_iommus);
2670         return ret;
2671 }
2672
2673 /* This takes a number of _MM_ pages, not VTD pages */
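/* The caller's dma_mask is clamped to what the domain's address width
   can represent.  Unless forcedac is set, a 64-bit capable device still
   gets its IOVA from below 4GiB when possible; only if that range is
   exhausted does the allocation fall back to the full mask. */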
2674 static struct iova *intel_alloc_iova(struct device *dev,
2675                                      struct dmar_domain *domain,
2676                                      unsigned long nrpages, uint64_t dma_mask)
2677 {
2678         struct pci_dev *pdev = to_pci_dev(dev);
2679         struct iova *iova = NULL;
2680
2681         /* Restrict dma_mask to the width that the iommu can handle */
2682         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2683
2684         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2685                 /*
2686                  * First try to allocate an io virtual address in
2687                  * DMA_BIT_MASK(32) and if that fails then try allocating
2688                  * from higher range
2689                  */
2690                 iova = alloc_iova(&domain->iovad, nrpages,
2691                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2692                 if (iova)
2693                         return iova;
2694         }
2695         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2696         if (unlikely(!iova)) {
2697                 printk(KERN_ERR "Allocating %lu-page iova for %s failed\n",
2698                        nrpages, pci_name(pdev));
2699                 return NULL;
2700         }
2701
2702         return iova;
2703 }
2704
2705 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2706 {
2707         struct dmar_domain *domain;
2708         int ret;
2709
2710         domain = get_domain_for_dev(pdev,
2711                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2712         if (!domain) {
2713                 printk(KERN_ERR
2714                         "Allocating domain for %s failed\n", pci_name(pdev));
2715                 return NULL;
2716         }
2717
2718         /* make sure context mapping is ok */
2719         if (unlikely(!domain_context_mapped(pdev))) {
2720                 ret = domain_context_mapping(domain, pdev,
2721                                              CONTEXT_TT_MULTI_LEVEL);
2722                 if (ret) {
2723                         printk(KERN_ERR
2724                                 "Domain context map for %s failed\n",
2725                                 pci_name(pdev));
2726                         return NULL;
2727                 }
2728         }
2729
2730         return domain;
2731 }
2732
2733 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2734 {
2735         struct device_domain_info *info;
2736
2737         /* No lock here, assumes no domain exit in normal case */
2738         info = dev->dev.archdata.iommu;
2739         if (likely(info))
2740                 return info->domain;
2741
2742         return __get_valid_domain_for_dev(dev);
2743 }
2744
2745 static int iommu_dummy(struct pci_dev *pdev)
2746 {
2747         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2748 }
2749
2750 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2751 static int iommu_no_mapping(struct device *dev)
2752 {
2753         struct pci_dev *pdev;
2754         int found;
2755
2756         if (unlikely(dev->bus != &pci_bus_type))
2757                 return 1;
2758
2759         pdev = to_pci_dev(dev);
2760         if (iommu_dummy(pdev))
2761                 return 1;
2762
2763         if (!iommu_identity_mapping)
2764                 return 0;
2765
2766         found = identity_mapping(pdev);
2767         if (found) {
2768                 if (iommu_should_identity_map(pdev, 0))
2769                         return 1;
2770                 else {
2771                         /*
2772                          * A 32-bit-only device is removed from si_domain and
2773                          * falls back to non-identity mapping.
2774                          */
2775                         domain_remove_one_dev_info(si_domain, pdev);
2776                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2777                                pci_name(pdev));
2778                         return 0;
2779                 }
2780         } else {
2781                 /*
2782                  * A 64-bit capable device that was detached from a VM domain
2783                  * is put into si_domain for identity mapping.
2784                  */
2785                 if (iommu_should_identity_map(pdev, 0)) {
2786                         int ret;
2787                         ret = domain_add_dev_info(si_domain, pdev,
2788                                                   hw_pass_through ?
2789                                                   CONTEXT_TT_PASS_THROUGH :
2790                                                   CONTEXT_TT_MULTI_LEVEL);
2791                         if (!ret) {
2792                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2793                                        pci_name(pdev));
2794                                 return 1;
2795                         }
2796                 }
2797         }
2798
2799         return 0;
2800 }
2801
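/*
 * Core mapping routine for the streaming DMA API: look up (or create)
 * the device's domain, allocate an IOVA range large enough for the
 * buffer, write PTEs with the permissions implied by the DMA direction,
 * then flush the IOTLB (caching mode) or just the write buffer.  For
 * identity-mapped devices the physical address is returned unchanged.
 */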
2802 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2803                                      size_t size, int dir, u64 dma_mask)
2804 {
2805         struct pci_dev *pdev = to_pci_dev(hwdev);
2806         struct dmar_domain *domain;
2807         phys_addr_t start_paddr;
2808         struct iova *iova;
2809         int prot = 0;
2810         int ret;
2811         struct intel_iommu *iommu;
2812         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2813
2814         BUG_ON(dir == DMA_NONE);
2815
2816         if (iommu_no_mapping(hwdev))
2817                 return paddr;
2818
2819         domain = get_valid_domain_for_dev(pdev);
2820         if (!domain)
2821                 return 0;
2822
2823         iommu = domain_get_iommu(domain);
2824         size = aligned_nrpages(paddr, size);
2825
2826         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2827         if (!iova)
2828                 goto error;
2829
2830         /*
2831          * Check if DMAR supports zero-length reads on write only
2832          * mappings..
2833          */
2834         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2835                         !cap_zlr(iommu->cap))
2836                 prot |= DMA_PTE_READ;
2837         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2838                 prot |= DMA_PTE_WRITE;
2839         /*
2840          * paddr to (paddr + size) might cover only part of a page, so map the
2841          * whole page.  Note: if two parts of one page are mapped separately,
2842          * we might have two guest addresses mapping to the same host paddr,
2843          * but this is not a big problem
2844          */
2845         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2846                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2847         if (ret)
2848                 goto error;
2849
2850         /* it's a non-present to present mapping. Only flush if caching mode */
2851         if (cap_caching_mode(iommu->cap))
2852                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2853         else
2854                 iommu_flush_write_buffer(iommu);
2855
2856         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2857         start_paddr += paddr & ~PAGE_MASK;
2858         return start_paddr;
2859
2860 error:
2861         if (iova)
2862                 __free_iova(&domain->iovad, iova);
2863         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2864                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2865         return 0;
2866 }
2867
2868 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2869                                  unsigned long offset, size_t size,
2870                                  enum dma_data_direction dir,
2871                                  struct dma_attrs *attrs)
2872 {
2873         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2874                                   dir, to_pci_dev(dev)->dma_mask);
2875 }
2876
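/*
 * Deferred ("lazy") unmap handling.  Instead of invalidating the IOTLB
 * on every unmap, freed IOVAs are queued per IOMMU in deferred_flush[]
 * and released in batches by flush_unmaps(), triggered either by the
 * 10ms unmap_timer or once HIGH_WATER_MARK entries have accumulated.
 * This trades IOTLB-flush overhead for a short window in which stale
 * translations remain valid, which is why intel_iommu_strict disables
 * it.
 */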
2877 static void flush_unmaps(void)
2878 {
2879         int i, j;
2880
2881         timer_on = 0;
2882
2883         /* just flush them all */
2884         for (i = 0; i < g_num_of_iommus; i++) {
2885                 struct intel_iommu *iommu = g_iommus[i];
2886                 if (!iommu)
2887                         continue;
2888
2889                 if (!deferred_flush[i].next)
2890                         continue;
2891
2892                 /* In caching mode, global flushes make emulation expensive */
2893                 if (!cap_caching_mode(iommu->cap))
2894                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2895                                          DMA_TLB_GLOBAL_FLUSH);
2896                 for (j = 0; j < deferred_flush[i].next; j++) {
2897                         unsigned long mask;
2898                         struct iova *iova = deferred_flush[i].iova[j];
2899                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2900
2901                         /* On real hardware multiple invalidations are expensive */
2902                         if (cap_caching_mode(iommu->cap))
2903                                 iommu_flush_iotlb_psi(iommu, domain->id,
2904                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2905                         else {
2906                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2907                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2908                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2909                         }
2910                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2911                 }
2912                 deferred_flush[i].next = 0;
2913         }
2914
2915         list_size = 0;
2916 }
2917
2918 static void flush_unmaps_timeout(unsigned long data)
2919 {
2920         unsigned long flags;
2921
2922         spin_lock_irqsave(&async_umap_flush_lock, flags);
2923         flush_unmaps();
2924         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2925 }
2926
2927 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2928 {
2929         unsigned long flags;
2930         int next, iommu_id;
2931         struct intel_iommu *iommu;
2932
2933         spin_lock_irqsave(&async_umap_flush_lock, flags);
2934         if (list_size == HIGH_WATER_MARK)
2935                 flush_unmaps();
2936
2937         iommu = domain_get_iommu(dom);
2938         iommu_id = iommu->seq_id;
2939
2940         next = deferred_flush[iommu_id].next;
2941         deferred_flush[iommu_id].domain[next] = dom;
2942         deferred_flush[iommu_id].iova[next] = iova;
2943         deferred_flush[iommu_id].next++;
2944
2945         if (!timer_on) {
2946                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2947                 timer_on = 1;
2948         }
2949         list_size++;
2950         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2951 }
2952
2953 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2954                              size_t size, enum dma_data_direction dir,
2955                              struct dma_attrs *attrs)
2956 {
2957         struct pci_dev *pdev = to_pci_dev(dev);
2958         struct dmar_domain *domain;
2959         unsigned long start_pfn, last_pfn;
2960         struct iova *iova;
2961         struct intel_iommu *iommu;
2962
2963         if (iommu_no_mapping(dev))
2964                 return;
2965
2966         domain = find_domain(pdev);
2967         BUG_ON(!domain);
2968
2969         iommu = domain_get_iommu(domain);
2970
2971         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2972         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2973                       (unsigned long long)dev_addr))
2974                 return;
2975
2976         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2977         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2978
2979         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2980                  pci_name(pdev), start_pfn, last_pfn);
2981
2982         /* clear the whole mapped range */
2983         dma_pte_clear_range(domain, start_pfn, last_pfn);
2984
2985         /* free page tables */
2986         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2987
2988         if (intel_iommu_strict) {
2989                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2990                                       last_pfn - start_pfn + 1, 0);
2991                 /* free iova */
2992                 __free_iova(&domain->iovad, iova);
2993         } else {
2994                 add_unmap(domain, iova);
2995                 /*
2996                  * queue up the release of the mapping to save roughly 1/6th of
2997                  * the CPU time used up by the iotlb flush operation...
2998                  */
2999         }
3000 }
3001
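/*
 * dma_map_ops->alloc: allocate zeroed pages and map them for DMA.  GFP_DMA /
 * GFP_DMA32 are only applied for identity-mapped (iommu_no_mapping) devices
 * whose coherent mask is narrower than the required mask; for translated
 * devices the IOMMU provides the addressing, so the zone flags are cleared.
 */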
3002 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3003                                   dma_addr_t *dma_handle, gfp_t flags,
3004                                   struct dma_attrs *attrs)
3005 {
3006         void *vaddr;
3007         int order;
3008
3009         size = PAGE_ALIGN(size);
3010         order = get_order(size);
3011
3012         if (!iommu_no_mapping(hwdev))
3013                 flags &= ~(GFP_DMA | GFP_DMA32);
3014         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3015                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3016                         flags |= GFP_DMA;
3017                 else
3018                         flags |= GFP_DMA32;
3019         }
3020
3021         vaddr = (void *)__get_free_pages(flags, order);
3022         if (!vaddr)
3023                 return NULL;
3024         memset(vaddr, 0, size);
3025
3026         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3027                                          DMA_BIDIRECTIONAL,
3028                                          hwdev->coherent_dma_mask);
3029         if (*dma_handle)
3030                 return vaddr;
3031         free_pages((unsigned long)vaddr, order);
3032         return NULL;
3033 }
3034
3035 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3036                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3037 {
3038         int order;
3039
3040         size = PAGE_ALIGN(size);
3041         order = get_order(size);
3042
3043         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3044         free_pages((unsigned long)vaddr, order);
3045 }
3046
3047 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3048                            int nelems, enum dma_data_direction dir,
3049                            struct dma_attrs *attrs)
3050 {
3051         struct pci_dev *pdev = to_pci_dev(hwdev);
3052         struct dmar_domain *domain;
3053         unsigned long start_pfn, last_pfn;
3054         struct iova *iova;
3055         struct intel_iommu *iommu;
3056
3057         if (iommu_no_mapping(hwdev))
3058                 return;
3059
3060         domain = find_domain(pdev);
3061         BUG_ON(!domain);
3062
3063         iommu = domain_get_iommu(domain);
3064
3065         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3066         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3067                       (unsigned long long)sglist[0].dma_address))
3068                 return;
3069
3070         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3071         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3072
3073         /* clear the whole mapped range */
3074         dma_pte_clear_range(domain, start_pfn, last_pfn);
3075
3076         /* free page tables */
3077         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3078
3079         if (intel_iommu_strict) {
3080                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3081                                       last_pfn - start_pfn + 1, 0);
3082                 /* free iova */
3083                 __free_iova(&domain->iovad, iova);
3084         } else {
3085                 add_unmap(domain, iova);
3086                 /*
3087                  * queue up the release of the mapping to save roughly 1/6th of
3088                  * the CPU time used up by the iotlb flush operation...
3089                  */
3090         }
3091 }
3092
3093 static int intel_nontranslate_map_sg(struct device *hwdev,
3094         struct scatterlist *sglist, int nelems, int dir)
3095 {
3096         int i;
3097         struct scatterlist *sg;
3098
3099         for_each_sg(sglist, sg, nelems, i) {
3100                 BUG_ON(!sg_page(sg));
3101                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3102                 sg->dma_length = sg->length;
3103         }
3104         return nelems;
3105 }
3106
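/*
 * dma_map_ops->map_sg: allocate one IOVA range large enough for the whole
 * scatterlist, map it with domain_sg_mapping(), and then flush: a
 * page-selective IOTLB invalidation when the IOMMU is in caching mode (it
 * may cache not-present entries), a write-buffer flush otherwise.  On
 * mapping failure the PTEs, page tables and the IOVA are all released and
 * 0 is returned.
 */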
3107 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3108                         enum dma_data_direction dir, struct dma_attrs *attrs)
3109 {
3110         int i;
3111         struct pci_dev *pdev = to_pci_dev(hwdev);
3112         struct dmar_domain *domain;
3113         size_t size = 0;
3114         int prot = 0;
3115         struct iova *iova = NULL;
3116         int ret;
3117         struct scatterlist *sg;
3118         unsigned long start_vpfn;
3119         struct intel_iommu *iommu;
3120
3121         BUG_ON(dir == DMA_NONE);
3122         if (iommu_no_mapping(hwdev))
3123                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3124
3125         domain = get_valid_domain_for_dev(pdev);
3126         if (!domain)
3127                 return 0;
3128
3129         iommu = domain_get_iommu(domain);
3130
3131         for_each_sg(sglist, sg, nelems, i)
3132                 size += aligned_nrpages(sg->offset, sg->length);
3133
3134         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3135                                 pdev->dma_mask);
3136         if (!iova) {
3137                 sglist->dma_length = 0;
3138                 return 0;
3139         }
3140
3141         /*
3142          * Check if DMAR supports zero-length reads on write-only
3143          * mappings.
3144          */
3145         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3146                         !cap_zlr(iommu->cap))
3147                 prot |= DMA_PTE_READ;
3148         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3149                 prot |= DMA_PTE_WRITE;
3150
3151         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3152
3153         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3154         if (unlikely(ret)) {
3155                 /* clear the mapped range */
3156                 dma_pte_clear_range(domain, start_vpfn,
3157                                     start_vpfn + size - 1);
3158                 /* free page tables */
3159                 dma_pte_free_pagetable(domain, start_vpfn,
3160                                        start_vpfn + size - 1);
3161                 /* free iova */
3162                 __free_iova(&domain->iovad, iova);
3163                 return 0;
3164         }
3165
3166         /* It's a non-present to present mapping. Only flush if in caching mode */
3167         if (cap_caching_mode(iommu->cap))
3168                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3169         else
3170                 iommu_flush_write_buffer(iommu);
3171
3172         return nelems;
3173 }
3174
3175 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3176 {
3177         return !dma_addr;
3178 }
3179
3180 struct dma_map_ops intel_dma_ops = {
3181         .alloc = intel_alloc_coherent,
3182         .free = intel_free_coherent,
3183         .map_sg = intel_map_sg,
3184         .unmap_sg = intel_unmap_sg,
3185         .map_page = intel_map_page,
3186         .unmap_page = intel_unmap_page,
3187         .mapping_error = intel_mapping_error,
3188 };
3189
3190 static inline int iommu_domain_cache_init(void)
3191 {
3192         int ret = 0;
3193
3194         iommu_domain_cache = kmem_cache_create("iommu_domain",
3195                                          sizeof(struct dmar_domain),
3196                                          0,
3197                                          SLAB_HWCACHE_ALIGN,
3198                                          NULL);
3199
3200         if (!iommu_domain_cache) {
3201                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3202                 ret = -ENOMEM;
3203         }
3204
3205         return ret;
3206 }
3207
3208 static inline int iommu_devinfo_cache_init(void)
3209 {
3210         int ret = 0;
3211
3212         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3213                                          sizeof(struct device_domain_info),
3214                                          0,
3215                                          SLAB_HWCACHE_ALIGN,
3216                                          NULL);
3217         if (!iommu_devinfo_cache) {
3218                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3219                 ret = -ENOMEM;
3220         }
3221
3222         return ret;
3223 }
3224
3225 static inline int iommu_iova_cache_init(void)
3226 {
3227         int ret = 0;
3228
3229         iommu_iova_cache = kmem_cache_create("iommu_iova",
3230                                          sizeof(struct iova),
3231                                          0,
3232                                          SLAB_HWCACHE_ALIGN,
3233                                          NULL);
3234         if (!iommu_iova_cache) {
3235                 printk(KERN_ERR "Couldn't create iova cache\n");
3236                 ret = -ENOMEM;
3237         }
3238
3239         return ret;
3240 }
3241
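/*
 * Create the slab caches used by the driver (iova, dmar_domain and
 * device_domain_info objects), tearing down the caches already created
 * if a later one fails.
 */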
3242 static int __init iommu_init_mempool(void)
3243 {
3244         int ret;
3245         ret = iommu_iova_cache_init();
3246         if (ret)
3247                 return ret;
3248
3249         ret = iommu_domain_cache_init();
3250         if (ret)
3251                 goto domain_error;
3252
3253         ret = iommu_devinfo_cache_init();
3254         if (!ret)
3255                 return ret;
3256
3257         kmem_cache_destroy(iommu_domain_cache);
3258 domain_error:
3259         kmem_cache_destroy(iommu_iova_cache);
3260
3261         return -ENOMEM;
3262 }
3263
3264 static void __init iommu_exit_mempool(void)
3265 {
3266         kmem_cache_destroy(iommu_devinfo_cache);
3267         kmem_cache_destroy(iommu_domain_cache);
3268         kmem_cache_destroy(iommu_iova_cache);
3269
3270 }
3271
3272 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3273 {
3274         struct dmar_drhd_unit *drhd;
3275         u32 vtbar;
3276         int rc;
3277
3278         /* We know that this device on this chipset has its own IOMMU.
3279          * If we find it under a different IOMMU, then the BIOS is lying
3280          * to us. Hope that the IOMMU for this device is actually
3281          * disabled, and it needs no translation...
3282          */
3283         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3284         if (rc) {
3285                 /* "can't" happen */
3286                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3287                 return;
3288         }
3289         vtbar &= 0xffff0000;
3290
3291         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3292         drhd = dmar_find_matched_drhd_unit(pdev);
3293         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3294                             TAINT_FIRMWARE_WORKAROUND,
3295                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3296                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3297 }
3298 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3299
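/*
 * Mark DMAR units that need no translation: first, units whose device scope
 * contains no PCI devices at all are flagged ->ignored; then units that
 * cover only graphics devices are either left active (setting
 * intel_iommu_gfx_mapped) or, when dmar_map_gfx is clear, ignored with
 * their devices marked DUMMY_DEVICE_DOMAIN_INFO.
 */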
3300 static void __init init_no_remapping_devices(void)
3301 {
3302         struct dmar_drhd_unit *drhd;
3303
3304         for_each_drhd_unit(drhd) {
3305                 if (!drhd->include_all) {
3306                         int i;
3307                         for (i = 0; i < drhd->devices_cnt; i++)
3308                                 if (drhd->devices[i] != NULL)
3309                                         break;
3310                         /* ignore DMAR unit if no PCI devices exist */
3311                         if (i == drhd->devices_cnt)
3312                                 drhd->ignored = 1;
3313                 }
3314         }
3315
3316         for_each_drhd_unit(drhd) {
3317                 int i;
3318                 if (drhd->ignored || drhd->include_all)
3319                         continue;
3320
3321                 for (i = 0; i < drhd->devices_cnt; i++)
3322                         if (drhd->devices[i] &&
3323                             !IS_GFX_DEVICE(drhd->devices[i]))
3324                                 break;
3325
3326                 if (i < drhd->devices_cnt)
3327                         continue;
3328
3329                 /* This IOMMU has *only* gfx devices. Either bypass it or
3330                    set the gfx_mapped flag, as appropriate */
3331                 if (dmar_map_gfx) {
3332                         intel_iommu_gfx_mapped = 1;
3333                 } else {
3334                         drhd->ignored = 1;
3335                         for (i = 0; i < drhd->devices_cnt; i++) {
3336                                 if (!drhd->devices[i])
3337                                         continue;
3338                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3339                         }
3340                 }
3341         }
3342 }
3343
3344 #ifdef CONFIG_SUSPEND
3345 static int init_iommu_hw(void)
3346 {
3347         struct dmar_drhd_unit *drhd;
3348         struct intel_iommu *iommu = NULL;
3349
3350         for_each_active_iommu(iommu, drhd)
3351                 if (iommu->qi)
3352                         dmar_reenable_qi(iommu);
3353
3354         for_each_iommu(iommu, drhd) {
3355                 if (drhd->ignored) {
3356                         /*
3357                          * we always have to disable PMRs or DMA may fail on
3358                          * this device
3359                          */
3360                         if (force_on)
3361                                 iommu_disable_protect_mem_regions(iommu);
3362                         continue;
3363                 }
3364
3365                 iommu_flush_write_buffer(iommu);
3366
3367                 iommu_set_root_entry(iommu);
3368
3369                 iommu->flush.flush_context(iommu, 0, 0, 0,
3370                                            DMA_CCMD_GLOBAL_INVL);
3371                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3372                                          DMA_TLB_GLOBAL_FLUSH);
3373                 if (iommu_enable_translation(iommu))
3374                         return 1;
3375                 iommu_disable_protect_mem_regions(iommu);
3376         }
3377
3378         return 0;
3379 }
3380
3381 static void iommu_flush_all(void)
3382 {
3383         struct dmar_drhd_unit *drhd;
3384         struct intel_iommu *iommu;
3385
3386         for_each_active_iommu(iommu, drhd) {
3387                 iommu->flush.flush_context(iommu, 0, 0, 0,
3388                                            DMA_CCMD_GLOBAL_INVL);
3389                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3390                                          DMA_TLB_GLOBAL_FLUSH);
3391         }
3392 }
3393
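/*
 * Suspend/resume support: iommu_suspend() flushes all caches, disables
 * translation and saves the fault-event control, data and address registers
 * of each active IOMMU; iommu_resume() re-initialises the hardware via
 * init_iommu_hw() and then restores those registers.
 */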
3394 static int iommu_suspend(void)
3395 {
3396         struct dmar_drhd_unit *drhd;
3397         struct intel_iommu *iommu = NULL;
3398         unsigned long flag;
3399
3400         for_each_active_iommu(iommu, drhd) {
3401                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3402                                                  GFP_ATOMIC);
3403                 if (!iommu->iommu_state)
3404                         goto nomem;
3405         }
3406
3407         iommu_flush_all();
3408
3409         for_each_active_iommu(iommu, drhd) {
3410                 iommu_disable_translation(iommu);
3411
3412                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3413
3414                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3415                         readl(iommu->reg + DMAR_FECTL_REG);
3416                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3417                         readl(iommu->reg + DMAR_FEDATA_REG);
3418                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3419                         readl(iommu->reg + DMAR_FEADDR_REG);
3420                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3421                         readl(iommu->reg + DMAR_FEUADDR_REG);
3422
3423                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3424         }
3425         return 0;
3426
3427 nomem:
3428         for_each_active_iommu(iommu, drhd)
3429                 kfree(iommu->iommu_state);
3430
3431         return -ENOMEM;
3432 }
3433
3434 static void iommu_resume(void)
3435 {
3436         struct dmar_drhd_unit *drhd;
3437         struct intel_iommu *iommu = NULL;
3438         unsigned long flag;
3439
3440         if (init_iommu_hw()) {
3441                 if (force_on)
3442                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3443                 else
3444                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3445                 return;
3446         }
3447
3448         for_each_active_iommu(iommu, drhd) {
3449
3450                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3451
3452                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3453                         iommu->reg + DMAR_FECTL_REG);
3454                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3455                         iommu->reg + DMAR_FEDATA_REG);
3456                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3457                         iommu->reg + DMAR_FEADDR_REG);
3458                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3459                         iommu->reg + DMAR_FEUADDR_REG);
3460
3461                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3462         }
3463
3464         for_each_active_iommu(iommu, drhd)
3465                 kfree(iommu->iommu_state);
3466 }
3467
3468 static struct syscore_ops iommu_syscore_ops = {
3469         .resume         = iommu_resume,
3470         .suspend        = iommu_suspend,
3471 };
3472
3473 static void __init init_iommu_pm_ops(void)
3474 {
3475         register_syscore_ops(&iommu_syscore_ops);
3476 }
3477
3478 #else
3479 static inline void init_iommu_pm_ops(void) {}
3480 #endif  /* CONFIG_SUSPEND */
3481
3482 LIST_HEAD(dmar_rmrr_units);
3483
3484 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3485 {
3486         list_add(&rmrr->list, &dmar_rmrr_units);
3487 }
3488
3489
3490 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3491 {
3492         struct acpi_dmar_reserved_memory *rmrr;
3493         struct dmar_rmrr_unit *rmrru;
3494
3495         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3496         if (!rmrru)
3497                 return -ENOMEM;
3498
3499         rmrru->hdr = header;
3500         rmrr = (struct acpi_dmar_reserved_memory *)header;
3501         rmrru->base_address = rmrr->base_address;
3502         rmrru->end_address = rmrr->end_address;
3503
3504         dmar_register_rmrr_unit(rmrru);
3505         return 0;
3506 }
3507
3508 static int __init
3509 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3510 {
3511         struct acpi_dmar_reserved_memory *rmrr;
3512         int ret;
3513
3514         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3515         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3516                 ((void *)rmrr) + rmrr->header.length,
3517                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3518
3519         if (ret || (rmrru->devices_cnt == 0)) {
3520                 list_del(&rmrru->list);
3521                 kfree(rmrru);
3522         }
3523         return ret;
3524 }
3525
3526 static LIST_HEAD(dmar_atsr_units);
3527
3528 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3529 {
3530         struct acpi_dmar_atsr *atsr;
3531         struct dmar_atsr_unit *atsru;
3532
3533         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3534         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3535         if (!atsru)
3536                 return -ENOMEM;
3537
3538         atsru->hdr = hdr;
3539         atsru->include_all = atsr->flags & 0x1;
3540
3541         list_add(&atsru->list, &dmar_atsr_units);
3542
3543         return 0;
3544 }
3545
3546 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3547 {
3548         int rc;
3549         struct acpi_dmar_atsr *atsr;
3550
3551         if (atsru->include_all)
3552                 return 0;
3553
3554         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3555         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3556                                 (void *)atsr + atsr->header.length,
3557                                 &atsru->devices_cnt, &atsru->devices,
3558                                 atsr->segment);
3559         if (rc || !atsru->devices_cnt) {
3560                 list_del(&atsru->list);
3561                 kfree(atsru);
3562         }
3563
3564         return rc;
3565 }
3566
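/*
 * Return 1 if @dev sits behind a root port listed in (or covered by an
 * include_all) ATSR, i.e. ATS can be used for it: look up the ATSR matching
 * the device's PCI segment, then walk up the bus hierarchy; any non-PCIe
 * bridge or PCIe-to-PCI bridge on the path rules ATS out.
 */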
3567 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3568 {
3569         int i;
3570         struct pci_bus *bus;
3571         struct acpi_dmar_atsr *atsr;
3572         struct dmar_atsr_unit *atsru;
3573
3574         dev = pci_physfn(dev);
3575
3576         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3577                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3578                 if (atsr->segment == pci_domain_nr(dev->bus))
3579                         goto found;
3580         }
3581
3582         return 0;
3583
3584 found:
3585         for (bus = dev->bus; bus; bus = bus->parent) {
3586                 struct pci_dev *bridge = bus->self;
3587
3588                 if (!bridge || !pci_is_pcie(bridge) ||
3589                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3590                         return 0;
3591
3592                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3593                         for (i = 0; i < atsru->devices_cnt; i++)
3594                                 if (atsru->devices[i] == bridge)
3595                                         return 1;
3596                         break;
3597                 }
3598         }
3599
3600         if (atsru->include_all)
3601                 return 1;
3602
3603         return 0;
3604 }
3605
3606 int __init dmar_parse_rmrr_atsr_dev(void)
3607 {
3608         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3609         struct dmar_atsr_unit *atsr, *atsr_n;
3610         int ret = 0;
3611
3612         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3613                 ret = rmrr_parse_dev(rmrr);
3614                 if (ret)
3615                         return ret;
3616         }
3617
3618         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3619                 ret = atsr_parse_dev(atsr);
3620                 if (ret)
3621                         return ret;
3622         }
3623
3624         return ret;
3625 }
3626
3627 /*
3628  * Here we only respond to the driver-unbind action for a device.
3629  *
3630  * A newly added device is not attached to its DMAR domain here yet; that
3631  * happens when the device is first mapped to an iova.
3632  */
3633 static int device_notifier(struct notifier_block *nb,
3634                                   unsigned long action, void *data)
3635 {
3636         struct device *dev = data;
3637         struct pci_dev *pdev = to_pci_dev(dev);
3638         struct dmar_domain *domain;
3639
3640         if (iommu_no_mapping(dev))
3641                 return 0;
3642
3643         domain = find_domain(pdev);
3644         if (!domain)
3645                 return 0;
3646
3647         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3648                 domain_remove_one_dev_info(domain, pdev);
3649
3650                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3651                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3652                     list_empty(&domain->devices))
3653                         domain_exit(domain);
3654         }
3655
3656         return 0;
3657 }
3658
3659 static struct notifier_block device_nb = {
3660         .notifier_call = device_notifier,
3661 };
3662
3663 int __init intel_iommu_init(void)
3664 {
3665         int ret = 0;
3666
3667         /* VT-d is required for a TXT/tboot launch, so enforce that */
3668         force_on = tboot_force_iommu();
3669
3670         if (dmar_table_init()) {
3671                 if (force_on)
3672                         panic("tboot: Failed to initialize DMAR table\n");
3673                 return  -ENODEV;
3674         }
3675
3676         if (dmar_dev_scope_init() < 0) {
3677                 if (force_on)
3678                         panic("tboot: Failed to initialize DMAR device scope\n");
3679                 return  -ENODEV;
3680         }
3681
3682         if (no_iommu || dmar_disabled)
3683                 return -ENODEV;
3684
3685         if (iommu_init_mempool()) {
3686                 if (force_on)
3687                         panic("tboot: Failed to initialize iommu memory\n");
3688                 return  -ENODEV;
3689         }
3690
3691         if (list_empty(&dmar_rmrr_units))
3692                 printk(KERN_INFO "DMAR: No RMRR found\n");
3693
3694         if (list_empty(&dmar_atsr_units))
3695                 printk(KERN_INFO "DMAR: No ATSR found\n");
3696
3697         if (dmar_init_reserved_ranges()) {
3698                 if (force_on)
3699                         panic("tboot: Failed to reserve iommu ranges\n");
3700                 return  -ENODEV;
3701         }
3702
3703         init_no_remapping_devices();
3704
3705         ret = init_dmars();
3706         if (ret) {
3707                 if (force_on)
3708                         panic("tboot: Failed to initialize DMARs\n");
3709                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3710                 put_iova_domain(&reserved_iova_list);
3711                 iommu_exit_mempool();
3712                 return ret;
3713         }
3714         printk(KERN_INFO
3715         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3716
3717         init_timer(&unmap_timer);
3718 #ifdef CONFIG_SWIOTLB
3719         swiotlb = 0;
3720 #endif
3721         dma_ops = &intel_dma_ops;
3722
3723         init_iommu_pm_ops();
3724
3725         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3726
3727         bus_register_notifier(&pci_bus_type, &device_nb);
3728
3729         intel_iommu_enabled = 1;
3730
3731         return 0;
3732 }
3733
3734 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3735                                            struct pci_dev *pdev)
3736 {
3737         struct pci_dev *tmp, *parent;
3738
3739         if (!iommu || !pdev)
3740                 return;
3741
3742         /* dependent device detach */
3743         tmp = pci_find_upstream_pcie_bridge(pdev);
3744         /* Secondary interface's bus number and devfn 0 */
3745         if (tmp) {
3746                 parent = pdev->bus->self;
3747                 while (parent != tmp) {
3748                         iommu_detach_dev(iommu, parent->bus->number,
3749                                          parent->devfn);
3750                         parent = parent->bus->self;
3751                 }
3752                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3753                         iommu_detach_dev(iommu,
3754                                 tmp->subordinate->number, 0);
3755                 else /* this is a legacy PCI bridge */
3756                         iommu_detach_dev(iommu, tmp->bus->number,
3757                                          tmp->devfn);
3758         }
3759 }
3760
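/*
 * Detach @pdev (and any bridges between it and the IOMMU) from @domain.
 * If no other device behind the same IOMMU remains in the domain, that
 * IOMMU is cleared from the domain's bitmap and, for domains that are
 * neither virtual-machine nor static-identity domains, the domain id is
 * released on that IOMMU as well.
 */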
3761 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3762                                           struct pci_dev *pdev)
3763 {
3764         struct device_domain_info *info;
3765         struct intel_iommu *iommu;
3766         unsigned long flags;
3767         int found = 0;
3768         struct list_head *entry, *tmp;
3769
3770         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3771                                 pdev->devfn);
3772         if (!iommu)
3773                 return;
3774
3775         spin_lock_irqsave(&device_domain_lock, flags);
3776         list_for_each_safe(entry, tmp, &domain->devices) {
3777                 info = list_entry(entry, struct device_domain_info, link);
3778                 if (info->segment == pci_domain_nr(pdev->bus) &&
3779                     info->bus == pdev->bus->number &&
3780                     info->devfn == pdev->devfn) {
3781                         unlink_domain_info(info);
3782                         spin_unlock_irqrestore(&device_domain_lock, flags);
3783
3784                         iommu_disable_dev_iotlb(info);
3785                         iommu_detach_dev(iommu, info->bus, info->devfn);
3786                         iommu_detach_dependent_devices(iommu, pdev);
3787                         free_devinfo_mem(info);
3788
3789                         spin_lock_irqsave(&device_domain_lock, flags);
3790
3791                         if (found)
3792                                 break;
3793                         else
3794                                 continue;
3795                 }
3796
3797                 /* if there are no other devices under the same iommu
3798                  * owned by this domain, clear this iommu in iommu_bmp,
3799                  * and update the iommu count and coherency
3800                  */
3801                 if (iommu == device_to_iommu(info->segment, info->bus,
3802                                             info->devfn))
3803                         found = 1;
3804         }
3805
3806         spin_unlock_irqrestore(&device_domain_lock, flags);
3807
3808         if (found == 0) {
3809                 unsigned long tmp_flags;
3810                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3811                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3812                 domain->iommu_count--;
3813                 domain_update_iommu_cap(domain);
3814                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3815
3816                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3817                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3818                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3819                         clear_bit(domain->id, iommu->domain_ids);
3820                         iommu->domains[domain->id] = NULL;
3821                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3822                 }
3823         }
3824 }
3825
3826 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3827 {
3828         struct device_domain_info *info;
3829         struct intel_iommu *iommu;
3830         unsigned long flags1, flags2;
3831
3832         spin_lock_irqsave(&device_domain_lock, flags1);
3833         while (!list_empty(&domain->devices)) {
3834                 info = list_entry(domain->devices.next,
3835                         struct device_domain_info, link);
3836                 unlink_domain_info(info);
3837                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3838
3839                 iommu_disable_dev_iotlb(info);
3840                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3841                 iommu_detach_dev(iommu, info->bus, info->devfn);
3842                 iommu_detach_dependent_devices(iommu, info->dev);
3843
3844                 /* clear this iommu in iommu_bmp, update iommu count
3845                  * and capabilities
3846                  */
3847                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3848                 if (test_and_clear_bit(iommu->seq_id,
3849                                        domain->iommu_bmp)) {
3850                         domain->iommu_count--;
3851                         domain_update_iommu_cap(domain);
3852                 }
3853                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3854
3855                 free_devinfo_mem(info);
3856                 spin_lock_irqsave(&device_domain_lock, flags1);
3857         }
3858         spin_unlock_irqrestore(&device_domain_lock, flags1);
3859 }
3860
3861 /* domain id for virtual machines; it is never set in a context entry */
3862 static unsigned long vm_domid;
3863
3864 static struct dmar_domain *iommu_alloc_vm_domain(void)
3865 {
3866         struct dmar_domain *domain;
3867
3868         domain = alloc_domain_mem();
3869         if (!domain)
3870                 return NULL;
3871
3872         domain->id = vm_domid++;
3873         domain->nid = -1;
3874         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3875         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3876
3877         return domain;
3878 }
3879
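/*
 * Initialise a VM domain allocated by iommu_alloc_vm_domain(): set up its
 * IOVA allocator and reserved ranges, derive the adjusted guest address
 * width (agaw) from @guest_width, and allocate the top-level page directory.
 */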
3880 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3881 {
3882         int adjust_width;
3883
3884         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3885         spin_lock_init(&domain->iommu_lock);
3886
3887         domain_reserve_special_ranges(domain);
3888
3889         /* calculate AGAW */
3890         domain->gaw = guest_width;
3891         adjust_width = guestwidth_to_adjustwidth(guest_width);
3892         domain->agaw = width_to_agaw(adjust_width);
3893
3894         INIT_LIST_HEAD(&domain->devices);
3895
3896         domain->iommu_count = 0;
3897         domain->iommu_coherency = 0;
3898         domain->iommu_snooping = 0;
3899         domain->iommu_superpage = 0;
3900         domain->max_addr = 0;
3901         domain->nid = -1;
3902
3903         /* always allocate the top pgd */
3904         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3905         if (!domain->pgd)
3906                 return -ENOMEM;
3907         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3908         return 0;
3909 }
3910
3911 static void iommu_free_vm_domain(struct dmar_domain *domain)
3912 {
3913         unsigned long flags;
3914         struct dmar_drhd_unit *drhd;
3915         struct intel_iommu *iommu;
3916         unsigned long i;
3917         unsigned long ndomains;
3918
3919         for_each_drhd_unit(drhd) {
3920                 if (drhd->ignored)
3921                         continue;
3922                 iommu = drhd->iommu;
3923
3924                 ndomains = cap_ndoms(iommu->cap);
3925                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3926                         if (iommu->domains[i] == domain) {
3927                                 spin_lock_irqsave(&iommu->lock, flags);
3928                                 clear_bit(i, iommu->domain_ids);
3929                                 iommu->domains[i] = NULL;
3930                                 spin_unlock_irqrestore(&iommu->lock, flags);
3931                                 break;
3932                         }
3933                 }
3934         }
3935 }
3936
3937 static void vm_domain_exit(struct dmar_domain *domain)
3938 {
3939         /* Domain 0 is reserved, so don't process it */
3940         if (!domain)
3941                 return;
3942
3943         vm_domain_remove_all_dev_info(domain);
3944         /* destroy iovas */
3945         put_iova_domain(&domain->iovad);
3946
3947         /* clear ptes */
3948         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3949
3950         /* free page tables */
3951         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3952
3953         iommu_free_vm_domain(domain);
3954         free_domain_mem(domain);
3955 }
3956
3957 static int intel_iommu_domain_init(struct iommu_domain *domain)
3958 {
3959         struct dmar_domain *dmar_domain;
3960
3961         dmar_domain = iommu_alloc_vm_domain();
3962         if (!dmar_domain) {
3963                 printk(KERN_ERR
3964                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3965                 return -ENOMEM;
3966         }
3967         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3968                 printk(KERN_ERR
3969                         "intel_iommu_domain_init() failed\n");
3970                 vm_domain_exit(dmar_domain);
3971                 return -ENOMEM;
3972         }
3973         domain_update_iommu_cap(dmar_domain);
3974         domain->priv = dmar_domain;
3975
3976         domain->geometry.aperture_start = 0;
3977         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3978         domain->geometry.force_aperture = true;
3979
3980         return 0;
3981 }
3982
3983 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3984 {
3985         struct dmar_domain *dmar_domain = domain->priv;
3986
3987         domain->priv = NULL;
3988         vm_domain_exit(dmar_domain);
3989 }
3990
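/*
 * iommu_ops->attach_dev: detach the device from any previous domain, verify
 * that the IOMMU behind it can address the domain's current max_addr, and
 * trim extra page-table levels from the domain if the IOMMU supports a
 * smaller agaw, before adding the device with a multi-level context entry.
 */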
3991 static int intel_iommu_attach_device(struct iommu_domain *domain,
3992                                      struct device *dev)
3993 {
3994         struct dmar_domain *dmar_domain = domain->priv;
3995         struct pci_dev *pdev = to_pci_dev(dev);
3996         struct intel_iommu *iommu;
3997         int addr_width;
3998
3999         /* normally pdev is not mapped */
4000         if (unlikely(domain_context_mapped(pdev))) {
4001                 struct dmar_domain *old_domain;
4002
4003                 old_domain = find_domain(pdev);
4004                 if (old_domain) {
4005                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4006                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4007                                 domain_remove_one_dev_info(old_domain, pdev);
4008                         else
4009                                 domain_remove_dev_info(old_domain);
4010                 }
4011         }
4012
4013         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4014                                 pdev->devfn);
4015         if (!iommu)
4016                 return -ENODEV;
4017
4018         /* check if this iommu agaw is sufficient for max mapped address */
4019         addr_width = agaw_to_width(iommu->agaw);
4020         if (addr_width > cap_mgaw(iommu->cap))
4021                 addr_width = cap_mgaw(iommu->cap);
4022
4023         if (dmar_domain->max_addr > (1LL << addr_width)) {
4024                 printk(KERN_ERR "%s: iommu width (%d) is not "
4025                        "sufficient for the mapped address (%llx)\n",
4026                        __func__, addr_width, dmar_domain->max_addr);
4027                 return -EFAULT;
4028         }
4029         dmar_domain->gaw = addr_width;
4030
4031         /*
4032          * Knock out extra levels of page tables if necessary
4033          */
4034         while (iommu->agaw < dmar_domain->agaw) {
4035                 struct dma_pte *pte;
4036
4037                 pte = dmar_domain->pgd;
4038                 if (dma_pte_present(pte)) {
4039                         dmar_domain->pgd = (struct dma_pte *)
4040                                 phys_to_virt(dma_pte_addr(pte));
4041                         free_pgtable_page(pte);
4042                 }
4043                 dmar_domain->agaw--;
4044         }
4045
4046         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4047 }
4048
4049 static void intel_iommu_detach_device(struct iommu_domain *domain,
4050                                       struct device *dev)
4051 {
4052         struct dmar_domain *dmar_domain = domain->priv;
4053         struct pci_dev *pdev = to_pci_dev(dev);
4054
4055         domain_remove_one_dev_info(dmar_domain, pdev);
4056 }
4057
4058 static int intel_iommu_map(struct iommu_domain *domain,
4059                            unsigned long iova, phys_addr_t hpa,
4060                            size_t size, int iommu_prot)
4061 {
4062         struct dmar_domain *dmar_domain = domain->priv;
4063         u64 max_addr;
4064         int prot = 0;
4065         int ret;
4066
4067         if (iommu_prot & IOMMU_READ)
4068                 prot |= DMA_PTE_READ;
4069         if (iommu_prot & IOMMU_WRITE)
4070                 prot |= DMA_PTE_WRITE;
4071         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4072                 prot |= DMA_PTE_SNP;
4073
4074         max_addr = iova + size;
4075         if (dmar_domain->max_addr < max_addr) {
4076                 u64 end;
4077
4078                 /* check if minimum agaw is sufficient for mapped address */
4079                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4080                 if (end < max_addr) {
4081                         printk(KERN_ERR "%s: iommu width (%d) is not "
4082                                "sufficient for the mapped address (%llx)\n",
4083                                __func__, dmar_domain->gaw, max_addr);
4084                         return -EFAULT;
4085                 }
4086                 dmar_domain->max_addr = max_addr;
4087         }
4088         /* Round up size to next multiple of PAGE_SIZE, if it and
4089            the low bits of hpa would take us onto the next page */
4090         size = aligned_nrpages(hpa, size);
4091         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4092                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4093         return ret;
4094 }
4095
4096 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4097                              unsigned long iova, size_t size)
4098 {
4099         struct dmar_domain *dmar_domain = domain->priv;
4100         int order;
4101
4102         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4103                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4104
4105         if (dmar_domain->max_addr == iova + size)
4106                 dmar_domain->max_addr = iova;
4107
4108         return PAGE_SIZE << order;
4109 }
4110
4111 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4112                                             unsigned long iova)
4113 {
4114         struct dmar_domain *dmar_domain = domain->priv;
4115         struct dma_pte *pte;
4116         u64 phys = 0;
4117
4118         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4119         if (pte)
4120                 phys = dma_pte_addr(pte);
4121
4122         return phys;
4123 }
4124
4125 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4126                                       unsigned long cap)
4127 {
4128         struct dmar_domain *dmar_domain = domain->priv;
4129
4130         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4131                 return dmar_domain->iommu_snooping;
4132         if (cap == IOMMU_CAP_INTR_REMAP)
4133                 return irq_remapping_enabled;
4134
4135         return 0;
4136 }
4137
4138 static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4139 {
4140         pci_dev_put(*from);
4141         *from = to;
4142 }
4143
4144 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4145
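/*
 * iommu_ops->add_device: place the device in an iommu_group shared with
 * whichever device actually sources its DMA.  Walk up from the device
 * through legacy bridges, DMA-source quirks and non-ACS-capable
 * multifunction packages until an ACS-protected point (or the root bus)
 * is reached, then join that device's existing group or allocate a new one.
 */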
4146 static int intel_iommu_add_device(struct device *dev)
4147 {
4148         struct pci_dev *pdev = to_pci_dev(dev);
4149         struct pci_dev *bridge, *dma_pdev = NULL;
4150         struct iommu_group *group;
4151         int ret;
4152
4153         if (!device_to_iommu(pci_domain_nr(pdev->bus),
4154                              pdev->bus->number, pdev->devfn))
4155                 return -ENODEV;
4156
4157         bridge = pci_find_upstream_pcie_bridge(pdev);
4158         if (bridge) {
4159                 if (pci_is_pcie(bridge))
4160                         dma_pdev = pci_get_domain_bus_and_slot(
4161                                                 pci_domain_nr(pdev->bus),
4162                                                 bridge->subordinate->number, 0);
4163                 if (!dma_pdev)
4164                         dma_pdev = pci_dev_get(bridge);
4165         } else
4166                 dma_pdev = pci_dev_get(pdev);
4167
4168         /* Account for quirked devices */
4169         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4170
4171         /*
4172          * If it's a multifunction device that does not support our
4173          * required ACS flags, add to the same group as function 0.
4174          */
4175         if (dma_pdev->multifunction &&
4176             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4177                 swap_pci_ref(&dma_pdev,
4178                              pci_get_slot(dma_pdev->bus,
4179                                           PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4180                                           0)));
4181
4182         /*
4183          * Devices on the root bus go through the iommu.  If that's not us,
4184          * find the next upstream device and test ACS up to the root bus.
4185          * Finding the next device may require skipping virtual buses.
4186          */
4187         while (!pci_is_root_bus(dma_pdev->bus)) {
4188                 struct pci_bus *bus = dma_pdev->bus;
4189
4190                 while (!bus->self) {
4191                         if (!pci_is_root_bus(bus))
4192                                 bus = bus->parent;
4193                         else
4194                                 goto root_bus;
4195                 }
4196
4197                 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4198                         break;
4199
4200                 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4201         }
4202
4203 root_bus:
4204         group = iommu_group_get(&dma_pdev->dev);
4205         pci_dev_put(dma_pdev);
4206         if (!group) {
4207                 group = iommu_group_alloc();
4208                 if (IS_ERR(group))
4209                         return PTR_ERR(group);
4210         }
4211
4212         ret = iommu_group_add_device(group, dev);
4213
4214         iommu_group_put(group);
4215         return ret;
4216 }
4217
4218 static void intel_iommu_remove_device(struct device *dev)
4219 {
4220         iommu_group_remove_device(dev);
4221 }
4222
4223 static struct iommu_ops intel_iommu_ops = {
4224         .domain_init    = intel_iommu_domain_init,
4225         .domain_destroy = intel_iommu_domain_destroy,
4226         .attach_dev     = intel_iommu_attach_device,
4227         .detach_dev     = intel_iommu_detach_device,
4228         .map            = intel_iommu_map,
4229         .unmap          = intel_iommu_unmap,
4230         .iova_to_phys   = intel_iommu_iova_to_phys,
4231         .domain_has_cap = intel_iommu_domain_has_cap,
4232         .add_device     = intel_iommu_add_device,
4233         .remove_device  = intel_iommu_remove_device,
4234         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4235 };
4236
4237 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4238 {
4239         /*
4240          * Mobile 4 Series Chipset neglects to set RWBF capability,
4241          * but needs it:
4242          */
4243         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4244         rwbf_quirk = 1;
4245
4246         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4247         if (dev->revision == 0x07) {
4248                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4249                 dmar_map_gfx = 0;
4250         }
4251 }
4252
4253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4254
4255 #define GGC 0x52
4256 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4257 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4258 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4259 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4260 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4261 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4262 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4263 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4264
4265 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4266 {
4267         unsigned short ggc;
4268
4269         if (pci_read_config_word(dev, GGC, &ggc))
4270                 return;
4271
4272         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4273                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4274                 dmar_map_gfx = 0;
4275         } else if (dmar_map_gfx) {
4276                 /* we have to ensure the gfx device is idle before we flush */
4277                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4278                 intel_iommu_strict = 1;
4279         }
4280 }
4281 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4282 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4283 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4284 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4285
4286 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4287    ISOCH DMAR unit for the Azalia sound device, but not give it any
4288    TLB entries, which causes it to deadlock. Check for that.  We do
4289    this in a function called from init_dmars(), instead of in a PCI
4290    quirk, because we don't want to print the obnoxious "BIOS broken"
4291    message if VT-d is actually disabled.
4292 */
4293 static void __init check_tylersburg_isoch(void)
4294 {
4295         struct pci_dev *pdev;
4296         uint32_t vtisochctrl;
4297
4298         /* If there's no Azalia in the system anyway, forget it. */
4299         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4300         if (!pdev)
4301                 return;
4302         pci_dev_put(pdev);
4303
4304         /* System Management Registers. Might be hidden, in which case
4305            we can't do the sanity check. But that's OK, because the
4306            known-broken BIOSes _don't_ actually hide it, so far. */
4307         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4308         if (!pdev)
4309                 return;
4310
4311         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4312                 pci_dev_put(pdev);
4313                 return;
4314         }
4315
4316         pci_dev_put(pdev);
4317
4318         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4319         if (vtisochctrl & 1)
4320                 return;
4321
4322         /* Drop all bits other than the number of TLB entries */
4323         vtisochctrl &= 0x1c;
4324
4325         /* If we have the recommended number of TLB entries (16), fine. */
4326         if (vtisochctrl == 0x10)
4327                 return;
4328
4329         /* Zero TLB entries? You get to ride the short bus to school. */
4330         if (!vtisochctrl) {
4331                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4332                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4333                      dmi_get_system_info(DMI_BIOS_VENDOR),
4334                      dmi_get_system_info(DMI_BIOS_VERSION),
4335                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4336                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4337                 return;
4338         }
4339
4340         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4341                vtisochctrl);
4342 }