2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <asm/irq_remapping.h>
43 #include <asm/cacheflush.h>
44 #include <asm/iommu.h>
46 #include "irq_remapping.h"
49 #define ROOT_SIZE VTD_PAGE_SIZE
50 #define CONTEXT_SIZE VTD_PAGE_SIZE
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56 #define IOAPIC_RANGE_START (0xfee00000)
57 #define IOAPIC_RANGE_END (0xfeefffff)
58 #define IOVA_START_ADDR (0x1000)
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
65 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
71 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
74 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
78 /* page table handling */
79 #define LEVEL_STRIDE (9)
80 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
83 * This bitmap is used to advertise the page sizes our hardware support
84 * to the IOMMU core, which will then use this information to split
85 * physically contiguous memory regions it is mapping into page sizes
88 * Traditionally the IOMMU core just handed us the mappings directly,
89 * after making sure the size is an order of a 4KiB page and that the
90 * mapping has natural alignment.
92 * To retain this behavior, we currently advertise that we support
93 * all page sizes that are an order of 4KiB.
95 * If at some point we'd like to utilize the IOMMU core's new behavior,
96 * we could change this to advertise the real page sizes we support.
98 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
/*
 * Convert an AGAW (adjusted guest address width) encoding into the number
 * of page-table levels: AGAW 1 == 3-level, AGAW 2 == 4-level, etc.
 */
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}
105 static inline int agaw_to_width(int agaw)
107 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
110 static inline int width_to_agaw(int width)
112 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
115 static inline unsigned int level_to_offset_bits(int level)
117 return (level - 1) * LEVEL_STRIDE;
120 static inline int pfn_level_offset(unsigned long pfn, int level)
122 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
/* Mask selecting the PFN bits at @level and above. */
static inline unsigned long level_mask(int level)
{
	return ~0UL << level_to_offset_bits(level);
}
/* Number of 4KiB pages covered by a single entry at @level. */
static inline unsigned long level_size(int level)
{
	unsigned int bits = level_to_offset_bits(level);

	return 1UL << bits;
}
/* Round @pfn up to the next boundary of a @level-sized region. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	unsigned long size = level_size(level);

	return (pfn + size - 1) & level_mask(level);
}
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
142 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
145 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
146 are never going to work. */
147 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
149 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
152 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
154 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
/* VT-d PFN of the first 4KiB page backing @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d PFN of the page containing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
172 * set to 1 to panic kernel if can't successfully enable VT-d
173 * (used when kernel is launched w/ TXT)
175 static int force_on = 0;
180 * 12-63: Context Ptr (12 - (haw-1))
187 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 static inline bool root_present(struct root_entry *root)
190 return (root->val & 1);
192 static inline void set_root_present(struct root_entry *root)
196 static inline void set_root_value(struct root_entry *root, unsigned long value)
198 root->val |= value & VTD_PAGE_MASK;
201 static inline struct context_entry *
202 get_context_addr_from_root(struct root_entry *root)
204 return (struct context_entry *)
205 (root_present(root)?phys_to_virt(
206 root->val & VTD_PAGE_MASK) :
213 * 1: fault processing disable
214 * 2-3: translation type
215 * 12-63: address space root
221 struct context_entry {
226 static inline bool context_present(struct context_entry *context)
228 return (context->lo & 1);
230 static inline void context_set_present(struct context_entry *context)
235 static inline void context_set_fault_enable(struct context_entry *context)
237 context->lo &= (((u64)-1) << 2) | 1;
240 static inline void context_set_translation_type(struct context_entry *context,
243 context->lo &= (((u64)-1) << 4) | 3;
244 context->lo |= (value & 3) << 2;
247 static inline void context_set_address_root(struct context_entry *context,
250 context->lo |= value & VTD_PAGE_MASK;
253 static inline void context_set_address_width(struct context_entry *context,
256 context->hi |= value & 7;
259 static inline void context_set_domain_id(struct context_entry *context,
262 context->hi |= (value & ((1 << 16) - 1)) << 8;
265 static inline void context_clear_entry(struct context_entry *context)
278 * 12-63: Host physcial address
284 static inline void dma_clear_pte(struct dma_pte *pte)
289 static inline u64 dma_pte_addr(struct dma_pte *pte)
292 return pte->val & VTD_PAGE_MASK;
294 /* Must have a full atomic 64-bit read */
295 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
299 static inline bool dma_pte_present(struct dma_pte *pte)
301 return (pte->val & 3) != 0;
304 static inline bool dma_pte_superpage(struct dma_pte *pte)
306 return (pte->val & (1 << 7));
309 static inline int first_pte_in_page(struct dma_pte *pte)
311 return !((unsigned long)pte & ~VTD_PAGE_MASK);
315 * This domain is a statically identity mapping domain.
316 * 1. This domain creats a static 1:1 mapping to all usable memory.
317 * 2. It maps to each iommu if successful.
318 * 3. Each iommu mapps to this domain if successful.
320 static struct dmar_domain *si_domain;
321 static int hw_pass_through = 1;
323 /* devices under the same p2p bridge are owned in one domain */
324 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
326 /* domain represents a virtual machine, more than one devices
327 * across iommus may be owned in one domain, e.g. kvm guest.
329 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
331 /* si_domain contains mulitple devices */
332 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
334 /* define the limit of IOMMUs supported in each domain */
336 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
338 # define IOMMU_UNITS_SUPPORTED 64
342 int id; /* domain id */
343 int nid; /* node id */
344 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
345 /* bitmap of iommus this domain uses*/
347 struct list_head devices; /* all devices' list */
348 struct iova_domain iovad; /* iova's that belong to this domain */
350 struct dma_pte *pgd; /* virtual address */
351 int gaw; /* max guest address width */
353 /* adjusted guest address width, 0 is level 2 30-bit */
356 int flags; /* flags to find out type of domain */
358 int iommu_coherency;/* indicate coherency of iommu access */
359 int iommu_snooping; /* indicate snooping control feature*/
360 int iommu_count; /* reference count of iommu */
361 int iommu_superpage;/* Level of superpages supported:
362 0 == 4KiB (no superpages), 1 == 2MiB,
363 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
364 spinlock_t iommu_lock; /* protect iommu set in domain */
365 u64 max_addr; /* maximum mapped address */
368 /* PCI domain-device relationship */
369 struct device_domain_info {
370 struct list_head link; /* link to domain siblings */
371 struct list_head global; /* link to global list */
372 int segment; /* PCI domain */
373 u8 bus; /* PCI bus number */
374 u8 devfn; /* PCI devfn number */
375 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
376 struct intel_iommu *iommu; /* IOMMU used by this device */
377 struct dmar_domain *domain; /* pointer to domain */
380 struct dmar_rmrr_unit {
381 struct list_head list; /* list of rmrr units */
382 struct acpi_dmar_header *hdr; /* ACPI header */
383 u64 base_address; /* reserved base address*/
384 u64 end_address; /* reserved end address */
385 struct dmar_dev_scope *devices; /* target devices */
386 int devices_cnt; /* target device count */
389 struct dmar_atsr_unit {
390 struct list_head list; /* list of ATSR units */
391 struct acpi_dmar_header *hdr; /* ACPI header */
392 struct dmar_dev_scope *devices; /* target devices */
393 int devices_cnt; /* target device count */
394 u8 include_all:1; /* include all ports */
397 static LIST_HEAD(dmar_atsr_units);
398 static LIST_HEAD(dmar_rmrr_units);
400 #define for_each_rmrr_units(rmrr) \
401 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
403 static void flush_unmaps_timeout(unsigned long data);
405 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
407 #define HIGH_WATER_MARK 250
408 struct deferred_flush_tables {
410 struct iova *iova[HIGH_WATER_MARK];
411 struct dmar_domain *domain[HIGH_WATER_MARK];
412 struct page *freelist[HIGH_WATER_MARK];
415 static struct deferred_flush_tables *deferred_flush;
417 /* bitmap for indexing intel_iommus */
418 static int g_num_of_iommus;
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
424 static long list_size;
426 static void domain_exit(struct dmar_domain *domain);
427 static void domain_remove_dev_info(struct dmar_domain *domain);
428 static void domain_remove_one_dev_info(struct dmar_domain *domain,
429 struct pci_dev *pdev);
430 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
431 struct pci_dev *pdev);
433 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
434 int dmar_disabled = 0;
436 int dmar_disabled = 1;
437 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
439 int intel_iommu_enabled = 0;
440 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
442 static int dmar_map_gfx = 1;
443 static int dmar_forcedac;
444 static int intel_iommu_strict;
445 static int intel_iommu_superpage = 1;
447 int intel_iommu_gfx_mapped;
448 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
450 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
451 static DEFINE_SPINLOCK(device_domain_lock);
452 static LIST_HEAD(device_domain_list);
454 static struct iommu_ops intel_iommu_ops;
456 static int __init intel_iommu_setup(char *str)
461 if (!strncmp(str, "on", 2)) {
463 printk(KERN_INFO "Intel-IOMMU: enabled\n");
464 } else if (!strncmp(str, "off", 3)) {
466 printk(KERN_INFO "Intel-IOMMU: disabled\n");
467 } else if (!strncmp(str, "igfx_off", 8)) {
470 "Intel-IOMMU: disable GFX device mapping\n");
471 } else if (!strncmp(str, "forcedac", 8)) {
473 "Intel-IOMMU: Forcing DAC for PCI devices\n");
475 } else if (!strncmp(str, "strict", 6)) {
477 "Intel-IOMMU: disable batched IOTLB flush\n");
478 intel_iommu_strict = 1;
479 } else if (!strncmp(str, "sp_off", 6)) {
481 "Intel-IOMMU: disable supported super page\n");
482 intel_iommu_superpage = 0;
485 str += strcspn(str, ",");
491 __setup("intel_iommu=", intel_iommu_setup);
493 static struct kmem_cache *iommu_domain_cache;
494 static struct kmem_cache *iommu_devinfo_cache;
495 static struct kmem_cache *iommu_iova_cache;
497 static inline void *alloc_pgtable_page(int node)
502 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
504 vaddr = page_address(page);
508 static inline void free_pgtable_page(void *vaddr)
510 free_page((unsigned long)vaddr);
513 static inline void *alloc_domain_mem(void)
515 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
518 static void free_domain_mem(void *vaddr)
520 kmem_cache_free(iommu_domain_cache, vaddr);
523 static inline void * alloc_devinfo_mem(void)
525 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
528 static inline void free_devinfo_mem(void *vaddr)
530 kmem_cache_free(iommu_devinfo_cache, vaddr);
533 struct iova *alloc_iova_mem(void)
535 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
538 void free_iova_mem(struct iova *iova)
540 kmem_cache_free(iommu_iova_cache, iova);
544 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
549 sagaw = cap_sagaw(iommu->cap);
550 for (agaw = width_to_agaw(max_gaw);
552 if (test_bit(agaw, &sagaw))
560 * Calculate max SAGAW for each iommu.
562 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
564 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
568 * calculate agaw for each iommu.
569 * "SAGAW" may be different across iommus, use a default agaw, and
570 * get a supported less agaw for iommus that don't support the default agaw.
572 int iommu_calculate_agaw(struct intel_iommu *iommu)
574 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
577 /* This functionin only returns single iommu in a domain */
578 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
582 /* si_domain and vm domain should not get here. */
583 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
584 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
586 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
587 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
590 return g_iommus[iommu_id];
593 static void domain_update_iommu_coherency(struct dmar_domain *domain)
595 struct dmar_drhd_unit *drhd;
596 struct intel_iommu *iommu;
599 domain->iommu_coherency = 1;
601 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
603 if (!ecap_coherent(g_iommus[i]->ecap)) {
604 domain->iommu_coherency = 0;
611 /* No hardware attached; use lowest common denominator */
613 for_each_active_iommu(iommu, drhd) {
614 if (!ecap_coherent(iommu->ecap)) {
615 domain->iommu_coherency = 0;
622 static void domain_update_iommu_snooping(struct dmar_domain *domain)
626 domain->iommu_snooping = 1;
628 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
629 if (!ecap_sc_support(g_iommus[i]->ecap)) {
630 domain->iommu_snooping = 0;
636 static void domain_update_iommu_superpage(struct dmar_domain *domain)
638 struct dmar_drhd_unit *drhd;
639 struct intel_iommu *iommu = NULL;
642 if (!intel_iommu_superpage) {
643 domain->iommu_superpage = 0;
647 /* set iommu_superpage to the smallest common denominator */
649 for_each_active_iommu(iommu, drhd) {
650 mask &= cap_super_page_val(iommu->cap);
657 domain->iommu_superpage = fls(mask);
660 /* Some capabilities may be different across iommus */
661 static void domain_update_iommu_cap(struct dmar_domain *domain)
663 domain_update_iommu_coherency(domain);
664 domain_update_iommu_snooping(domain);
665 domain_update_iommu_superpage(domain);
668 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
670 struct dmar_drhd_unit *drhd = NULL;
671 struct intel_iommu *iommu;
673 struct pci_dev *pdev;
677 for_each_active_iommu(iommu, drhd) {
678 if (segment != drhd->segment)
681 for_each_active_dev_scope(drhd->devices,
682 drhd->devices_cnt, i, dev) {
683 if (!dev_is_pci(dev))
685 pdev = to_pci_dev(dev);
686 if (pdev->bus->number == bus && pdev->devfn == devfn)
688 if (pdev->subordinate &&
689 pdev->subordinate->number <= bus &&
690 pdev->subordinate->busn_res.end >= bus)
694 if (drhd->include_all)
704 static void domain_flush_cache(struct dmar_domain *domain,
705 void *addr, int size)
707 if (!domain->iommu_coherency)
708 clflush_cache_range(addr, size);
711 /* Gets context entry for a given bus and devfn */
712 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
715 struct root_entry *root;
716 struct context_entry *context;
717 unsigned long phy_addr;
720 spin_lock_irqsave(&iommu->lock, flags);
721 root = &iommu->root_entry[bus];
722 context = get_context_addr_from_root(root);
724 context = (struct context_entry *)
725 alloc_pgtable_page(iommu->node);
727 spin_unlock_irqrestore(&iommu->lock, flags);
730 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
731 phy_addr = virt_to_phys((void *)context);
732 set_root_value(root, phy_addr);
733 set_root_present(root);
734 __iommu_flush_cache(iommu, root, sizeof(*root));
736 spin_unlock_irqrestore(&iommu->lock, flags);
737 return &context[devfn];
740 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
742 struct root_entry *root;
743 struct context_entry *context;
747 spin_lock_irqsave(&iommu->lock, flags);
748 root = &iommu->root_entry[bus];
749 context = get_context_addr_from_root(root);
754 ret = context_present(&context[devfn]);
756 spin_unlock_irqrestore(&iommu->lock, flags);
760 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
762 struct root_entry *root;
763 struct context_entry *context;
766 spin_lock_irqsave(&iommu->lock, flags);
767 root = &iommu->root_entry[bus];
768 context = get_context_addr_from_root(root);
770 context_clear_entry(&context[devfn]);
771 __iommu_flush_cache(iommu, &context[devfn], \
774 spin_unlock_irqrestore(&iommu->lock, flags);
777 static void free_context_table(struct intel_iommu *iommu)
779 struct root_entry *root;
782 struct context_entry *context;
784 spin_lock_irqsave(&iommu->lock, flags);
785 if (!iommu->root_entry) {
788 for (i = 0; i < ROOT_ENTRY_NR; i++) {
789 root = &iommu->root_entry[i];
790 context = get_context_addr_from_root(root);
792 free_pgtable_page(context);
794 free_pgtable_page(iommu->root_entry);
795 iommu->root_entry = NULL;
797 spin_unlock_irqrestore(&iommu->lock, flags);
800 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
801 unsigned long pfn, int *target_level)
803 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
804 struct dma_pte *parent, *pte = NULL;
805 int level = agaw_to_level(domain->agaw);
808 BUG_ON(!domain->pgd);
810 if (addr_width < BITS_PER_LONG && pfn >> addr_width)
811 /* Address beyond IOMMU's addressing capabilities. */
814 parent = domain->pgd;
819 offset = pfn_level_offset(pfn, level);
820 pte = &parent[offset];
821 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
823 if (level == *target_level)
826 if (!dma_pte_present(pte)) {
829 tmp_page = alloc_pgtable_page(domain->nid);
834 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
835 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
836 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
837 /* Someone else set it while we were thinking; use theirs. */
838 free_pgtable_page(tmp_page);
841 domain_flush_cache(domain, pte, sizeof(*pte));
847 parent = phys_to_virt(dma_pte_addr(pte));
852 *target_level = level;
858 /* return address's pte at specific level */
859 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
861 int level, int *large_page)
863 struct dma_pte *parent, *pte = NULL;
864 int total = agaw_to_level(domain->agaw);
867 parent = domain->pgd;
868 while (level <= total) {
869 offset = pfn_level_offset(pfn, total);
870 pte = &parent[offset];
874 if (!dma_pte_present(pte)) {
879 if (pte->val & DMA_PTE_LARGE_PAGE) {
884 parent = phys_to_virt(dma_pte_addr(pte));
890 /* clear last level pte, a tlb flush should be followed */
891 static void dma_pte_clear_range(struct dmar_domain *domain,
892 unsigned long start_pfn,
893 unsigned long last_pfn)
895 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
896 unsigned int large_page = 1;
897 struct dma_pte *first_pte, *pte;
899 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
900 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
901 BUG_ON(start_pfn > last_pfn);
903 /* we don't need lock here; nobody else touches the iova range */
906 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
908 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
913 start_pfn += lvl_to_nr_pages(large_page);
915 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
917 domain_flush_cache(domain, first_pte,
918 (void *)pte - (void *)first_pte);
920 } while (start_pfn && start_pfn <= last_pfn);
923 static void dma_pte_free_level(struct dmar_domain *domain, int level,
924 struct dma_pte *pte, unsigned long pfn,
925 unsigned long start_pfn, unsigned long last_pfn)
927 pfn = max(start_pfn, pfn);
928 pte = &pte[pfn_level_offset(pfn, level)];
931 unsigned long level_pfn;
932 struct dma_pte *level_pte;
934 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
937 level_pfn = pfn & level_mask(level - 1);
938 level_pte = phys_to_virt(dma_pte_addr(pte));
941 dma_pte_free_level(domain, level - 1, level_pte,
942 level_pfn, start_pfn, last_pfn);
944 /* If range covers entire pagetable, free it */
945 if (!(start_pfn > level_pfn ||
946 last_pfn < level_pfn + level_size(level) - 1)) {
948 domain_flush_cache(domain, pte, sizeof(*pte));
949 free_pgtable_page(level_pte);
952 pfn += level_size(level);
953 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
956 /* free page table pages. last level pte should already be cleared */
957 static void dma_pte_free_pagetable(struct dmar_domain *domain,
958 unsigned long start_pfn,
959 unsigned long last_pfn)
961 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
963 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
964 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
965 BUG_ON(start_pfn > last_pfn);
967 /* We don't need lock here; nobody else touches the iova range */
968 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
969 domain->pgd, 0, start_pfn, last_pfn);
972 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
973 free_pgtable_page(domain->pgd);
978 /* When a page at a given level is being unlinked from its parent, we don't
979 need to *modify* it at all. All we need to do is make a list of all the
980 pages which can be freed just as soon as we've flushed the IOTLB and we
981 know the hardware page-walk will no longer touch them.
982 The 'pte' argument is the *parent* PTE, pointing to the page that is to
984 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
985 int level, struct dma_pte *pte,
986 struct page *freelist)
990 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
991 pg->freelist = freelist;
997 for (pte = page_address(pg); !first_pte_in_page(pte); pte++) {
998 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
999 freelist = dma_pte_list_pagetables(domain, level - 1,
1006 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1007 struct dma_pte *pte, unsigned long pfn,
1008 unsigned long start_pfn,
1009 unsigned long last_pfn,
1010 struct page *freelist)
1012 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1014 pfn = max(start_pfn, pfn);
1015 pte = &pte[pfn_level_offset(pfn, level)];
1018 unsigned long level_pfn;
1020 if (!dma_pte_present(pte))
1023 level_pfn = pfn & level_mask(level);
1025 /* If range covers entire pagetable, free it */
1026 if (start_pfn <= level_pfn &&
1027 last_pfn >= level_pfn + level_size(level) - 1) {
1028 /* These suborbinate page tables are going away entirely. Don't
1029 bother to clear them; we're just going to *free* them. */
1030 if (level > 1 && !dma_pte_superpage(pte))
1031 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1037 } else if (level > 1) {
1038 /* Recurse down into a level that isn't *entirely* obsolete */
1039 freelist = dma_pte_clear_level(domain, level - 1,
1040 phys_to_virt(dma_pte_addr(pte)),
1041 level_pfn, start_pfn, last_pfn,
1045 pfn += level_size(level);
1046 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1049 domain_flush_cache(domain, first_pte,
1050 (void *)++last_pte - (void *)first_pte);
1055 /* We can't just free the pages because the IOMMU may still be walking
1056 the page tables, and may have cached the intermediate levels. The
1057 pages can only be freed after the IOTLB flush has been done. */
1058 struct page *domain_unmap(struct dmar_domain *domain,
1059 unsigned long start_pfn,
1060 unsigned long last_pfn)
1062 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1063 struct page *freelist = NULL;
1065 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
1066 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
1067 BUG_ON(start_pfn > last_pfn);
1069 /* we don't need lock here; nobody else touches the iova range */
1070 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1071 domain->pgd, 0, start_pfn, last_pfn, NULL);
1074 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1075 struct page *pgd_page = virt_to_page(domain->pgd);
1076 pgd_page->freelist = freelist;
1077 freelist = pgd_page;
1085 void dma_free_pagelist(struct page *freelist)
1089 while ((pg = freelist)) {
1090 freelist = pg->freelist;
1091 free_pgtable_page(page_address(pg));
1095 /* iommu handling */
1096 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1098 struct root_entry *root;
1099 unsigned long flags;
1101 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1105 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1107 spin_lock_irqsave(&iommu->lock, flags);
1108 iommu->root_entry = root;
1109 spin_unlock_irqrestore(&iommu->lock, flags);
1114 static void iommu_set_root_entry(struct intel_iommu *iommu)
1120 addr = iommu->root_entry;
1122 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1123 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1125 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1127 /* Make sure hardware complete it */
1128 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1129 readl, (sts & DMA_GSTS_RTPS), sts);
1131 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1134 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1139 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1142 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1143 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1145 /* Make sure hardware complete it */
1146 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1147 readl, (!(val & DMA_GSTS_WBFS)), val);
1149 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1152 /* return value determine if we need a write buffer flush */
1153 static void __iommu_flush_context(struct intel_iommu *iommu,
1154 u16 did, u16 source_id, u8 function_mask,
1161 case DMA_CCMD_GLOBAL_INVL:
1162 val = DMA_CCMD_GLOBAL_INVL;
1164 case DMA_CCMD_DOMAIN_INVL:
1165 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1167 case DMA_CCMD_DEVICE_INVL:
1168 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1169 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1174 val |= DMA_CCMD_ICC;
1176 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1177 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1179 /* Make sure hardware complete it */
1180 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1181 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1183 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1186 /* return value determine if we need a write buffer flush */
1187 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1188 u64 addr, unsigned int size_order, u64 type)
1190 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1191 u64 val = 0, val_iva = 0;
1195 case DMA_TLB_GLOBAL_FLUSH:
1196 /* global flush doesn't need set IVA_REG */
1197 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1199 case DMA_TLB_DSI_FLUSH:
1200 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1202 case DMA_TLB_PSI_FLUSH:
1203 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1204 /* IH bit is passed in as part of address */
1205 val_iva = size_order | addr;
1210 /* Note: set drain read/write */
1213 * This is probably to be super secure.. Looks like we can
1214 * ignore it without any impact.
1216 if (cap_read_drain(iommu->cap))
1217 val |= DMA_TLB_READ_DRAIN;
1219 if (cap_write_drain(iommu->cap))
1220 val |= DMA_TLB_WRITE_DRAIN;
1222 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1223 /* Note: Only uses first TLB reg currently */
1225 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1226 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1228 /* Make sure hardware complete it */
1229 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1230 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1232 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1234 /* check IOTLB invalidation granularity */
1235 if (DMA_TLB_IAIG(val) == 0)
1236 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1237 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1238 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1239 (unsigned long long)DMA_TLB_IIRG(type),
1240 (unsigned long long)DMA_TLB_IAIG(val));
1243 static struct device_domain_info *iommu_support_dev_iotlb(
1244 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1247 unsigned long flags;
1248 struct device_domain_info *info;
1249 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1251 if (!ecap_dev_iotlb_support(iommu->ecap))
1257 spin_lock_irqsave(&device_domain_lock, flags);
1258 list_for_each_entry(info, &domain->devices, link)
1259 if (info->bus == bus && info->devfn == devfn) {
1263 spin_unlock_irqrestore(&device_domain_lock, flags);
1265 if (!found || !info->dev)
1268 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1271 if (!dmar_find_matched_atsr_unit(info->dev))
1274 info->iommu = iommu;
1279 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1284 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1287 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1289 if (!info->dev || !pci_ats_enabled(info->dev))
1292 pci_disable_ats(info->dev);
1295 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1296 u64 addr, unsigned mask)
1299 unsigned long flags;
1300 struct device_domain_info *info;
1302 spin_lock_irqsave(&device_domain_lock, flags);
1303 list_for_each_entry(info, &domain->devices, link) {
1304 if (!info->dev || !pci_ats_enabled(info->dev))
1307 sid = info->bus << 8 | info->devfn;
1308 qdep = pci_ats_queue_depth(info->dev);
1309 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1311 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Page-selective IOTLB flush for domain @did covering @pages pages starting
 * at @pfn.  Falls back to a domain-selective flush when the hardware lacks
 * page-selective invalidation or the (power-of-two rounded) range exceeds
 * the maximum address mask.  @ih is the invalidation-hint bit ORed into the
 * address; @map indicates a non-present-to-present change.
 */
1314 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1315 unsigned long pfn, unsigned int pages, int ih, int map)
/* PSI requires a power-of-two, naturally aligned range: round up */
1317 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1318 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1325 * Fallback to domain selective flush if no PSI support or the size is
1327 * PSI requires page size to be 2 ^ x, and the base address is naturally
1328 * aligned to the size
1330 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1331 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1334 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1338 * In caching mode, changes of pages from non-present to present require
1339 * flush. However, device IOTLB doesn't need to be flushed in this case.
1341 if (!cap_caching_mode(iommu->cap) || !map)
1342 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
/*
 * Clear the Enable Protected Memory bit in the PMEN register and wait for
 * the Protected Region Status bit to clear, so DMA to the protected low/high
 * memory regions is no longer blocked.  Register access is serialized by
 * the raw register_lock.
 */
1345 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1348 unsigned long flags;
1350 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1351 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1352 pmen &= ~DMA_PMEN_EPM;
1353 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1355 /* wait for the protected region status bit to clear */
1356 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1357 readl, !(pmen & DMA_PMEN_PRS), pmen);
1359 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Set the Translation Enable bit in the global command register and spin
 * until the global status register reports translation enabled.  The
 * cached gcmd value is updated so later writes preserve this bit.
 */
1362 static int iommu_enable_translation(struct intel_iommu *iommu)
1365 unsigned long flags;
1367 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1368 iommu->gcmd |= DMA_GCMD_TE;
1369 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1371 /* Make sure hardware complete it */
1372 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1373 readl, (sts & DMA_GSTS_TES), sts);
1375 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Clear the Translation Enable bit and wait until the global status
 * register confirms translation is off.  Mirror of
 * iommu_enable_translation().
 */
1379 static int iommu_disable_translation(struct intel_iommu *iommu)
1384 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1385 iommu->gcmd &= ~DMA_GCMD_TE;
1386 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1388 /* Make sure hardware complete it */
1389 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1390 readl, (!(sts & DMA_GSTS_TES)), sts);
1392 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Allocate the per-IOMMU domain-id bitmap and domain pointer array, sized
 * from the hardware's number-of-domains capability.  In caching mode,
 * domain id 0 is reserved because hardware tags invalid translations with
 * it.  On failure the partially allocated state is freed and an error is
 * presumably returned — confirm against full source.
 */
1397 static int iommu_init_domains(struct intel_iommu *iommu)
1399 unsigned long ndomains;
1400 unsigned long nlongs;
1402 ndomains = cap_ndoms(iommu->cap);
1403 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1404 iommu->seq_id, ndomains);
1405 nlongs = BITS_TO_LONGS(ndomains);
1407 spin_lock_init(&iommu->lock);
1409 /* TBD: there might be 64K domains,
1410 * consider other allocation for future chip
1412 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1413 if (!iommu->domain_ids) {
1414 pr_err("IOMMU%d: allocating domain id array failed\n",
1418 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1420 if (!iommu->domains) {
1421 pr_err("IOMMU%d: allocating domain array failed\n",
/* Roll back the id bitmap so the IOMMU is left in a clean state */
1423 kfree(iommu->domain_ids);
1424 iommu->domain_ids = NULL;
1429 * if Caching mode is set, then invalid translations are tagged
1430 * with domainid 0. Hence we need to pre-allocate it.
1432 if (cap_caching_mode(iommu->cap))
1433 set_bit(0, iommu->domain_ids);
/*
 * Tear down all state attached to @iommu: drop every domain still bound to
 * it (destroying a domain when its iommu refcount reaches zero —
 * presumably gated on count, confirm against full source), disable
 * translation if it is on, free the domain bookkeeping arrays, clear the
 * global g_iommus slot, and free the context table.
 */
1437 static void free_dmar_iommu(struct intel_iommu *iommu)
1439 struct dmar_domain *domain;
1441 unsigned long flags;
1443 if ((iommu->domains) && (iommu->domain_ids)) {
1444 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1446 * Domain id 0 is reserved for invalid translation
1447 * if hardware supports caching mode.
1449 if (cap_caching_mode(iommu->cap) && i == 0)
1452 domain = iommu->domains[i];
1453 clear_bit(i, iommu->domain_ids);
/* Drop this IOMMU's reference on the domain under its lock */
1455 spin_lock_irqsave(&domain->iommu_lock, flags);
1456 count = --domain->iommu_count;
1457 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1459 domain_exit(domain);
/* Quiesce the hardware before freeing its data structures */
1463 if (iommu->gcmd & DMA_GCMD_TE)
1464 iommu_disable_translation(iommu);
1466 kfree(iommu->domains);
1467 kfree(iommu->domain_ids);
1468 iommu->domains = NULL;
1469 iommu->domain_ids = NULL;
1471 g_iommus[iommu->seq_id] = NULL;
1473 /* free context mapping */
1474 free_context_table(iommu);
/*
 * Allocate and initialize a dmar_domain.  When @vm is true the domain is a
 * virtual-machine domain: it gets an id from the private vm_domid counter
 * (never programmed into context entries) and the VIRTUAL_MACHINE flag.
 */
1477 static struct dmar_domain *alloc_domain(bool vm)
1479 /* domain id for virtual machine, it won't be set in context */
1480 static atomic_t vm_domid = ATOMIC_INIT(0);
1481 struct dmar_domain *domain;
1483 domain = alloc_domain_mem();
1488 domain->iommu_count = 0;
1489 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1491 spin_lock_init(&domain->iommu_lock);
1492 INIT_LIST_HEAD(&domain->devices);
1494 domain->id = atomic_inc_return(&vm_domid);
1495 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
/*
 * Attach @domain to @iommu: claim the first free hardware domain id from
 * the IOMMU's bitmap, record the domain in the IOMMU's domain array, mark
 * this IOMMU in the domain's iommu_bmp and bump its refcount.  Fails when
 * all domain ids are in use.
 */
1501 static int iommu_attach_domain(struct dmar_domain *domain,
1502 struct intel_iommu *iommu)
1505 unsigned long ndomains;
1506 unsigned long flags;
1508 ndomains = cap_ndoms(iommu->cap);
1510 spin_lock_irqsave(&iommu->lock, flags);
1512 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1513 if (num >= ndomains) {
1514 spin_unlock_irqrestore(&iommu->lock, flags);
1515 printk(KERN_ERR "IOMMU: no free domain ids\n");
1520 domain->iommu_count++;
1521 set_bit(num, iommu->domain_ids);
1522 set_bit(iommu->seq_id, domain->iommu_bmp);
1523 iommu->domains[num] = domain;
1524 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Detach @domain from @iommu: scan the IOMMU's used domain ids for the one
 * pointing at @domain, clear that id and its slot in the domain array.
 */
1529 static void iommu_detach_domain(struct dmar_domain *domain,
1530 struct intel_iommu *iommu)
1532 unsigned long flags;
1535 spin_lock_irqsave(&iommu->lock, flags);
1536 ndomains = cap_ndoms(iommu->cap);
1537 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1538 if (iommu->domains[num] == domain) {
1539 clear_bit(num, iommu->domain_ids);
1540 iommu->domains[num] = NULL;
1544 spin_unlock_irqrestore(&iommu->lock, flags);
/* Global IOVA ranges (IOAPIC, PCI MMIO) that must never be handed out for
 * DMA; copied into every new domain.  The lockdep class key gives the
 * reserved tree's lock its own class. */
1547 static struct iova_domain reserved_iova_list;
1548 static struct lock_class_key reserved_rbtree_key;
/*
 * Build the global reserved IOVA list: the IOAPIC MMIO window and every
 * PCI device's memory BARs (to prevent DMA aliasing peer device MMIO).
 * Called once during initialization.
 */
1550 static int dmar_init_reserved_ranges(void)
1552 struct pci_dev *pdev = NULL;
1556 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
/* Give the reserved tree's lock its own lockdep class */
1558 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1559 &reserved_rbtree_key);
1561 /* IOAPIC ranges shouldn't be accessed by DMA */
1562 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1563 IOVA_PFN(IOAPIC_RANGE_END));
1565 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1569 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1570 for_each_pci_dev(pdev) {
1573 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1574 r = &pdev->resource[i];
/* Only memory resources matter; skip unassigned or I/O port BARs */
1575 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1577 iova = reserve_iova(&reserved_iova_list,
1581 printk(KERN_ERR "Reserve iova failed\n");
/* Seed a new domain's IOVA allocator with the globally reserved ranges. */
1589 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1591 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/*
 * Convert a guest address width to an adjusted width aligned to the VT-d
 * page-table level stride: widths are 12 + a multiple of 9 bits (one
 * 9-bit index per level above the 4KiB page offset).  Presumably rounds
 * @gaw up when the remainder is non-zero — confirm against full source.
 */
1594 static inline int guestwidth_to_adjustwidth(int gaw)
1597 int r = (gaw - 12) % 9;
/*
 * Initialize a freshly attached domain: set up its IOVA allocator with the
 * reserved ranges, compute the adjusted guest address width (AGAW), clamped
 * to hardware MGAW and promoted to the next supported AGAW if needed, cache
 * per-IOMMU capability flags (coherency, snoop control, superpage levels),
 * and allocate the top-level page directory.
 */
1608 static int domain_init(struct dmar_domain *domain, int guest_width)
1610 struct intel_iommu *iommu;
1611 int adjust_width, agaw;
1612 unsigned long sagaw;
1614 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1615 domain_reserve_special_ranges(domain);
1617 /* calculate AGAW */
1618 iommu = domain_get_iommu(domain);
/* Guest width can never exceed the hardware's maximum guest address width */
1619 if (guest_width > cap_mgaw(iommu->cap))
1620 guest_width = cap_mgaw(iommu->cap);
1621 domain->gaw = guest_width;
1622 adjust_width = guestwidth_to_adjustwidth(guest_width);
1623 agaw = width_to_agaw(adjust_width);
1624 sagaw = cap_sagaw(iommu->cap);
1625 if (!test_bit(agaw, &sagaw)) {
1626 /* hardware doesn't support it, choose a bigger one */
1627 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1628 agaw = find_next_bit(&sagaw, 5, agaw);
1632 domain->agaw = agaw;
1634 if (ecap_coherent(iommu->ecap))
1635 domain->iommu_coherency = 1;
1637 domain->iommu_coherency = 0;
1639 if (ecap_sc_support(iommu->ecap))
1640 domain->iommu_snooping = 1;
1642 domain->iommu_snooping = 0;
/* Superpage level: highest supported, or 0 when disabled by module param */
1644 if (intel_iommu_superpage)
1645 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1647 domain->iommu_superpage = 0;
1649 domain->nid = iommu->node;
1651 /* always allocate the top pgd */
1652 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
/* Push the new (zeroed) pgd out of CPU caches for non-coherent hardware */
1655 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
/*
 * Destroy @domain: flush lazy unmaps that may still reference it, detach
 * all devices, release its IOVA allocator, unmap its entire address range
 * (collecting page-table pages into a freelist), detach it from every
 * IOMMU it was attached to, then free the collected pages and the domain
 * itself.
 */
1659 static void domain_exit(struct dmar_domain *domain)
1661 struct dmar_drhd_unit *drhd;
1662 struct intel_iommu *iommu;
1663 struct page *freelist = NULL;
1665 /* Domain 0 is reserved, so dont process it */
1669 /* Flush any lazy unmaps that may reference this domain */
1670 if (!intel_iommu_strict)
1671 flush_unmaps_timeout(0);
1673 /* remove associated devices */
1674 domain_remove_dev_info(domain);
1677 put_iova_domain(&domain->iovad);
1679 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1681 /* clear attached or cached domains */
1683 for_each_active_iommu(iommu, drhd)
/* VM domains may be cached on IOMMUs not marked in iommu_bmp, so check both */
1684 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1685 test_bit(iommu->seq_id, domain->iommu_bmp))
1686 iommu_detach_domain(domain, iommu);
1689 dma_free_pagelist(freelist);
1691 free_domain_mem(domain);
/*
 * Program the context entry for segment:bus.devfn to point at @domain's
 * page tables with the requested @translation type (multi-level,
 * pass-through, or dev-IOTLB).  For VM/static-identity domains a per-IOMMU
 * domain id is looked up or allocated on the fly.  After the entry is made
 * present, caching-mode hardware requires context + DSI IOTLB flushes;
 * otherwise a write-buffer flush suffices.  Finally the IOMMU is accounted
 * in the domain (iommu_bmp/iommu_count) and capabilities re-derived.
 */
1694 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1695 u8 bus, u8 devfn, int translation)
1697 struct context_entry *context;
1698 unsigned long flags;
1699 struct intel_iommu *iommu;
1700 struct dma_pte *pgd;
1702 unsigned long ndomains;
1705 struct device_domain_info *info = NULL;
1707 pr_debug("Set context mapping for %02x:%02x.%d\n",
1708 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1710 BUG_ON(!domain->pgd);
1711 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1712 translation != CONTEXT_TT_MULTI_LEVEL);
1714 iommu = device_to_iommu(segment, bus, devfn);
1718 context = device_to_context_entry(iommu, bus, devfn);
1721 spin_lock_irqsave(&iommu->lock, flags);
/* Already mapped by someone else — nothing to do */
1722 if (context_present(context)) {
1723 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * VM and static-identity domains use software-private ids; find or
 * allocate the hardware domain id for this particular IOMMU.
 */
1730 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1731 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1734 /* find an available domain id for this device in iommu */
1735 ndomains = cap_ndoms(iommu->cap);
1736 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1737 if (iommu->domains[num] == domain) {
1745 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1746 if (num >= ndomains) {
1747 spin_unlock_irqrestore(&iommu->lock, flags);
1748 printk(KERN_ERR "IOMMU: no free domain ids\n");
1752 set_bit(num, iommu->domain_ids);
1753 iommu->domains[num] = domain;
1757 /* Skip top levels of page tables for
1758 * iommu which has less agaw than default.
1759 * Unnecessary for PT mode.
1761 if (translation != CONTEXT_TT_PASS_THROUGH) {
1762 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1763 pgd = phys_to_virt(dma_pte_addr(pgd));
1764 if (!dma_pte_present(pgd)) {
1765 spin_unlock_irqrestore(&iommu->lock, flags);
1772 context_set_domain_id(context, id);
/* Upgrade to dev-IOTLB translation if the device supports ATS */
1774 if (translation != CONTEXT_TT_PASS_THROUGH) {
1775 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1776 translation = info ? CONTEXT_TT_DEV_IOTLB :
1777 CONTEXT_TT_MULTI_LEVEL;
1780 * In pass through mode, AW must be programmed to indicate the largest
1781 * AGAW value supported by hardware. And ASR is ignored by hardware.
1783 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1784 context_set_address_width(context, iommu->msagaw);
1786 context_set_address_root(context, virt_to_phys(pgd));
1787 context_set_address_width(context, iommu->agaw);
1790 context_set_translation_type(context, translation);
1791 context_set_fault_enable(context);
1792 context_set_present(context);
1793 domain_flush_cache(domain, context, sizeof(*context));
1796 * It's a non-present to present mapping. If hardware doesn't cache
1797 * non-present entry we only need to flush the write-buffer. If the
1798 * _does_ cache non-present entries, then it does so in the special
1799 * domain #0, which we have to flush:
1801 if (cap_caching_mode(iommu->cap)) {
1802 iommu->flush.flush_context(iommu, 0,
1803 (((u16)bus) << 8) | devfn,
1804 DMA_CCMD_MASK_NOBIT,
1805 DMA_CCMD_DEVICE_INVL);
1806 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1808 iommu_flush_write_buffer(iommu);
1810 iommu_enable_dev_iotlb(info);
1811 spin_unlock_irqrestore(&iommu->lock, flags);
/* Account this IOMMU in the domain and refresh derived capabilities */
1813 spin_lock_irqsave(&domain->iommu_lock, flags);
1814 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1815 domain->iommu_count++;
1816 if (domain->iommu_count == 1)
1817 domain->nid = iommu->node;
1818 domain_update_iommu_cap(domain);
1820 spin_unlock_irqrestore(&domain->iommu_lock, flags);
/*
 * Map @pdev's context entry, then walk up through any conventional-PCI
 * bridges between the device and its upstream PCIe bridge, mapping each one
 * too (transactions behind such bridges carry the bridge's source-id).  For
 * a PCIe-to-PCI bridge the secondary bus with devfn 0 is mapped; for a
 * legacy PCI bridge, the bridge's own bus/devfn.
 */
1825 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1829 struct pci_dev *tmp, *parent;
1831 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1832 pdev->bus->number, pdev->devfn,
1837 /* dependent device mapping */
1838 tmp = pci_find_upstream_pcie_bridge(pdev);
1841 /* Secondary interface's bus number and devfn 0 */
1842 parent = pdev->bus->self;
1843 while (parent != tmp) {
1844 ret = domain_context_mapping_one(domain,
1845 pci_domain_nr(parent->bus),
1846 parent->bus->number,
1847 parent->devfn, translation);
1850 parent = parent->bus->self;
1852 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1853 return domain_context_mapping_one(domain,
1854 pci_domain_nr(tmp->subordinate),
1855 tmp->subordinate->number, 0,
1857 else /* this is a legacy PCI bridge */
1858 return domain_context_mapping_one(domain,
1859 pci_domain_nr(tmp->bus),
/*
 * Query whether context entries exist for @pdev and for every bridge on the
 * path to its upstream PCIe bridge.  Mirrors the walk performed by
 * domain_context_mapping().
 */
1865 static int domain_context_mapped(struct pci_dev *pdev)
1868 struct pci_dev *tmp, *parent;
1869 struct intel_iommu *iommu;
1871 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1876 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1879 /* dependent device mapping */
1880 tmp = pci_find_upstream_pcie_bridge(pdev);
1883 /* Secondary interface's bus number and devfn 0 */
1884 parent = pdev->bus->self;
1885 while (parent != tmp) {
1886 ret = device_context_mapped(iommu, parent->bus->number,
1890 parent = parent->bus->self;
1892 if (pci_is_pcie(tmp))
1893 return device_context_mapped(iommu, tmp->subordinate->number,
1896 return device_context_mapped(iommu, tmp->bus->number,
1900 /* Returns a number of VTD pages, but aligned to MM page size */
/* Only the sub-page offset of host_addr matters for the page count */
1901 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1904 host_addr &= ~PAGE_MASK;
1905 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1908 /* Return largest possible superpage level for a given mapping */
1909 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1910 unsigned long iov_pfn,
1911 unsigned long phy_pfn,
1912 unsigned long pages)
1914 int support, level = 1;
1915 unsigned long pfnmerge;
/* Hardware's supported superpage level, cached at domain_init() time */
1917 support = domain->iommu_superpage;
1919 /* To use a large page, the virtual *and* physical addresses
1920 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1921 of them will mean we have to use smaller pages. So just
1922 merge them and check both at once. */
1923 pfnmerge = iov_pfn | phy_pfn;
/* Climb one level per 9-bit stride while alignment and size allow it */
1925 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1926 pages >>= VTD_STRIDE_SHIFT;
1929 pfnmerge >>= VTD_STRIDE_SHIFT;
/*
 * Core mapping routine: install PTEs for @nr_pages starting at @iov_pfn,
 * sourced either from a scatterlist (@sg, for DMA-map) or a contiguous
 * physical range (@phys_pfn, for identity/RMRR maps) — exactly one of the
 * two is used.  Superpages are used where size and alignment allow; any
 * pre-existing small-page tables in a superpage's range are torn down
 * first.  PTEs are written with cmpxchg so an already-set entry is detected
 * and reported rather than silently overwritten.
 */
1936 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1937 struct scatterlist *sg, unsigned long phys_pfn,
1938 unsigned long nr_pages, int prot)
1940 struct dma_pte *first_pte = NULL, *pte = NULL;
1941 phys_addr_t uninitialized_var(pteval);
1942 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1943 unsigned long sg_res;
1944 unsigned int largepage_lvl = 0;
1945 unsigned long lvl_pages = 0;
/* The whole range must fit inside the domain's address width */
1947 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1949 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1952 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
/* Physical-range mode: make sg_res outlast the loop so sg is never read */
1957 sg_res = nr_pages + 1;
1958 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1961 while (nr_pages > 0) {
/* Advance to the next scatterlist entry and record its DMA address */
1965 sg_res = aligned_nrpages(sg->offset, sg->length);
1966 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1967 sg->dma_length = sg->length;
1968 pteval = page_to_phys(sg_page(sg)) | prot;
1969 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1973 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1975 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
1978 /* It is large page*/
1979 if (largepage_lvl > 1) {
1980 pteval |= DMA_PTE_LARGE_PAGE;
1981 /* Ensure that old small page tables are removed to make room
1982 for superpage, if they exist. */
1983 dma_pte_clear_range(domain, iov_pfn,
1984 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1985 dma_pte_free_pagetable(domain, iov_pfn,
1986 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1988 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1992 /* We don't need lock here, nobody else
1993 * touches the iova range
1995 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
/* Rate-limited diagnostics: a non-zero old value means a double mapping */
1997 static int dumps = 5;
1998 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1999 iov_pfn, tmp, (unsigned long long)pteval);
2002 debug_dma_dump_mappings(NULL);
2007 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2009 BUG_ON(nr_pages < lvl_pages);
2010 BUG_ON(sg_res < lvl_pages);
2012 nr_pages -= lvl_pages;
2013 iov_pfn += lvl_pages;
2014 phys_pfn += lvl_pages;
2015 pteval += lvl_pages * VTD_PAGE_SIZE;
2016 sg_res -= lvl_pages;
2018 /* If the next PTE would be the first in a new page, then we
2019 need to flush the cache on the entries we've just written.
2020 And then we'll need to recalculate 'pte', so clear it and
2021 let it get set again in the if (!pte) block above.
2023 If we're done (!nr_pages) we need to flush the cache too.
2025 Also if we've been setting superpages, we may need to
2026 recalculate 'pte' and switch back to smaller pages for the
2027 end of the mapping, if the trailing size is not enough to
2028 use another superpage (i.e. sg_res < lvl_pages). */
2030 if (!nr_pages || first_pte_in_page(pte) ||
2031 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2032 domain_flush_cache(domain, first_pte,
2033 (void *)pte - (void *)first_pte);
2037 if (!sg_res && nr_pages)
/* Scatterlist front-end for __domain_mapping() (phys_pfn unused). */
2043 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2044 struct scatterlist *sg, unsigned long nr_pages,
2047 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
/* Contiguous physical-range front-end for __domain_mapping() (no sg). */
2050 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2051 unsigned long phys_pfn, unsigned long nr_pages,
2054 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
/*
 * Clear the context entry for bus:devfn on @iommu and invalidate all
 * context-cache and IOTLB state (global invalidations).
 */
2057 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2062 clear_context_table(iommu, bus, devfn);
2063 iommu->flush.flush_context(iommu, 0, 0, 0,
2064 DMA_CCMD_GLOBAL_INVL);
2065 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
/*
 * Remove @info from both the per-domain and global device lists and clear
 * the device's archdata back-pointer.  Caller must hold device_domain_lock
 * (asserted here); the info itself is not freed.
 */
2068 static inline void unlink_domain_info(struct device_domain_info *info)
2070 assert_spin_locked(&device_domain_lock);
2071 list_del(&info->link);
2072 list_del(&info->global);
2074 info->dev->dev.archdata.iommu = NULL;
/*
 * Detach every device from @domain: unlink each device_domain_info,
 * disable its dev-IOTLB, tear down its context entry, and for VM domains
 * also detach dependent (bridge) devices and drop the IOMMU from the
 * domain's bitmap/count.  device_domain_lock is dropped around the
 * hardware operations and re-taken for each list iteration.
 */
2077 static void domain_remove_dev_info(struct dmar_domain *domain)
2079 struct device_domain_info *info;
2080 unsigned long flags, flags2;
2081 struct intel_iommu *iommu;
2083 spin_lock_irqsave(&device_domain_lock, flags);
2084 while (!list_empty(&domain->devices)) {
2085 info = list_entry(domain->devices.next,
2086 struct device_domain_info, link);
2087 unlink_domain_info(info);
/* Must drop the lock: the calls below may sleep or take other locks */
2088 spin_unlock_irqrestore(&device_domain_lock, flags);
2090 iommu_disable_dev_iotlb(info);
2091 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
2092 iommu_detach_dev(iommu, info->bus, info->devfn);
2094 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
2095 iommu_detach_dependent_devices(iommu, info->dev);
2096 /* clear this iommu in iommu_bmp, update iommu count
2099 spin_lock_irqsave(&domain->iommu_lock, flags2);
2100 if (test_and_clear_bit(iommu->seq_id,
2101 domain->iommu_bmp)) {
2102 domain->iommu_count--;
2103 domain_update_iommu_cap(domain);
2105 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2108 free_devinfo_mem(info);
2109 spin_lock_irqsave(&device_domain_lock, flags);
2111 spin_unlock_irqrestore(&device_domain_lock, flags);
2116 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
/*
 * Lockless lookup of the domain a device is already attached to, via the
 * archdata back-pointer set by dmar_insert_dev_info().
 */
2118 static struct dmar_domain *
2119 find_domain(struct pci_dev *pdev)
2121 struct device_domain_info *info;
2123 /* No lock here, assumes no domain exit in normal case */
2124 info = pdev->dev.archdata.iommu;
2126 return info->domain;
/*
 * Linear search of the global device list for segment:bus.devfn; returns
 * the attached domain if found.  Caller is expected to hold
 * device_domain_lock — confirm against call sites.
 */
2130 static inline struct dmar_domain *
2131 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2133 struct device_domain_info *info;
2135 list_for_each_entry(info, &device_domain_list, global)
2136 if (info->segment == segment && info->bus == bus &&
2137 info->devfn == devfn)
2138 return info->domain;
/*
 * Insert a device_domain_info binding segment:bus.devfn (and optionally
 * @dev) to *@domp.  Handles the race where another thread attached the
 * device first: the freshly built info is discarded, and if the competing
 * attachment used a different domain, ours is destroyed and *@domp is
 * presumably updated to the winner — confirm against full source.  A NULL
 * @dev registers a bridge path (P2P), which marks the domain as shared by
 * multiple devices.
 */
2143 static int dmar_insert_dev_info(int segment, int bus, int devfn,
2144 struct pci_dev *dev, struct dmar_domain **domp)
2146 struct dmar_domain *found, *domain = *domp;
2147 struct device_domain_info *info;
2148 unsigned long flags;
2150 info = alloc_devinfo_mem();
2154 info->segment = segment;
2156 info->devfn = devfn;
2158 info->domain = domain;
2160 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2162 spin_lock_irqsave(&device_domain_lock, flags);
2164 found = find_domain(dev);
2166 found = dmar_search_domain_by_dev_info(segment, bus, devfn);
/* Lost the race: someone else registered this device while we allocated */
2168 spin_unlock_irqrestore(&device_domain_lock, flags);
2169 free_devinfo_mem(info);
2170 if (found != domain) {
2171 domain_exit(domain);
2175 list_add(&info->link, &domain->devices);
2176 list_add(&info->global, &device_domain_list);
2178 dev->dev.archdata.iommu = info;
2179 spin_unlock_irqrestore(&device_domain_lock, flags);
2185 /* domain is initialized */
/*
 * Find or create the domain for @pdev with guest address width @gaw.
 * Fast path: the device already has a domain.  Otherwise, if the device
 * sits behind a PCIe-to-PCI bridge, reuse the domain registered for that
 * bridge; failing that, allocate a fresh domain on the device's DRHD unit,
 * initialize it, and register both the bridge path (when present) and the
 * device itself, re-checking for races via find_domain() at the end.
 */
2186 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
2188 struct dmar_domain *domain, *free = NULL;
2189 struct intel_iommu *iommu;
2190 struct dmar_drhd_unit *drhd;
2191 struct pci_dev *dev_tmp;
2192 unsigned long flags;
2193 int bus = 0, devfn = 0;
2196 domain = find_domain(pdev);
2200 segment = pci_domain_nr(pdev->bus);
2202 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
/* PCIe bridge: key on its secondary bus; legacy bridge: its own BDF */
2204 if (pci_is_pcie(dev_tmp)) {
2205 bus = dev_tmp->subordinate->number;
2208 bus = dev_tmp->bus->number;
2209 devfn = dev_tmp->devfn;
2211 spin_lock_irqsave(&device_domain_lock, flags);
2212 domain = dmar_search_domain_by_dev_info(segment, bus, devfn);
2213 spin_unlock_irqrestore(&device_domain_lock, flags);
2214 /* pcie-pci bridge already has a domain, uses it */
2219 drhd = dmar_find_matched_drhd_unit(pdev);
2221 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2225 iommu = drhd->iommu;
2227 /* Allocate and intialize new domain for the device */
2228 domain = alloc_domain(false);
2231 if (iommu_attach_domain(domain, iommu)) {
2232 free_domain_mem(domain);
2236 if (domain_init(domain, gaw))
2239 /* register pcie-to-pci device */
2241 if (dmar_insert_dev_info(segment, bus, devfn, NULL, &domain))
2248 if (dmar_insert_dev_info(segment, pdev->bus->number, pdev->devfn,
2249 pdev, &domain) == 0)
2254 /* recheck it here, maybe others set it */
2255 return find_domain(pdev);
/* Bitmask of device classes to place in the static 1:1 (identity) domain:
 * ALL devices, graphics only, and/or the Azalia HD-audio quirk device. */
2258 static int iommu_identity_mapping;
2259 #define IDENTMAP_ALL 1
2260 #define IDENTMAP_GFX 2
2261 #define IDENTMAP_AZALIA 4
/*
 * Create a 1:1 mapping for the physical range [@start, @end] in @domain:
 * reserve the corresponding IOVA range, clear any stale PTEs (the range
 * may overlap previously mapped memory), then map each pfn onto itself
 * with read/write permission.
 */
2263 static int iommu_domain_identity_map(struct dmar_domain *domain,
2264 unsigned long long start,
2265 unsigned long long end)
2267 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2268 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2270 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2271 dma_to_mm_pfn(last_vpfn))) {
2272 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2276 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2277 start, end, domain->id);
2279 * RMRR range might have overlap with physical memory range,
2282 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2284 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2285 last_vpfn - first_vpfn + 1,
2286 DMA_PTE_READ|DMA_PTE_WRITE);
/*
 * Set up an identity mapping of [@start, @end] for @pdev (used for RMRR and
 * ISA ranges).  Skipped for hardware pass-through devices in the si_domain.
 * Sanity-checks broken BIOS-provided RMRRs (end before start, or beyond the
 * domain's address width) with a WARN.  On success also programs the
 * device's context entry; on failure the domain is destroyed.
 */
2289 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2290 unsigned long long start,
2291 unsigned long long end)
2293 struct dmar_domain *domain;
2296 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2300 /* For _hardware_ passthrough, don't bother. But for software
2301 passthrough, we do it anyway -- it may indicate a memory
2302 range which is reserved in E820, so which didn't get set
2303 up to start with in si_domain */
2304 if (domain == si_domain && hw_pass_through) {
2305 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2306 pci_name(pdev), start, end);
2311 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2312 pci_name(pdev), start, end);
2315 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2316 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2317 dmi_get_system_info(DMI_BIOS_VENDOR),
2318 dmi_get_system_info(DMI_BIOS_VERSION),
2319 dmi_get_system_info(DMI_PRODUCT_VERSION));
/* Range must fit inside the domain's programmed address width */
2324 if (end >> agaw_to_width(domain->agaw)) {
2325 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2326 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2327 agaw_to_width(domain->agaw),
2328 dmi_get_system_info(DMI_BIOS_VENDOR),
2329 dmi_get_system_info(DMI_BIOS_VERSION),
2330 dmi_get_system_info(DMI_PRODUCT_VERSION));
2335 ret = iommu_domain_identity_map(domain, start, end);
2339 /* context entry init */
2340 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2347 domain_exit(domain);
/*
 * Identity-map an RMRR region for @pdev, unless the device is marked with
 * the dummy (bypass) domain info.
 */
2351 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2352 struct pci_dev *pdev)
2354 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2356 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2360 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Floppy workaround: legacy floppy DMA goes through the first ISA bridge,
 * so identity-map the low 16MiB for it.  Failure is non-fatal — only the
 * floppy stops working.
 */
2361 static inline void iommu_prepare_isa(void)
2363 struct pci_dev *pdev;
2366 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2370 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2371 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2374 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2375 "floppy might not work\n");
/* No-op stub when the floppy workaround is compiled out */
2379 static inline void iommu_prepare_isa(void)
2383 #endif /* !CONFIG_INTEL_IOMMU_FLPY_WA */
/* Forward declaration: defined later in this file, used by si_domain_init(). */
2385 static int md_domain_init(struct dmar_domain *domain, int guest_width);
/*
 * Create the static-identity (si) domain, attach it to every active IOMMU,
 * and — unless @hw pass-through is in effect, presumably gated just above
 * the node loop (confirm against full source) — identity-map every online
 * node's physical memory ranges into it.
 */
2387 static int __init si_domain_init(int hw)
2389 struct dmar_drhd_unit *drhd;
2390 struct intel_iommu *iommu;
2393 si_domain = alloc_domain(false);
2397 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2399 for_each_active_iommu(iommu, drhd) {
2400 ret = iommu_attach_domain(si_domain, iommu);
2402 domain_exit(si_domain);
2407 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2408 domain_exit(si_domain);
2412 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2418 for_each_online_node(nid) {
2419 unsigned long start_pfn, end_pfn;
2422 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2423 ret = iommu_domain_identity_map(si_domain,
2424 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
/*
 * Return non-zero if @pdev is currently attached to the static-identity
 * domain.  Cheap early-out when identity mapping is globally disabled.
 */
2433 static int identity_mapping(struct pci_dev *pdev)
2435 struct device_domain_info *info;
2437 if (likely(!iommu_identity_mapping))
2440 info = pdev->dev.archdata.iommu;
2441 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2442 return (info->domain == si_domain);
/*
 * Attach @pdev to @domain: allocate and link a device_domain_info, then
 * program the device's context entry with @translation.  On context-mapping
 * failure the info is unlinked and freed, leaving the device unattached.
 */
2447 static int domain_add_dev_info(struct dmar_domain *domain,
2448 struct pci_dev *pdev,
2451 struct device_domain_info *info;
2452 unsigned long flags;
2455 info = alloc_devinfo_mem();
2459 info->segment = pci_domain_nr(pdev->bus);
2460 info->bus = pdev->bus->number;
2461 info->devfn = pdev->devfn;
2463 info->domain = domain;
2465 spin_lock_irqsave(&device_domain_lock, flags);
2466 list_add(&info->link, &domain->devices);
2467 list_add(&info->global, &device_domain_list);
2468 pdev->dev.archdata.iommu = info;
2469 spin_unlock_irqrestore(&device_domain_lock, flags);
2471 ret = domain_context_mapping(domain, pdev, translation);
/* Roll back the list insertion if the hardware mapping failed */
2473 spin_lock_irqsave(&device_domain_lock, flags);
2474 unlink_domain_info(info);
2475 spin_unlock_irqrestore(&device_domain_lock, flags);
2476 free_devinfo_mem(info);
/*
 * Return true when any RMRR unit's device scope includes @dev, i.e. the
 * BIOS declared a reserved memory region for it.
 */
2483 static bool device_has_rmrr(struct pci_dev *dev)
2485 struct dmar_rmrr_unit *rmrr;
2490 for_each_rmrr_units(rmrr) {
2492 * Return TRUE if this RMRR contains the device that
2495 for_each_active_dev_scope(rmrr->devices,
2496 rmrr->devices_cnt, i, tmp)
2497 if (tmp == &dev->dev) {
/*
 * Policy decision: should @pdev go into the static 1:1 domain?  Devices
 * with RMRRs (except USB) are excluded; Azalia/GFX quirks and IDENTMAP_ALL
 * are honored; devices behind shared-source-id PCI bridges are excluded;
 * and at @startup the device's DMA mask must cover all of memory (a 32-bit
 * device cannot live in a 1:1 map of >4GiB RAM).
 */
2506 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2510 * We want to prevent any device associated with an RMRR from
2511 * getting placed into the SI Domain. This is done because
2512 * problems exist when devices are moved in and out of domains
2513 * and their respective RMRR info is lost. We exempt USB devices
2514 * from this process due to their usage of RMRRs that are known
2515 * to not be needed after BIOS hand-off to OS.
2517 if (device_has_rmrr(pdev) &&
2518 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2521 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2524 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2527 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2531 * We want to start off with all devices in the 1:1 domain, and
2532 * take them out later if we find they can't access all of memory.
2534 * However, we can't do this for PCI devices behind bridges,
2535 * because all PCI devices behind the same bridge will end up
2536 * with the same source-id on their transactions.
2538 * Practically speaking, we can't change things around for these
2539 * devices at run-time, because we can't be sure there'll be no
2540 * DMA transactions in flight for any of their siblings.
2542 * So PCI devices (unless they're on the root bus) as well as
2543 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2544 * the 1:1 domain, just in _case_ one of their siblings turns out
2545 * not to be able to map all of memory.
2547 if (!pci_is_pcie(pdev)) {
2548 if (!pci_is_root_bus(pdev->bus))
2550 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2552 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2556 * At boot time, we don't yet know if devices will be 64-bit capable.
2557 * Assume that they will -- if they turn out not to be, then we can
2558 * take them out of the 1:1 domain later.
2562 * If the device's dma_mask is less than the system's memory
2563 * size then this is not a candidate for identity mapping.
2565 u64 dma_mask = pdev->dma_mask;
/* Use the more restrictive of the streaming and coherent masks */
2567 if (pdev->dev.coherent_dma_mask &&
2568 pdev->dev.coherent_dma_mask < dma_mask)
2569 dma_mask = pdev->dev.coherent_dma_mask;
2571 return dma_mask >= dma_get_required_mask(&pdev->dev);
/*
 * Boot-time setup of static identity mapping: initialize the si_domain,
 * then attach every PCI device that passes the identity-map policy, using
 * hardware pass-through context entries when @hw is set, multi-level page
 * tables otherwise.
 */
2577 static int __init iommu_prepare_static_identity_mapping(int hw)
2579 struct pci_dev *pdev = NULL;
2582 ret = si_domain_init(hw);
2586 for_each_pci_dev(pdev) {
2587 if (iommu_should_identity_map(pdev, 1)) {
2588 ret = domain_add_dev_info(si_domain, pdev,
2589 hw ? CONTEXT_TT_PASS_THROUGH :
2590 CONTEXT_TT_MULTI_LEVEL);
2592 /* device not associated with an iommu */
2597 pr_info("IOMMU: %s identity mapping for device %s\n",
2598 hw ? "hardware" : "software", pci_name(pdev));
/*
 * One-time boot setup of all DMAR (VT-d) hardware units:
 *  1. count IOMMUs and allocate the global g_iommus / deferred_flush arrays;
 *  2. per-IOMMU: init domains, allocate a root entry, detect pass-through
 *     capability (hw_pass_through is cleared if any unit lacks it);
 *  3. reset to a sane hardware state (clear faults, disable stale QI);
 *  4. pick queued vs. register-based invalidation callbacks per unit;
 *  5. build identity mappings (static identity domain, RMRRs, ISA);
 *  6. program root entries, globally invalidate, enable translation.
 * NOTE(review): listing is sampled — error paths, labels and some
 * declarations are elided below.
 */
2605 static int __init init_dmars(void)
2607 struct dmar_drhd_unit *drhd;
2608 struct dmar_rmrr_unit *rmrr;
2610 struct intel_iommu *iommu;
2616 * initialize and program root entry to not present
2619 for_each_drhd_unit(drhd) {
2621 * lock not needed as this is only incremented in the single
2622 * threaded kernel __init code path all other access are read
2625 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2629 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2630 IOMMU_UNITS_SUPPORTED);
/* Global array indexed by iommu->seq_id, sized from the DRHD count above. */
2633 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2636 printk(KERN_ERR "Allocating global iommu array failed\n");
/* Per-IOMMU tables used by the lazy-unmap path (see flush_unmaps()). */
2641 deferred_flush = kzalloc(g_num_of_iommus *
2642 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2643 if (!deferred_flush) {
2648 for_each_active_iommu(iommu, drhd) {
2649 g_iommus[iommu->seq_id] = iommu;
2651 ret = iommu_init_domains(iommu);
2657 * we could share the same root & context tables
2658 * among all IOMMU's. Need to Split it later.
2660 ret = iommu_alloc_root_entry(iommu);
2662 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
/* Pass-through only usable if every unit advertises it in ECAP. */
2665 if (!ecap_pass_through(iommu->ecap))
2666 hw_pass_through = 0;
2670 * Start from the sane iommu hardware state.
2672 for_each_active_iommu(iommu, drhd) {
2674 * If the queued invalidation is already initialized by us
2675 * (for example, while enabling interrupt-remapping) then
2676 * we got the things already rolling from a sane state.
2682 * Clear any previous faults.
2684 dmar_fault(-1, iommu);
2686 * Disable queued invalidation if supported and already enabled
2687 * before OS handover.
2689 dmar_disable_qi(iommu);
/* Choose invalidation backend: queued (QI) if it enables, else register. */
2692 for_each_active_iommu(iommu, drhd) {
2693 if (dmar_enable_qi(iommu)) {
2695 * Queued Invalidate not enabled, use Register Based
2698 iommu->flush.flush_context = __iommu_flush_context;
2699 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2700 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2703 (unsigned long long)drhd->reg_base_addr);
2705 iommu->flush.flush_context = qi_flush_context;
2706 iommu->flush.flush_iotlb = qi_flush_iotlb;
2707 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2710 (unsigned long long)drhd->reg_base_addr);
2714 if (iommu_pass_through)
2715 iommu_identity_mapping |= IDENTMAP_ALL;
2717 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2718 iommu_identity_mapping |= IDENTMAP_GFX;
2721 check_tylersburg_isoch();
2724 * If pass through is not set or not enabled, setup context entries for
2725 * identity mappings for rmrr, gfx, and isa and may fall back to static
2726 * identity mapping if iommu_identity_mapping is set.
2728 if (iommu_identity_mapping) {
2729 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2731 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2737 * for each dev attached to rmrr
2739 * locate drhd for dev, alloc domain for dev
2740 * allocate free domain
2741 * allocate page table entries for rmrr
2742 * if context not allocated for bus
2743 * allocate and init context
2744 * set present in root table for this bus
2745 * init context with domain, translation etc
2749 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2750 for_each_rmrr_units(rmrr) {
2751 /* some BIOS lists non-exist devices in DMAR table. */
2752 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2754 if (!dev_is_pci(dev))
2756 ret = iommu_prepare_rmrr_dev(rmrr, to_pci_dev(dev));
2759 "IOMMU: mapping reserved region failed\n");
2763 iommu_prepare_isa();
2768 * global invalidate context cache
2769 * global invalidate iotlb
2770 * enable translation
2772 for_each_iommu(iommu, drhd) {
2773 if (drhd->ignored) {
2775 * we always have to disable PMRs or DMA may fail on
2779 iommu_disable_protect_mem_regions(iommu);
2783 iommu_flush_write_buffer(iommu);
2785 ret = dmar_set_interrupt(iommu);
2789 iommu_set_root_entry(iommu);
2791 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2792 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2794 ret = iommu_enable_translation(iommu);
2798 iommu_disable_protect_mem_regions(iommu);
/* Error unwind: release per-IOMMU state and the deferred-flush tables. */
2804 for_each_active_iommu(iommu, drhd)
2805 free_dmar_iommu(iommu);
2806 kfree(deferred_flush);
2813 /* This takes a number of _MM_ pages, not VTD pages */
/*
 * Allocate an IO virtual address range of @nrpages (CPU-sized) pages for
 * @dev within @domain, respecting @dma_mask clamped to the domain's
 * guest address width.  Unless forcedac is set, a first attempt is made
 * below 4GiB to keep addresses friendly to 32-bit-challenged hardware;
 * on failure it retries over the full mask.  Returns NULL on exhaustion.
 */
2814 static struct iova *intel_alloc_iova(struct device *dev,
2815 struct dmar_domain *domain,
2816 unsigned long nrpages, uint64_t dma_mask)
2818 struct pci_dev *pdev = to_pci_dev(dev);
2819 struct iova *iova = NULL;
2821 /* Restrict dma_mask to the width that the iommu can handle */
2822 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2824 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2826 * First try to allocate an io virtual address in
2827 * DMA_BIT_MASK(32) and if that fails then try allocating
2830 iova = alloc_iova(&domain->iovad, nrpages,
2831 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2835 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2836 if (unlikely(!iova)) {
2837 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2838 nrpages, pci_name(pdev));
/*
 * Slow path of get_valid_domain_for_dev(): find or allocate a DMA-API
 * domain for @pdev at the default address width, then make sure its
 * context entry is programmed (multi-level translation).  Error paths
 * are elided in this listing.
 */
2845 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2847 struct dmar_domain *domain;
2850 domain = get_domain_for_dev(pdev,
2851 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2854 "Allocating domain for %s failed", pci_name(pdev));
2858 /* make sure context mapping is ok */
2859 if (unlikely(!domain_context_mapped(pdev))) {
2860 ret = domain_context_mapping(domain, pdev,
2861 CONTEXT_TT_MULTI_LEVEL);
2864 "Domain context map for %s failed",
/*
 * Fast path: return the cached domain from dev->dev.archdata.iommu when
 * present, otherwise fall back to the allocating slow path above.
 */
2873 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2875 struct device_domain_info *info;
2877 /* No lock here, assumes no domain exit in normal case */
2878 info = dev->dev.archdata.iommu;
2880 return info->domain;
2882 return __get_valid_domain_for_dev(dev);
/* True if @dev was marked as bypassing the IOMMU entirely (quirked). */
2885 static int iommu_dummy(struct device *dev)
2887 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2890 /* Check if the pdev needs to go through non-identity map and unmap process.*/
/*
 * Decide whether @dev can skip IOMMU translation for DMA-API calls.
 * Also migrates the device between si_domain (identity) and a private
 * domain when its DMA-mask eligibility (iommu_should_identity_map with
 * startup=0) no longer matches its current mapping state.  Return values
 * are elided in this listing; callers treat non-zero as "no mapping".
 */
2891 static int iommu_no_mapping(struct device *dev)
2893 struct pci_dev *pdev;
2896 if (unlikely(!dev_is_pci(dev)))
2899 if (iommu_dummy(dev))
2902 if (!iommu_identity_mapping)
2905 pdev = to_pci_dev(dev);
2906 found = identity_mapping(pdev);
2908 if (iommu_should_identity_map(pdev, 0))
2912 * 32 bit DMA is removed from si_domain and fall back
2913 * to non-identity mapping.
2915 domain_remove_one_dev_info(si_domain, pdev);
2916 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2922 * In case of a detached 64 bit DMA device from vm, the device
2923 * is put into si_domain for identity mapping.
2925 if (iommu_should_identity_map(pdev, 0)) {
2927 ret = domain_add_dev_info(si_domain, pdev,
2929 CONTEXT_TT_PASS_THROUGH :
2930 CONTEXT_TT_MULTI_LEVEL);
2932 printk(KERN_INFO "64bit %s uses identity mapping\n",
/*
 * Core single-buffer DMA map: allocate an IOVA range covering
 * [paddr, paddr+size), install PTEs with read/write permission derived
 * from @dir, and return the bus address (IOVA + page offset).  The
 * identity-mapped fast path (iommu_no_mapping) bypasses all of this.
 * Flushing: PSI IOTLB flush only in caching mode (non-present -> present
 * needs no flush on real hardware), else just a write-buffer flush.
 */
2942 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2943 size_t size, int dir, u64 dma_mask)
2945 struct pci_dev *pdev = to_pci_dev(hwdev);
2946 struct dmar_domain *domain;
2947 phys_addr_t start_paddr;
2951 struct intel_iommu *iommu;
2952 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2954 BUG_ON(dir == DMA_NONE);
2956 if (iommu_no_mapping(hwdev))
2959 domain = get_valid_domain_for_dev(pdev);
2963 iommu = domain_get_iommu(domain);
/* After this point "size" is a page count, not bytes. */
2964 size = aligned_nrpages(paddr, size);
2966 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2971 * Check if DMAR supports zero-length reads on write only
2974 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2975 !cap_zlr(iommu->cap))
2976 prot |= DMA_PTE_READ;
2977 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2978 prot |= DMA_PTE_WRITE;
2980 * paddr - (paddr + size) might be partial page, we should map the whole
2981 * page. Note: if two part of one page are separately mapped, we
2982 * might have two guest_addr mapping to the same host paddr, but this
2983 * is not a big problem
2985 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2986 mm_to_dma_pfn(paddr_pfn), size, prot);
2990 /* it's a non-present to present mapping. Only flush if caching mode */
2991 if (cap_caching_mode(iommu->cap))
2992 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
2994 iommu_flush_write_buffer(iommu);
2996 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2997 start_paddr += paddr & ~PAGE_MASK;
/* Error path: release the IOVA so it can be reused. */
3002 __free_iova(&domain->iovad, iova);
3003 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
3004 pci_name(pdev), size, (unsigned long long)paddr, dir);
/* dma_map_ops.map_page: thin wrapper translating page+offset to a
 * physical address and delegating to __intel_map_single() with the
 * device's streaming DMA mask. */
3008 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3009 unsigned long offset, size_t size,
3010 enum dma_data_direction dir,
3011 struct dma_attrs *attrs)
3013 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3014 dir, to_pci_dev(dev)->dma_mask);
/*
 * Drain every per-IOMMU deferred-unmap table: perform the IOTLB and
 * dev-IOTLB invalidations that add_unmap() postponed, then free the
 * IOVAs and any queued page-table freelists.  Caller must hold
 * async_umap_flush_lock (see flush_unmaps_timeout()).
 */
3017 static void flush_unmaps(void)
3023 /* just flush them all */
3024 for (i = 0; i < g_num_of_iommus; i++) {
3025 struct intel_iommu *iommu = g_iommus[i];
3029 if (!deferred_flush[i].next)
3032 /* In caching mode, global flushes turn emulation expensive */
3033 if (!cap_caching_mode(iommu->cap))
3034 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3035 DMA_TLB_GLOBAL_FLUSH);
3036 for (j = 0; j < deferred_flush[i].next; j++) {
3038 struct iova *iova = deferred_flush[i].iova[j];
3039 struct dmar_domain *domain = deferred_flush[i].domain[j];
3041 /* On real hardware multiple invalidations are expensive */
3042 if (cap_caching_mode(iommu->cap))
3043 iommu_flush_iotlb_psi(iommu, domain->id,
3044 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
3045 !deferred_flush[i].freelist[j], 0);
3047 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
3048 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3049 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3051 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3052 if (deferred_flush[i].freelist[j])
3053 dma_free_pagelist(deferred_flush[i].freelist[j]);
3055 deferred_flush[i].next = 0;
/* Timer callback (unmap_timer): drain deferred unmaps under the lock. */
3061 static void flush_unmaps_timeout(unsigned long data)
3063 unsigned long flags;
3065 spin_lock_irqsave(&async_umap_flush_lock, flags);
3067 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * Queue (dom, iova, freelist) into the owning IOMMU's deferred-flush
 * table instead of invalidating immediately; a full drain happens when
 * HIGH_WATER_MARK entries accumulate (elided here) or when the 10 ms
 * unmap_timer fires.  This batches expensive IOTLB invalidations.
 */
3070 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3072 unsigned long flags;
3074 struct intel_iommu *iommu;
3076 spin_lock_irqsave(&async_umap_flush_lock, flags);
3077 if (list_size == HIGH_WATER_MARK)
3080 iommu = domain_get_iommu(dom);
3081 iommu_id = iommu->seq_id;
3083 next = deferred_flush[iommu_id].next;
3084 deferred_flush[iommu_id].domain[next] = dom;
3085 deferred_flush[iommu_id].iova[next] = iova;
3086 deferred_flush[iommu_id].freelist[next] = freelist;
3087 deferred_flush[iommu_id].next++;
3090 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3094 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * dma_map_ops.unmap_page: tear down the PTEs backing @dev_addr.  In
 * strict mode the IOTLB is flushed synchronously and the IOVA freed at
 * once; otherwise the release is deferred via add_unmap() to amortise
 * invalidation cost.  Identity-mapped devices return early.
 */
3097 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3098 size_t size, enum dma_data_direction dir,
3099 struct dma_attrs *attrs)
3101 struct pci_dev *pdev = to_pci_dev(dev);
3102 struct dmar_domain *domain;
3103 unsigned long start_pfn, last_pfn;
3105 struct intel_iommu *iommu;
3106 struct page *freelist;
3108 if (iommu_no_mapping(dev))
3111 domain = find_domain(pdev);
3114 iommu = domain_get_iommu(domain);
3116 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3117 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3118 (unsigned long long)dev_addr))
3121 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3122 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3124 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3125 pci_name(pdev), start_pfn, last_pfn);
3127 freelist = domain_unmap(domain, start_pfn, last_pfn);
3129 if (intel_iommu_strict) {
3130 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3131 last_pfn - start_pfn + 1, !freelist, 0);
3133 __free_iova(&domain->iovad, iova);
3134 dma_free_pagelist(freelist);
3136 add_unmap(domain, iova, freelist);
3138 * queue up the release of the unmap to save the 1/6th of the
3139 * cpu used up by the iotlb flush operation...
/*
 * dma_map_ops.alloc: get zeroed pages and map them bidirectionally via
 * __intel_map_single() under the coherent mask.  When the device IS
 * translated, the GFP DMA-zone restrictions are dropped (the IOMMU can
 * remap anywhere); identity-mapped devices may still need GFP_DMA*.
 * On mapping failure the pages are freed (return path elided here).
 */
3144 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3145 dma_addr_t *dma_handle, gfp_t flags,
3146 struct dma_attrs *attrs)
3151 size = PAGE_ALIGN(size);
3152 order = get_order(size);
3154 if (!iommu_no_mapping(hwdev))
3155 flags &= ~(GFP_DMA | GFP_DMA32);
3156 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3157 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3163 vaddr = (void *)__get_free_pages(flags, order);
3166 memset(vaddr, 0, size);
3168 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3170 hwdev->coherent_dma_mask);
3173 free_pages((unsigned long)vaddr, order);
/* dma_map_ops.free: unmap the coherent buffer then release its pages. */
3177 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3178 dma_addr_t dma_handle, struct dma_attrs *attrs)
3182 size = PAGE_ALIGN(size);
3183 order = get_order(size);
3185 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3186 free_pages((unsigned long)vaddr, order);
/*
 * dma_map_ops.unmap_sg: mirror of intel_unmap_page() for scatterlists.
 * The whole list was mapped as one contiguous IOVA range, so the range
 * is recovered from sglist[0].dma_address alone.  Strict mode flushes
 * and frees synchronously; otherwise release is deferred via add_unmap().
 */
3189 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3190 int nelems, enum dma_data_direction dir,
3191 struct dma_attrs *attrs)
3193 struct pci_dev *pdev = to_pci_dev(hwdev);
3194 struct dmar_domain *domain;
3195 unsigned long start_pfn, last_pfn;
3197 struct intel_iommu *iommu;
3198 struct page *freelist;
3200 if (iommu_no_mapping(hwdev))
3203 domain = find_domain(pdev);
3206 iommu = domain_get_iommu(domain);
3208 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3209 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3210 (unsigned long long)sglist[0].dma_address))
3213 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3214 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3216 freelist = domain_unmap(domain, start_pfn, last_pfn);
3218 if (intel_iommu_strict) {
3219 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3220 last_pfn - start_pfn + 1, !freelist, 0);
3222 __free_iova(&domain->iovad, iova);
3223 dma_free_pagelist(freelist);
3225 add_unmap(domain, iova, freelist);
3227 * queue up the release of the unmap to save the 1/6th of the
3228 * cpu used up by the iotlb flush operation...
/* Identity-map fallback for map_sg: no translation, so each entry's DMA
 * address is simply its physical address (page + offset). */
3233 static int intel_nontranslate_map_sg(struct device *hddev,
3234 struct scatterlist *sglist, int nelems, int dir)
3237 struct scatterlist *sg;
3239 for_each_sg(sglist, sg, nelems, i) {
3240 BUG_ON(!sg_page(sg));
3241 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3242 sg->dma_length = sg->length;
/*
 * dma_map_ops.map_sg: map the whole scatterlist into ONE contiguous
 * IOVA range (total page count summed first), then install the PTEs
 * with domain_sg_mapping().  On PTE failure the partially built range
 * and its page tables are torn down and the IOVA freed.  Flush policy
 * matches __intel_map_single(): PSI only in caching mode.
 */
3247 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3248 enum dma_data_direction dir, struct dma_attrs *attrs)
3251 struct pci_dev *pdev = to_pci_dev(hwdev);
3252 struct dmar_domain *domain;
3255 struct iova *iova = NULL;
3257 struct scatterlist *sg;
3258 unsigned long start_vpfn;
3259 struct intel_iommu *iommu;
3261 BUG_ON(dir == DMA_NONE);
3262 if (iommu_no_mapping(hwdev))
3263 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3265 domain = get_valid_domain_for_dev(pdev);
3269 iommu = domain_get_iommu(domain);
3271 for_each_sg(sglist, sg, nelems, i)
3272 size += aligned_nrpages(sg->offset, sg->length);
3274 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3277 sglist->dma_length = 0;
3282 * Check if DMAR supports zero-length reads on write only
3285 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3286 !cap_zlr(iommu->cap))
3287 prot |= DMA_PTE_READ;
3288 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3289 prot |= DMA_PTE_WRITE;
3291 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3293 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3294 if (unlikely(ret)) {
3295 /* clear the page */
3296 dma_pte_clear_range(domain, start_vpfn,
3297 start_vpfn + size - 1);
3298 /* free page tables */
3299 dma_pte_free_pagetable(domain, start_vpfn,
3300 start_vpfn + size - 1);
3302 __free_iova(&domain->iovad, iova);
3306 /* it's a non-present to present mapping. Only flush if caching mode */
3307 if (cap_caching_mode(iommu->cap))
3308 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3310 iommu_flush_write_buffer(iommu);
/* dma_map_ops.mapping_error hook (body elided in this listing). */
3315 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
/* The dma_map_ops table installed as the global dma_ops in
 * intel_iommu_init() once translation is enabled. */
3320 struct dma_map_ops intel_dma_ops = {
3321 .alloc = intel_alloc_coherent,
3322 .free = intel_free_coherent,
3323 .map_sg = intel_map_sg,
3324 .unmap_sg = intel_unmap_sg,
3325 .map_page = intel_map_page,
3326 .unmap_page = intel_unmap_page,
3327 .mapping_error = intel_mapping_error,
/* Create the slab cache backing struct dmar_domain allocations. */
3330 static inline int iommu_domain_cache_init(void)
3334 iommu_domain_cache = kmem_cache_create("iommu_domain",
3335 sizeof(struct dmar_domain),
3340 if (!iommu_domain_cache) {
3341 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
/* Create the slab cache backing struct device_domain_info allocations. */
3348 static inline int iommu_devinfo_cache_init(void)
3352 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3353 sizeof(struct device_domain_info),
3357 if (!iommu_devinfo_cache) {
3358 printk(KERN_ERR "Couldn't create devinfo cache\n");
/* Create the slab cache backing struct iova allocations. */
3365 static inline int iommu_iova_cache_init(void)
3369 iommu_iova_cache = kmem_cache_create("iommu_iova",
3370 sizeof(struct iova),
3374 if (!iommu_iova_cache) {
3375 printk(KERN_ERR "Couldn't create iova cache\n");
/* Create all three slab caches; on failure unwind the ones already
 * created (goto-cleanup labels are elided in this listing). */
3382 static int __init iommu_init_mempool(void)
3385 ret = iommu_iova_cache_init();
3389 ret = iommu_domain_cache_init();
3393 ret = iommu_devinfo_cache_init();
3397 kmem_cache_destroy(iommu_domain_cache);
3399 kmem_cache_destroy(iommu_iova_cache);
/* Destroy the three slab caches created by iommu_init_mempool(). */
3404 static void __init iommu_exit_mempool(void)
3406 kmem_cache_destroy(iommu_devinfo_cache);
3407 kmem_cache_destroy(iommu_domain_cache);
3408 kmem_cache_destroy(iommu_iova_cache);
/*
 * PCI fixup for the SNB IOAT (QuickData) device: its IOMMU lives at a
 * fixed 0xa000 offset from the VTBAR read from config space.  If the
 * BIOS DMAR table assigns it to a different IOMMU, taint the kernel and
 * mark the device to bypass translation entirely.
 */
3412 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3414 struct dmar_drhd_unit *drhd;
3418 /* We know that this device on this chipset has its own IOMMU.
3419 * If we find it under a different IOMMU, then the BIOS is lying
3420 * to us. Hope that the IOMMU for this device is actually
3421 * disabled, and it needs no translation...
3423 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3425 /* "can't" happen */
3426 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3429 vtbar &= 0xffff0000;
3431 /* we know that the this iommu should be at offset 0xa000 from vtbar */
3432 drhd = dmar_find_matched_drhd_unit(pdev);
3433 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3434 TAINT_FIRMWARE_WORKAROUND,
3435 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3436 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3438 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
/*
 * Scan DRHD units before enabling translation: units whose device scope
 * is empty are ignored, and units that cover ONLY graphics devices are
 * either bypassed (their devices marked DUMMY) or flagged via
 * intel_iommu_gfx_mapped, depending on dmar_map_gfx (elided condition).
 */
3440 static void __init init_no_remapping_devices(void)
3442 struct dmar_drhd_unit *drhd;
3446 for_each_drhd_unit(drhd) {
3447 if (!drhd->include_all) {
3448 for_each_active_dev_scope(drhd->devices,
3449 drhd->devices_cnt, i, dev)
3451 /* ignore DMAR unit if no devices exist */
3452 if (i == drhd->devices_cnt)
3457 for_each_active_drhd_unit(drhd) {
3458 if (drhd->include_all)
3461 for_each_active_dev_scope(drhd->devices,
3462 drhd->devices_cnt, i, dev)
3463 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3465 if (i < drhd->devices_cnt)
3468 /* This IOMMU has *only* gfx devices. Either bypass it or
3469 set the gfx_mapped flag, as appropriate */
3471 intel_iommu_gfx_mapped = 1;
3474 for_each_active_dev_scope(drhd->devices,
3475 drhd->devices_cnt, i, dev)
3476 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3481 #ifdef CONFIG_SUSPEND
/*
 * Resume-path hardware re-init (CONFIG_SUSPEND): re-enable queued
 * invalidation where it was in use, then reprogram root entries,
 * globally invalidate context cache + IOTLB and re-enable translation
 * on every non-ignored unit; ignored units only get PMRs disabled.
 */
3482 static int init_iommu_hw(void)
3484 struct dmar_drhd_unit *drhd;
3485 struct intel_iommu *iommu = NULL;
3487 for_each_active_iommu(iommu, drhd)
3489 dmar_reenable_qi(iommu);
3491 for_each_iommu(iommu, drhd) {
3492 if (drhd->ignored) {
3494 * we always have to disable PMRs or DMA may fail on
3498 iommu_disable_protect_mem_regions(iommu);
3502 iommu_flush_write_buffer(iommu);
3504 iommu_set_root_entry(iommu);
3506 iommu->flush.flush_context(iommu, 0, 0, 0,
3507 DMA_CCMD_GLOBAL_INVL);
3508 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3509 DMA_TLB_GLOBAL_FLUSH);
3510 if (iommu_enable_translation(iommu))
3512 iommu_disable_protect_mem_regions(iommu);
/* Globally invalidate context cache and IOTLB on every active IOMMU
 * (used before suspend so no stale translations survive). */
3518 static void iommu_flush_all(void)
3520 struct dmar_drhd_unit *drhd;
3521 struct intel_iommu *iommu;
3523 for_each_active_iommu(iommu, drhd) {
3524 iommu->flush.flush_context(iommu, 0, 0, 0,
3525 DMA_CCMD_GLOBAL_INVL);
3526 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3527 DMA_TLB_GLOBAL_FLUSH);
/*
 * syscore suspend hook: allocate per-IOMMU save areas, disable
 * translation, and snapshot the four fault-event registers (FECTL,
 * FEDATA, FEADDR, FEUADDR) under register_lock so iommu_resume() can
 * restore them.  The trailing loop is the allocation-failure unwind.
 */
3531 static int iommu_suspend(void)
3533 struct dmar_drhd_unit *drhd;
3534 struct intel_iommu *iommu = NULL;
3537 for_each_active_iommu(iommu, drhd) {
3538 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3540 if (!iommu->iommu_state)
3546 for_each_active_iommu(iommu, drhd) {
3547 iommu_disable_translation(iommu);
3549 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3551 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3552 readl(iommu->reg + DMAR_FECTL_REG);
3553 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3554 readl(iommu->reg + DMAR_FEDATA_REG);
3555 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3556 readl(iommu->reg + DMAR_FEADDR_REG);
3557 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3558 readl(iommu->reg + DMAR_FEUADDR_REG);
3560 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* Unwind on allocation failure: free whatever state was allocated. */
3565 for_each_active_iommu(iommu, drhd)
3566 kfree(iommu->iommu_state);
/*
 * syscore resume hook: re-initialise the hardware via init_iommu_hw()
 * (panicking under tboot if that fails), restore the saved fault-event
 * registers on each unit, and free the save areas.
 */
3571 static void iommu_resume(void)
3573 struct dmar_drhd_unit *drhd;
3574 struct intel_iommu *iommu = NULL;
3577 if (init_iommu_hw()) {
3579 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3581 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3585 for_each_active_iommu(iommu, drhd) {
3587 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3589 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3590 iommu->reg + DMAR_FECTL_REG);
3591 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3592 iommu->reg + DMAR_FEDATA_REG);
3593 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3594 iommu->reg + DMAR_FEADDR_REG);
3595 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3596 iommu->reg + DMAR_FEUADDR_REG);
3598 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3601 for_each_active_iommu(iommu, drhd)
3602 kfree(iommu->iommu_state);
/* Register suspend/resume hooks with the syscore framework; compiled
 * out (empty stub) when CONFIG_SUSPEND is not set. */
3605 static struct syscore_ops iommu_syscore_ops = {
3606 .resume = iommu_resume,
3607 .suspend = iommu_suspend,
3610 static void __init init_iommu_pm_ops(void)
3612 register_syscore_ops(&iommu_syscore_ops);
3616 static inline void init_iommu_pm_ops(void) {}
3617 #endif /* CONFIG_PM */
/*
 * Parse one ACPI DMAR RMRR (Reserved Memory Region Reporting) structure:
 * allocate a dmar_rmrr_unit, copy the base/end addresses, build its
 * device scope from the trailing entries, and link it onto
 * dmar_rmrr_units.  Failure paths (kfree + error return) are elided.
 */
3620 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3622 struct acpi_dmar_reserved_memory *rmrr;
3623 struct dmar_rmrr_unit *rmrru;
3625 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3629 rmrru->hdr = header;
3630 rmrr = (struct acpi_dmar_reserved_memory *)header;
3631 rmrru->base_address = rmrr->base_address;
3632 rmrru->end_address = rmrr->end_address;
3633 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3634 ((void *)rmrr) + rmrr->header.length,
3635 &rmrru->devices_cnt);
3636 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3641 list_add(&rmrru->list, &dmar_rmrr_units);
/*
 * Parse one ACPI DMAR ATSR (Address Translation Services Reporting)
 * structure into a dmar_atsr_unit.  A device scope is built only when
 * the ALL_PORTS flag (bit 0) is clear; the unit is added to the
 * RCU-protected dmar_atsr_units list.  Failure paths are elided.
 */
3646 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3648 struct acpi_dmar_atsr *atsr;
3649 struct dmar_atsr_unit *atsru;
3651 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3652 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3657 atsru->include_all = atsr->flags & 0x1;
3658 if (!atsru->include_all) {
3659 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3660 (void *)atsr + atsr->header.length,
3661 &atsru->devices_cnt);
3662 if (atsru->devices_cnt && atsru->devices == NULL) {
3668 list_add_rcu(&atsru->list, &dmar_atsr_units);
/* Release an ATSR unit's device scope (kfree of atsru itself elided). */
3673 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3675 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
/* Teardown helper: unlink and free every parsed RMRR and ATSR unit. */
3680 static void intel_iommu_free_dmars(void)
3681 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3682 struct dmar_atsr_unit *atsru, *atsr_n;
3684 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3685 list_del(&rmrru->list);
3686 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3690 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3691 list_del(&atsru->list);
3692 intel_iommu_free_atsr(atsru);
/*
 * Decide whether ATS is permitted for @dev: walk up to the PCIe root
 * port, then check each ATSR unit in the device's PCI segment — a match
 * on the bridge in the unit's scope, or an include_all unit, means ATS
 * is allowed.  Return statements are elided in this listing.
 */
3696 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3699 struct pci_bus *bus;
3700 struct pci_dev *bridge = NULL;
3702 struct acpi_dmar_atsr *atsr;
3703 struct dmar_atsr_unit *atsru;
/* SR-IOV virtual functions are judged by their physical function. */
3705 dev = pci_physfn(dev);
3706 for (bus = dev->bus; bus; bus = bus->parent) {
3708 if (!bridge || !pci_is_pcie(bridge) ||
3709 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3711 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3718 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3719 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3720 if (atsr->segment != pci_domain_nr(dev->bus))
3723 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3724 if (tmp == &bridge->dev)
3727 if (atsru->include_all)
/*
 * PCI hotplug notification: keep the RMRR and ATSR device-scope arrays
 * in sync as devices appear (BUS_NOTIFY_ADD_DEVICE -> insert into
 * matching scopes) or vanish (BUS_NOTIFY_DEL_DEVICE -> remove).  Skipped
 * entirely until the IOMMU is enabled, except during early boot.
 */
3737 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3740 struct dmar_rmrr_unit *rmrru;
3741 struct dmar_atsr_unit *atsru;
3742 struct acpi_dmar_atsr *atsr;
3743 struct acpi_dmar_reserved_memory *rmrr;
3745 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3748 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3749 rmrr = container_of(rmrru->hdr,
3750 struct acpi_dmar_reserved_memory, header);
3751 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3752 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3753 ((void *)rmrr) + rmrr->header.length,
3754 rmrr->segment, rmrru->devices,
3755 rmrru->devices_cnt);
3760 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3761 if (dmar_remove_dev_scope(info, rmrr->segment,
3762 rmrru->devices, rmrru->devices_cnt))
3767 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3768 if (atsru->include_all)
3771 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3772 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3773 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3774 (void *)atsr + atsr->header.length,
3775 atsr->segment, atsru->devices,
3776 atsru->devices_cnt);
3781 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3782 if (dmar_remove_dev_scope(info, atsr->segment,
3783 atsru->devices, atsru->devices_cnt))
3792 * Here we only respond to action of unbound device from driver.
3794 * Added device is not attached to its DMAR domain here yet. That will happen
3795 * when mapping the device to iova.
/*
 * PCI bus notifier: on driver unbind or device removal, detach the
 * device from its domain; if the domain was dynamically allocated (not
 * a VM or static-identity domain) and is now empty, destroy it.
 */
3797 static int device_notifier(struct notifier_block *nb,
3798 unsigned long action, void *data)
3800 struct device *dev = data;
3801 struct pci_dev *pdev = to_pci_dev(dev);
3802 struct dmar_domain *domain;
3804 if (iommu_dummy(dev))
3807 if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3808 action != BUS_NOTIFY_DEL_DEVICE)
3811 domain = find_domain(pdev);
3815 down_read(&dmar_global_lock);
3816 domain_remove_one_dev_info(domain, pdev);
3817 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3818 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3819 list_empty(&domain->devices))
3820 domain_exit(domain);
3821 up_read(&dmar_global_lock);
3826 static struct notifier_block device_nb = {
3827 .notifier_call = device_notifier,
/*
 * Memory hotplug notifier for the static-identity domain:
 *  - MEM_GOING_ONLINE: extend the identity map over the new range;
 *  - MEM_OFFLINE / MEM_CANCEL_ONLINE: find/split the covering IOVAs,
 *    unmap them, flush each active IOMMU's IOTLB, and free the page
 *    tables and IOVA descriptors.
 * (Some case labels and return statements are elided in this listing.)
 */
3830 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3831 unsigned long val, void *v)
3833 struct memory_notify *mhp = v;
3834 unsigned long long start, end;
3835 unsigned long start_vpfn, last_vpfn;
3838 case MEM_GOING_ONLINE:
3839 start = mhp->start_pfn << PAGE_SHIFT;
3840 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3841 if (iommu_domain_identity_map(si_domain, start, end)) {
3842 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3849 case MEM_CANCEL_ONLINE:
3850 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3851 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3852 while (start_vpfn <= last_vpfn) {
3854 struct dmar_drhd_unit *drhd;
3855 struct intel_iommu *iommu;
3856 struct page *freelist;
3858 iova = find_iova(&si_domain->iovad, start_vpfn);
3860 pr_debug("dmar: failed get IOVA for PFN %lx\n",
3865 iova = split_and_remove_iova(&si_domain->iovad, iova,
3866 start_vpfn, last_vpfn);
3868 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3869 start_vpfn, last_vpfn);
3873 freelist = domain_unmap(si_domain, iova->pfn_lo,
3877 for_each_active_iommu(iommu, drhd)
3878 iommu_flush_iotlb_psi(iommu, si_domain->id,
3880 iova->pfn_hi - iova->pfn_lo + 1,
3883 dma_free_pagelist(freelist);
3885 start_vpfn = iova->pfn_hi + 1;
3886 free_iova_mem(iova);
3894 static struct notifier_block intel_iommu_memory_nb = {
3895 .notifier_call = intel_iommu_memory_notifier,
/*
 * Driver entry point, called from IOMMU init code at boot:
 * parse the DMAR table and device scopes, disable any translation left
 * enabled by firmware, reserve special IOVA ranges, run init_dmars(),
 * install intel_dma_ops as the global dma_ops, register the IOMMU ops,
 * bus/memory notifiers and PM hooks, and set intel_iommu_enabled.
 * Under a tboot (TXT) launch every failure is fatal (panic); otherwise
 * errors unwind through the labels at the bottom.  Several early-exit
 * labels are elided in this listing.
 */
3899 int __init intel_iommu_init(void)
3902 struct dmar_drhd_unit *drhd;
3903 struct intel_iommu *iommu;
3905 /* VT-d is required for a TXT/tboot launch, so enforce that */
3906 force_on = tboot_force_iommu();
3908 if (iommu_init_mempool()) {
3910 panic("tboot: Failed to initialize iommu memory\n");
3914 down_write(&dmar_global_lock);
3915 if (dmar_table_init()) {
3917 panic("tboot: Failed to initialize DMAR table\n");
3922 * Disable translation if already enabled prior to OS handover.
3924 for_each_active_iommu(iommu, drhd)
3925 if (iommu->gcmd & DMA_GCMD_TE)
3926 iommu_disable_translation(iommu);
3928 if (dmar_dev_scope_init() < 0) {
3930 panic("tboot: Failed to initialize DMAR device scope\n");
3934 if (no_iommu || dmar_disabled)
3937 if (list_empty(&dmar_rmrr_units))
3938 printk(KERN_INFO "DMAR: No RMRR found\n");
3940 if (list_empty(&dmar_atsr_units))
3941 printk(KERN_INFO "DMAR: No ATSR found\n");
3943 if (dmar_init_reserved_ranges()) {
3945 panic("tboot: Failed to reserve iommu ranges\n");
3946 goto out_free_reserved_range;
3949 init_no_remapping_devices();
3954 panic("tboot: Failed to initialize DMARs\n");
3955 printk(KERN_ERR "IOMMU: dmar init failed\n");
3956 goto out_free_reserved_range;
3958 up_write(&dmar_global_lock);
3960 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
/* Timer driving the deferred-unmap drain (see flush_unmaps_timeout). */
3962 init_timer(&unmap_timer);
3963 #ifdef CONFIG_SWIOTLB
3966 dma_ops = &intel_dma_ops;
3968 init_iommu_pm_ops();
3970 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3971 bus_register_notifier(&pci_bus_type, &device_nb);
3972 if (si_domain && !hw_pass_through)
3973 register_memory_notifier(&intel_iommu_memory_nb);
3975 intel_iommu_enabled = 1;
3979 out_free_reserved_range:
3980 put_iova_domain(&reserved_iova_list);
3982 intel_iommu_free_dmars();
3983 up_write(&dmar_global_lock);
3984 iommu_exit_mempool();
/*
 * When @pdev sits behind a (PCIe-to-)PCI bridge, all devices on that
 * bridge share one context entry, so detach every bridge on the path up
 * to the upstream PCIe bridge, then the bridge's secondary (PCIe case)
 * or primary (legacy case) bus context.
 */
3988 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3989 struct pci_dev *pdev)
3991 struct pci_dev *tmp, *parent;
3993 if (!iommu || !pdev)
3996 /* dependent device detach */
3997 tmp = pci_find_upstream_pcie_bridge(pdev);
3998 /* Secondary interface's bus number and devfn 0 */
4000 parent = pdev->bus->self;
4001 while (parent != tmp) {
4002 iommu_detach_dev(iommu, parent->bus->number,
4004 parent = parent->bus->self;
4006 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
4007 iommu_detach_dev(iommu,
4008 tmp->subordinate->number, 0);
4009 else /* this is a legacy PCI bridge */
4010 iommu_detach_dev(iommu, tmp->bus->number,
/*
 * Detach @pdev from @domain: unlink its device_domain_info, disable its
 * dev-IOTLB, clear its context entry (and those of dependent bridges),
 * and free the info.  If no remaining device in the domain uses the
 * same IOMMU, the IOMMU is cleared from the domain's bitmap and — for
 * dynamically allocated domains — its domain-id slot is released.
 * NOTE(review): loop-exit bookkeeping lines are elided in this listing.
 */
4015 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4016 struct pci_dev *pdev)
4018 struct device_domain_info *info, *tmp;
4019 struct intel_iommu *iommu;
4020 unsigned long flags;
4023 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4028 spin_lock_irqsave(&device_domain_lock, flags);
4029 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4030 if (info->segment == pci_domain_nr(pdev->bus) &&
4031 info->bus == pdev->bus->number &&
4032 info->devfn == pdev->devfn) {
4033 unlink_domain_info(info);
4034 spin_unlock_irqrestore(&device_domain_lock, flags);
4036 iommu_disable_dev_iotlb(info);
4037 iommu_detach_dev(iommu, info->bus, info->devfn);
4038 iommu_detach_dependent_devices(iommu, pdev);
4039 free_devinfo_mem(info);
4041 spin_lock_irqsave(&device_domain_lock, flags);
4049 /* if there is no other devices under the same iommu
4050 * owned by this domain, clear this iommu in iommu_bmp
4051 * update iommu count and coherency
4053 if (iommu == device_to_iommu(info->segment, info->bus,
4058 spin_unlock_irqrestore(&device_domain_lock, flags);
4061 unsigned long tmp_flags;
4062 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
4063 clear_bit(iommu->seq_id, domain->iommu_bmp);
4064 domain->iommu_count--;
4065 domain_update_iommu_cap(domain);
4066 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
4068 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
4069 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
4070 spin_lock_irqsave(&iommu->lock, tmp_flags);
4071 clear_bit(domain->id, iommu->domain_ids);
4072 iommu->domains[domain->id] = NULL;
4073 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
/*
 * Initialise a domain created through the IOMMU API (as opposed to the
 * DMA-API path): set up its IOVA allocator and reserved ranges, derive
 * agaw from @guest_width, zero the capability fields (recomputed when
 * devices attach), and allocate the top-level page directory.
 */
4078 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4082 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4083 domain_reserve_special_ranges(domain);
4085 /* calculate AGAW */
4086 domain->gaw = guest_width;
4087 adjust_width = guestwidth_to_adjustwidth(guest_width);
4088 domain->agaw = width_to_agaw(adjust_width);
4090 domain->iommu_coherency = 0;
4091 domain->iommu_snooping = 0;
4092 domain->iommu_superpage = 0;
4093 domain->max_addr = 0;
4096 /* always allocate the top pgd */
4097 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4100 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
/*
 * iommu_ops.domain_init: allocate and initialise a dmar_domain at the
 * default address width, stash it in domain->priv, and publish the
 * domain's address-space geometry to the IOMMU core.
 */
4104 static int intel_iommu_domain_init(struct iommu_domain *domain)
4106 struct dmar_domain *dmar_domain;
4108 dmar_domain = alloc_domain(true);
4111 "intel_iommu_domain_init: dmar_domain == NULL\n");
4114 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4116 "intel_iommu_domain_init() failed\n");
4117 domain_exit(dmar_domain);
4120 domain_update_iommu_cap(dmar_domain);
4121 domain->priv = dmar_domain;
4123 domain->geometry.aperture_start = 0;
4124 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4125 domain->geometry.force_aperture = true;
4130 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4132 struct dmar_domain *dmar_domain = domain->priv;
4134 domain->priv = NULL;
4135 domain_exit(dmar_domain);
4138 static int intel_iommu_attach_device(struct iommu_domain *domain,
4141 struct dmar_domain *dmar_domain = domain->priv;
4142 struct pci_dev *pdev = to_pci_dev(dev);
4143 struct intel_iommu *iommu;
4146 /* normally pdev is not mapped */
4147 if (unlikely(domain_context_mapped(pdev))) {
4148 struct dmar_domain *old_domain;
4150 old_domain = find_domain(pdev);
4152 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4153 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4154 domain_remove_one_dev_info(old_domain, pdev);
4156 domain_remove_dev_info(old_domain);
4160 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4165 /* check if this iommu agaw is sufficient for max mapped address */
4166 addr_width = agaw_to_width(iommu->agaw);
4167 if (addr_width > cap_mgaw(iommu->cap))
4168 addr_width = cap_mgaw(iommu->cap);
4170 if (dmar_domain->max_addr > (1LL << addr_width)) {
4171 printk(KERN_ERR "%s: iommu width (%d) is not "
4172 "sufficient for the mapped address (%llx)\n",
4173 __func__, addr_width, dmar_domain->max_addr);
4176 dmar_domain->gaw = addr_width;
4179 * Knock out extra levels of page tables if necessary
4181 while (iommu->agaw < dmar_domain->agaw) {
4182 struct dma_pte *pte;
4184 pte = dmar_domain->pgd;
4185 if (dma_pte_present(pte)) {
4186 dmar_domain->pgd = (struct dma_pte *)
4187 phys_to_virt(dma_pte_addr(pte));
4188 free_pgtable_page(pte);
4190 dmar_domain->agaw--;
4193 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4196 static void intel_iommu_detach_device(struct iommu_domain *domain,
4199 struct dmar_domain *dmar_domain = domain->priv;
4200 struct pci_dev *pdev = to_pci_dev(dev);
4202 domain_remove_one_dev_info(dmar_domain, pdev);
4205 static int intel_iommu_map(struct iommu_domain *domain,
4206 unsigned long iova, phys_addr_t hpa,
4207 size_t size, int iommu_prot)
4209 struct dmar_domain *dmar_domain = domain->priv;
4214 if (iommu_prot & IOMMU_READ)
4215 prot |= DMA_PTE_READ;
4216 if (iommu_prot & IOMMU_WRITE)
4217 prot |= DMA_PTE_WRITE;
4218 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4219 prot |= DMA_PTE_SNP;
4221 max_addr = iova + size;
4222 if (dmar_domain->max_addr < max_addr) {
4225 /* check if minimum agaw is sufficient for mapped address */
4226 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4227 if (end < max_addr) {
4228 printk(KERN_ERR "%s: iommu width (%d) is not "
4229 "sufficient for the mapped address (%llx)\n",
4230 __func__, dmar_domain->gaw, max_addr);
4233 dmar_domain->max_addr = max_addr;
4235 /* Round up size to next multiple of PAGE_SIZE, if it and
4236 the low bits of hpa would take us onto the next page */
4237 size = aligned_nrpages(hpa, size);
4238 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4239 hpa >> VTD_PAGE_SHIFT, size, prot);
4243 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4244 unsigned long iova, size_t size)
4246 struct dmar_domain *dmar_domain = domain->priv;
4247 struct page *freelist = NULL;
4248 struct intel_iommu *iommu;
4249 unsigned long start_pfn, last_pfn;
4250 unsigned int npages;
4251 int iommu_id, num, ndomains, level = 0;
4253 /* Cope with horrid API which requires us to unmap more than the
4254 size argument if it happens to be a large-page mapping. */
4255 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4258 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4259 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4261 start_pfn = iova >> VTD_PAGE_SHIFT;
4262 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4264 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4266 npages = last_pfn - start_pfn + 1;
4268 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4269 iommu = g_iommus[iommu_id];
4272 * find bit position of dmar_domain
4274 ndomains = cap_ndoms(iommu->cap);
4275 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4276 if (iommu->domains[num] == dmar_domain)
4277 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4278 npages, !freelist, 0);
4283 dma_free_pagelist(freelist);
4285 if (dmar_domain->max_addr == iova + size)
4286 dmar_domain->max_addr = iova;
4291 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4294 struct dmar_domain *dmar_domain = domain->priv;
4295 struct dma_pte *pte;
4299 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4301 phys = dma_pte_addr(pte);
4306 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4309 struct dmar_domain *dmar_domain = domain->priv;
4311 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4312 return dmar_domain->iommu_snooping;
4313 if (cap == IOMMU_CAP_INTR_REMAP)
4314 return irq_remapping_enabled;
4319 #define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4321 static int intel_iommu_add_device(struct device *dev)
4323 struct pci_dev *pdev = to_pci_dev(dev);
4324 struct pci_dev *bridge, *dma_pdev = NULL;
4325 struct iommu_group *group;
4328 if (!device_to_iommu(pci_domain_nr(pdev->bus),
4329 pdev->bus->number, pdev->devfn))
4332 bridge = pci_find_upstream_pcie_bridge(pdev);
4334 if (pci_is_pcie(bridge))
4335 dma_pdev = pci_get_domain_bus_and_slot(
4336 pci_domain_nr(pdev->bus),
4337 bridge->subordinate->number, 0);
4339 dma_pdev = pci_dev_get(bridge);
4341 dma_pdev = pci_dev_get(pdev);
4343 /* Account for quirked devices */
4344 swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4347 * If it's a multifunction device that does not support our
4348 * required ACS flags, add to the same group as lowest numbered
4349 * function that also does not suport the required ACS flags.
4351 if (dma_pdev->multifunction &&
4352 !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4353 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4355 for (i = 0; i < 8; i++) {
4356 struct pci_dev *tmp;
4358 tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4362 if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4363 swap_pci_ref(&dma_pdev, tmp);
4371 * Devices on the root bus go through the iommu. If that's not us,
4372 * find the next upstream device and test ACS up to the root bus.
4373 * Finding the next device may require skipping virtual buses.
4375 while (!pci_is_root_bus(dma_pdev->bus)) {
4376 struct pci_bus *bus = dma_pdev->bus;
4378 while (!bus->self) {
4379 if (!pci_is_root_bus(bus))
4385 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4388 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4392 group = iommu_group_get(&dma_pdev->dev);
4393 pci_dev_put(dma_pdev);
4395 group = iommu_group_alloc();
4397 return PTR_ERR(group);
4400 ret = iommu_group_add_device(group, dev);
4402 iommu_group_put(group);
/*
 * iommu_ops->remove_device callback: drop the device from its
 * iommu_group (the group itself is refcounted and freed when empty).
 */
static void intel_iommu_remove_device(struct device *dev)
{
	iommu_group_remove_device(dev);
}
4411 static struct iommu_ops intel_iommu_ops = {
4412 .domain_init = intel_iommu_domain_init,
4413 .domain_destroy = intel_iommu_domain_destroy,
4414 .attach_dev = intel_iommu_attach_device,
4415 .detach_dev = intel_iommu_detach_device,
4416 .map = intel_iommu_map,
4417 .unmap = intel_iommu_unmap,
4418 .iova_to_phys = intel_iommu_iova_to_phys,
4419 .domain_has_cap = intel_iommu_domain_has_cap,
4420 .add_device = intel_iommu_add_device,
4421 .remove_device = intel_iommu_remove_device,
4422 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4425 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4427 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4428 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4434 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4435 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4438 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4440 static void quirk_iommu_rwbf(struct pci_dev *dev)
4443 * Mobile 4 Series Chipset neglects to set RWBF capability,
4444 * but needs it. Same seems to hold for the desktop versions.
4446 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4455 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4456 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4459 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4460 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4461 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4462 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4463 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4464 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4465 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4466 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4468 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4472 if (pci_read_config_word(dev, GGC, &ggc))
4475 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4476 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4478 } else if (dmar_map_gfx) {
4479 /* we have to ensure the gfx device is idle before we flush */
4480 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4481 intel_iommu_strict = 1;
4484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4487 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4489 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4490 ISOCH DMAR unit for the Azalia sound device, but not give it any
4491 TLB entries, which causes it to deadlock. Check for that. We do
4492 this in a function called from init_dmars(), instead of in a PCI
4493 quirk, because we don't want to print the obnoxious "BIOS broken"
4494 message if VT-d is actually disabled.
4496 static void __init check_tylersburg_isoch(void)
4498 struct pci_dev *pdev;
4499 uint32_t vtisochctrl;
4501 /* If there's no Azalia in the system anyway, forget it. */
4502 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4507 /* System Management Registers. Might be hidden, in which case
4508 we can't do the sanity check. But that's OK, because the
4509 known-broken BIOSes _don't_ actually hide it, so far. */
4510 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4514 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4521 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4522 if (vtisochctrl & 1)
4525 /* Drop all bits other than the number of TLB entries */
4526 vtisochctrl &= 0x1c;
4528 /* If we have the recommended number of TLB entries (16), fine. */
4529 if (vtisochctrl == 0x10)
4532 /* Zero TLB entries? You get to ride the short bus to school. */
4534 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4535 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4536 dmi_get_system_info(DMI_BIOS_VENDOR),
4537 dmi_get_system_info(DMI_BIOS_VERSION),
4538 dmi_get_system_info(DMI_PRODUCT_VERSION));
4539 iommu_identity_mapping |= IDENTMAP_AZALIA;
4543 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",