2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-contiguous.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
47 #include "irq_remapping.h"
49 #define ROOT_SIZE VTD_PAGE_SIZE
50 #define CONTEXT_SIZE VTD_PAGE_SIZE
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56 #define IOAPIC_RANGE_START (0xfee00000)
57 #define IOAPIC_RANGE_END (0xfeefffff)
58 #define IOVA_START_ADDR (0x1000)
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
65 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
71 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
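/*
 * Worked example (illustrative only): with the default 48-bit guest
 * address width, __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1, which already
 * fits in an unsigned long on 64-bit builds, so DOMAIN_MAX_PFN(48) is the
 * same value and DOMAIN_MAX_ADDR(48) is that pfn shifted left by
 * VTD_PAGE_SHIFT. The min_t() clamp only matters on 32-bit builds, where
 * the pfn is capped at (unsigned long)-1.
 */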
74 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
78 /* page table handling */
79 #define LEVEL_STRIDE (9)
80 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
83 * This bitmap is used to advertise the page sizes our hardware supports
84 * to the IOMMU core, which will then use this information to split
85 * physically contiguous memory regions it is mapping into page sizes
88 * Traditionally the IOMMU core just handed us the mappings directly,
89 * after making sure the size is an order of a 4KiB page and that the
90 * mapping has natural alignment.
92 * To retain this behavior, we currently advertise that we support
93 * all page sizes that are an order of 4KiB.
95 * If at some point we'd like to utilize the IOMMU core's new behavior,
96 * we could change this to advertise the real page sizes we support.
98 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
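/*
 * Illustrative sketch (not part of the driver): with the mask above, a
 * mapping size is advertised as supported iff it is a power of two of at
 * least 4KiB, e.g.
 *
 *	bool pgsize_is_advertised(unsigned long size)
 *	{
 *		return is_power_of_2(size) && (size & INTEL_IOMMU_PGSIZES);
 *	}
 *
 * so 4KiB, 8KiB, 2MiB and 1GiB all qualify, while 12KiB does not.
 */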
100 static inline int agaw_to_level(int agaw)
105 static inline int agaw_to_width(int agaw)
107 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
110 static inline int width_to_agaw(int width)
112 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
115 static inline unsigned int level_to_offset_bits(int level)
117 return (level - 1) * LEVEL_STRIDE;
120 static inline int pfn_level_offset(unsigned long pfn, int level)
122 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
125 static inline unsigned long level_mask(int level)
127 return -1UL << level_to_offset_bits(level);
130 static inline unsigned long level_size(int level)
132 return 1UL << level_to_offset_bits(level);
135 static inline unsigned long align_to_level(unsigned long pfn, int level)
137 return (pfn + level_size(level) - 1) & level_mask(level);
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
142 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
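/*
 * Worked example (illustrative only): with LEVEL_STRIDE == 9 each page
 * table level indexes nine bits of the DMA pfn. For pfn 0x12345 in a
 * 4-level table:
 *
 *	pfn_level_offset(0x12345, 1) == 0x145	(bits 0-8)
 *	pfn_level_offset(0x12345, 2) == 0x091	(bits 9-17)
 *	pfn_level_offset(0x12345, 3) == 0	(bits 18-26)
 *	pfn_level_offset(0x12345, 4) == 0	(bits 27-35)
 *
 * and lvl_to_nr_pages(2) == 512, i.e. one level-2 (2MiB) superpage covers
 * 512 4KiB pages.
 */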
145 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
146 are never going to work. */
147 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
149 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
152 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
154 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 static inline unsigned long page_to_dma_pfn(struct page *pg)
158 return mm_to_dma_pfn(page_to_pfn(pg));
160 static inline unsigned long virt_to_dma_pfn(void *p)
162 return page_to_dma_pfn(virt_to_page(p));
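/*
 * Worked example (illustrative only): on x86 PAGE_SHIFT and VTD_PAGE_SHIFT
 * are both 12, so these conversions are identities. On a hypothetical
 * 64KiB-page kernel (PAGE_SHIFT == 16) one MM pfn would correspond to
 * sixteen VT-d pfns and mm_to_dma_pfn(n) would be n << 4.
 */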
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
172 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
173 * (used when the kernel is launched w/ TXT)
175 static int force_on = 0;
180 * 12-63: Context Ptr (12 - (haw-1))
187 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 static inline bool root_present(struct root_entry *root)
190 return (root->val & 1);
192 static inline void set_root_present(struct root_entry *root)
196 static inline void set_root_value(struct root_entry *root, unsigned long value)
198 root->val &= ~VTD_PAGE_MASK;
199 root->val |= value & VTD_PAGE_MASK;
202 static inline struct context_entry *
203 get_context_addr_from_root(struct root_entry *root)
205 return (struct context_entry *)
206 (root_present(root)?phys_to_virt(
207 root->val & VTD_PAGE_MASK) :
214 * 1: fault processing disable
215 * 2-3: translation type
216 * 12-63: address space root
222 struct context_entry {
227 static inline bool context_present(struct context_entry *context)
229 return (context->lo & 1);
231 static inline void context_set_present(struct context_entry *context)
236 static inline void context_set_fault_enable(struct context_entry *context)
238 context->lo &= (((u64)-1) << 2) | 1;
241 static inline void context_set_translation_type(struct context_entry *context,
244 context->lo &= (((u64)-1) << 4) | 3;
245 context->lo |= (value & 3) << 2;
248 static inline void context_set_address_root(struct context_entry *context,
251 context->lo &= ~VTD_PAGE_MASK;
252 context->lo |= value & VTD_PAGE_MASK;
255 static inline void context_set_address_width(struct context_entry *context,
258 context->hi |= value & 7;
261 static inline void context_set_domain_id(struct context_entry *context,
264 context->hi |= (value & ((1 << 16) - 1)) << 8;
267 static inline void context_clear_entry(struct context_entry *context)
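/*
 * Illustrative sketch (not part of the driver): the helpers above are used
 * together when a context entry is programmed, in the same order as
 * domain_context_mapping_one() below uses for the multi-level case:
 *
 *	context_set_domain_id(context, id);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_address_width(context, iommu->agaw);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 *	domain_flush_cache(domain, context, sizeof(*context));
 */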
280 * 12-63: Host physical address
286 static inline void dma_clear_pte(struct dma_pte *pte)
291 static inline u64 dma_pte_addr(struct dma_pte *pte)
294 return pte->val & VTD_PAGE_MASK;
296 /* Must have a full atomic 64-bit read */
297 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
301 static inline bool dma_pte_present(struct dma_pte *pte)
303 return (pte->val & 3) != 0;
306 static inline bool dma_pte_superpage(struct dma_pte *pte)
308 return (pte->val & DMA_PTE_LARGE_PAGE);
311 static inline int first_pte_in_page(struct dma_pte *pte)
313 return !((unsigned long)pte & ~VTD_PAGE_MASK);
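/*
 * Illustrative sketch (not part of the driver): a leaf PTE value is simply
 * the page-aligned host physical address OR'd with the permission bits, as
 * __domain_mapping() does further down:
 *
 *	pteval = page_to_phys(page) | DMA_PTE_READ | DMA_PTE_WRITE;
 *
 * dma_pte_present() is then true because bits 0-1 are non-zero, and
 * dma_pte_addr() recovers the address by masking those low bits off again.
 */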
317 * This domain is a static identity mapping domain.
318 * 1. This domain creates a static 1:1 mapping to all usable memory.
319 * 2. It maps to each iommu if successful.
320 * 3. Each iommu maps to this domain if successful.
322 static struct dmar_domain *si_domain;
323 static int hw_pass_through = 1;
325 /* domain represents a virtual machine; more than one device
326 * across iommus may be owned by one domain, e.g. a kvm guest.
328 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
330 /* si_domain contains multiple devices */
331 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
334 int id; /* domain id */
335 int nid; /* node id */
336 DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
337 /* bitmap of iommus this domain uses */
339 struct list_head devices; /* all devices' list */
340 struct iova_domain iovad; /* iova's that belong to this domain */
342 struct dma_pte *pgd; /* virtual address */
343 int gaw; /* max guest address width */
345 /* adjusted guest address width, 0 is level 2 30-bit */
348 int flags; /* flags to find out type of domain */
350 int iommu_coherency;/* indicate coherency of iommu access */
351 int iommu_snooping; /* indicate snooping control feature*/
352 int iommu_count; /* reference count of iommu */
353 int iommu_superpage;/* Level of superpages supported:
354 0 == 4KiB (no superpages), 1 == 2MiB,
355 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
356 spinlock_t iommu_lock; /* protect iommu set in domain */
357 u64 max_addr; /* maximum mapped address */
360 /* PCI domain-device relationship */
361 struct device_domain_info {
362 struct list_head link; /* link to domain siblings */
363 struct list_head global; /* link to global list */
364 u8 bus; /* PCI bus number */
365 u8 devfn; /* PCI devfn number */
366 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
367 struct intel_iommu *iommu; /* IOMMU used by this device */
368 struct dmar_domain *domain; /* pointer to domain */
371 struct dmar_rmrr_unit {
372 struct list_head list; /* list of rmrr units */
373 struct acpi_dmar_header *hdr; /* ACPI header */
374 u64 base_address; /* reserved base address*/
375 u64 end_address; /* reserved end address */
376 struct dmar_dev_scope *devices; /* target devices */
377 int devices_cnt; /* target device count */
380 struct dmar_atsr_unit {
381 struct list_head list; /* list of ATSR units */
382 struct acpi_dmar_header *hdr; /* ACPI header */
383 struct dmar_dev_scope *devices; /* target devices */
384 int devices_cnt; /* target device count */
385 u8 include_all:1; /* include all ports */
388 static LIST_HEAD(dmar_atsr_units);
389 static LIST_HEAD(dmar_rmrr_units);
391 #define for_each_rmrr_units(rmrr) \
392 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
394 static void flush_unmaps_timeout(unsigned long data);
396 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
398 #define HIGH_WATER_MARK 250
399 struct deferred_flush_tables {
401 struct iova *iova[HIGH_WATER_MARK];
402 struct dmar_domain *domain[HIGH_WATER_MARK];
403 struct page *freelist[HIGH_WATER_MARK];
406 static struct deferred_flush_tables *deferred_flush;
408 /* bitmap for indexing intel_iommus */
409 static int g_num_of_iommus;
411 static DEFINE_SPINLOCK(async_umap_flush_lock);
412 static LIST_HEAD(unmaps_to_do);
415 static long list_size;
417 static void domain_exit(struct dmar_domain *domain);
418 static void domain_remove_dev_info(struct dmar_domain *domain);
419 static void domain_remove_one_dev_info(struct dmar_domain *domain,
421 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
423 static int domain_detach_iommu(struct dmar_domain *domain,
424 struct intel_iommu *iommu);
426 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
427 int dmar_disabled = 0;
429 int dmar_disabled = 1;
430 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
432 int intel_iommu_enabled = 0;
433 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
435 static int dmar_map_gfx = 1;
436 static int dmar_forcedac;
437 static int intel_iommu_strict;
438 static int intel_iommu_superpage = 1;
440 int intel_iommu_gfx_mapped;
441 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
443 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
444 static DEFINE_SPINLOCK(device_domain_lock);
445 static LIST_HEAD(device_domain_list);
447 static const struct iommu_ops intel_iommu_ops;
449 static int __init intel_iommu_setup(char *str)
454 if (!strncmp(str, "on", 2)) {
456 printk(KERN_INFO "Intel-IOMMU: enabled\n");
457 } else if (!strncmp(str, "off", 3)) {
459 printk(KERN_INFO "Intel-IOMMU: disabled\n");
460 } else if (!strncmp(str, "igfx_off", 8)) {
463 "Intel-IOMMU: disable GFX device mapping\n");
464 } else if (!strncmp(str, "forcedac", 8)) {
466 "Intel-IOMMU: Forcing DAC for PCI devices\n");
468 } else if (!strncmp(str, "strict", 6)) {
470 "Intel-IOMMU: disable batched IOTLB flush\n");
471 intel_iommu_strict = 1;
472 } else if (!strncmp(str, "sp_off", 6)) {
474 "Intel-IOMMU: disable supported super page\n");
475 intel_iommu_superpage = 0;
478 str += strcspn(str, ",");
484 __setup("intel_iommu=", intel_iommu_setup);
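/*
 * Example (illustrative only): the options parsed above are comma
 * separated on the kernel command line, so e.g.
 *
 *	intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage support in one go.
 */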
486 static struct kmem_cache *iommu_domain_cache;
487 static struct kmem_cache *iommu_devinfo_cache;
488 static struct kmem_cache *iommu_iova_cache;
490 static inline void *alloc_pgtable_page(int node)
495 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
497 vaddr = page_address(page);
501 static inline void free_pgtable_page(void *vaddr)
503 free_page((unsigned long)vaddr);
506 static inline void *alloc_domain_mem(void)
508 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 static void free_domain_mem(void *vaddr)
513 kmem_cache_free(iommu_domain_cache, vaddr);
516 static inline void * alloc_devinfo_mem(void)
518 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 static inline void free_devinfo_mem(void *vaddr)
523 kmem_cache_free(iommu_devinfo_cache, vaddr);
526 struct iova *alloc_iova_mem(void)
528 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531 void free_iova_mem(struct iova *iova)
533 kmem_cache_free(iommu_iova_cache, iova);
536 static inline int domain_type_is_vm(struct dmar_domain *domain)
538 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
541 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
543 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
544 DOMAIN_FLAG_STATIC_IDENTITY);
547 static inline int domain_pfn_supported(struct dmar_domain *domain,
550 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
552 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
555 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
560 sagaw = cap_sagaw(iommu->cap);
561 for (agaw = width_to_agaw(max_gaw);
563 if (test_bit(agaw, &sagaw))
571 * Calculate max SAGAW for each iommu.
573 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
575 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
579 * Calculate agaw for each iommu.
580 * "SAGAW" may be different across iommus; use a default agaw, and
581 * get a smaller supported agaw for iommus that don't support the default agaw.
583 int iommu_calculate_agaw(struct intel_iommu *iommu)
585 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
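/*
 * Worked example (illustrative only): DEFAULT_DOMAIN_ADDRESS_WIDTH is 48,
 * so width_to_agaw(48) == 2 is tried first. If this iommu's SAGAW field
 * only advertises bit 1 (3-level, 39-bit), __iommu_calculate_agaw() walks
 * down and returns agaw 1 instead, and agaw_to_width(1) == 39.
 */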
588 /* This function only returns a single iommu in a domain */
589 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
593 /* si_domain and vm domain should not get here. */
594 BUG_ON(domain_type_is_vm_or_si(domain));
595 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
596 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
599 return g_iommus[iommu_id];
602 static void domain_update_iommu_coherency(struct dmar_domain *domain)
604 struct dmar_drhd_unit *drhd;
605 struct intel_iommu *iommu;
608 domain->iommu_coherency = 1;
610 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
612 if (!ecap_coherent(g_iommus[i]->ecap)) {
613 domain->iommu_coherency = 0;
620 /* No hardware attached; use lowest common denominator */
622 for_each_active_iommu(iommu, drhd) {
623 if (!ecap_coherent(iommu->ecap)) {
624 domain->iommu_coherency = 0;
631 static int domain_update_iommu_snooping(struct intel_iommu *skip)
633 struct dmar_drhd_unit *drhd;
634 struct intel_iommu *iommu;
638 for_each_active_iommu(iommu, drhd) {
640 if (!ecap_sc_support(iommu->ecap)) {
651 static int domain_update_iommu_superpage(struct intel_iommu *skip)
653 struct dmar_drhd_unit *drhd;
654 struct intel_iommu *iommu;
657 if (!intel_iommu_superpage) {
661 /* set iommu_superpage to the smallest common denominator */
663 for_each_active_iommu(iommu, drhd) {
665 mask &= cap_super_page_val(iommu->cap);
675 /* Some capabilities may be different across iommus */
676 static void domain_update_iommu_cap(struct dmar_domain *domain)
678 domain_update_iommu_coherency(domain);
679 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
680 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
685 struct dmar_drhd_unit *drhd = NULL;
686 struct intel_iommu *iommu;
688 struct pci_dev *ptmp, *pdev = NULL;
692 if (dev_is_pci(dev)) {
693 pdev = to_pci_dev(dev);
694 segment = pci_domain_nr(pdev->bus);
695 } else if (ACPI_COMPANION(dev))
696 dev = &ACPI_COMPANION(dev)->dev;
699 for_each_active_iommu(iommu, drhd) {
700 if (pdev && segment != drhd->segment)
703 for_each_active_dev_scope(drhd->devices,
704 drhd->devices_cnt, i, tmp) {
706 *bus = drhd->devices[i].bus;
707 *devfn = drhd->devices[i].devfn;
711 if (!pdev || !dev_is_pci(tmp))
714 ptmp = to_pci_dev(tmp);
715 if (ptmp->subordinate &&
716 ptmp->subordinate->number <= pdev->bus->number &&
717 ptmp->subordinate->busn_res.end >= pdev->bus->number)
721 if (pdev && drhd->include_all) {
723 *bus = pdev->bus->number;
724 *devfn = pdev->devfn;
735 static void domain_flush_cache(struct dmar_domain *domain,
736 void *addr, int size)
738 if (!domain->iommu_coherency)
739 clflush_cache_range(addr, size);
742 /* Gets context entry for a given bus and devfn */
743 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
746 struct root_entry *root;
747 struct context_entry *context;
748 unsigned long phy_addr;
751 spin_lock_irqsave(&iommu->lock, flags);
752 root = &iommu->root_entry[bus];
753 context = get_context_addr_from_root(root);
755 context = (struct context_entry *)
756 alloc_pgtable_page(iommu->node);
758 spin_unlock_irqrestore(&iommu->lock, flags);
761 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
762 phy_addr = virt_to_phys((void *)context);
763 set_root_value(root, phy_addr);
764 set_root_present(root);
765 __iommu_flush_cache(iommu, root, sizeof(*root));
767 spin_unlock_irqrestore(&iommu->lock, flags);
768 return &context[devfn];
771 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
773 struct root_entry *root;
774 struct context_entry *context;
778 spin_lock_irqsave(&iommu->lock, flags);
779 root = &iommu->root_entry[bus];
780 context = get_context_addr_from_root(root);
785 ret = context_present(&context[devfn]);
787 spin_unlock_irqrestore(&iommu->lock, flags);
791 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
793 struct root_entry *root;
794 struct context_entry *context;
797 spin_lock_irqsave(&iommu->lock, flags);
798 root = &iommu->root_entry[bus];
799 context = get_context_addr_from_root(root);
801 context_clear_entry(&context[devfn]);
802 __iommu_flush_cache(iommu, &context[devfn], \
805 spin_unlock_irqrestore(&iommu->lock, flags);
808 static void free_context_table(struct intel_iommu *iommu)
810 struct root_entry *root;
813 struct context_entry *context;
815 spin_lock_irqsave(&iommu->lock, flags);
816 if (!iommu->root_entry) {
819 for (i = 0; i < ROOT_ENTRY_NR; i++) {
820 root = &iommu->root_entry[i];
821 context = get_context_addr_from_root(root);
823 free_pgtable_page(context);
825 free_pgtable_page(iommu->root_entry);
826 iommu->root_entry = NULL;
828 spin_unlock_irqrestore(&iommu->lock, flags);
831 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
832 unsigned long pfn, int *target_level)
834 struct dma_pte *parent, *pte = NULL;
835 int level = agaw_to_level(domain->agaw);
838 BUG_ON(!domain->pgd);
840 if (!domain_pfn_supported(domain, pfn))
841 /* Address beyond IOMMU's addressing capabilities. */
844 parent = domain->pgd;
849 offset = pfn_level_offset(pfn, level);
850 pte = &parent[offset];
851 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
853 if (level == *target_level)
856 if (!dma_pte_present(pte)) {
859 tmp_page = alloc_pgtable_page(domain->nid);
864 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
865 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
866 if (cmpxchg64(&pte->val, 0ULL, pteval))
867 /* Someone else set it while we were thinking; use theirs. */
868 free_pgtable_page(tmp_page);
870 domain_flush_cache(domain, pte, sizeof(*pte));
875 parent = phys_to_virt(dma_pte_addr(pte));
880 *target_level = level;
886 /* return address's pte at specific level */
887 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
889 int level, int *large_page)
891 struct dma_pte *parent, *pte = NULL;
892 int total = agaw_to_level(domain->agaw);
895 parent = domain->pgd;
896 while (level <= total) {
897 offset = pfn_level_offset(pfn, total);
898 pte = &parent[offset];
902 if (!dma_pte_present(pte)) {
907 if (dma_pte_superpage(pte)) {
912 parent = phys_to_virt(dma_pte_addr(pte));
918 /* clear last level pte; a tlb flush should follow */
919 static void dma_pte_clear_range(struct dmar_domain *domain,
920 unsigned long start_pfn,
921 unsigned long last_pfn)
923 unsigned int large_page = 1;
924 struct dma_pte *first_pte, *pte;
926 BUG_ON(!domain_pfn_supported(domain, start_pfn));
927 BUG_ON(!domain_pfn_supported(domain, last_pfn));
928 BUG_ON(start_pfn > last_pfn);
930 /* we don't need lock here; nobody else touches the iova range */
933 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
935 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
940 start_pfn += lvl_to_nr_pages(large_page);
942 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
944 domain_flush_cache(domain, first_pte,
945 (void *)pte - (void *)first_pte);
947 } while (start_pfn && start_pfn <= last_pfn);
950 static void dma_pte_free_level(struct dmar_domain *domain, int level,
951 struct dma_pte *pte, unsigned long pfn,
952 unsigned long start_pfn, unsigned long last_pfn)
954 pfn = max(start_pfn, pfn);
955 pte = &pte[pfn_level_offset(pfn, level)];
958 unsigned long level_pfn;
959 struct dma_pte *level_pte;
961 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
964 level_pfn = pfn & level_mask(level - 1);
965 level_pte = phys_to_virt(dma_pte_addr(pte));
968 dma_pte_free_level(domain, level - 1, level_pte,
969 level_pfn, start_pfn, last_pfn);
971 /* If range covers entire pagetable, free it */
972 if (!(start_pfn > level_pfn ||
973 last_pfn < level_pfn + level_size(level) - 1)) {
975 domain_flush_cache(domain, pte, sizeof(*pte));
976 free_pgtable_page(level_pte);
979 pfn += level_size(level);
980 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
983 /* free page table pages. last level pte should already be cleared */
984 static void dma_pte_free_pagetable(struct dmar_domain *domain,
985 unsigned long start_pfn,
986 unsigned long last_pfn)
988 BUG_ON(!domain_pfn_supported(domain, start_pfn));
989 BUG_ON(!domain_pfn_supported(domain, last_pfn));
990 BUG_ON(start_pfn > last_pfn);
992 dma_pte_clear_range(domain, start_pfn, last_pfn);
994 /* We don't need lock here; nobody else touches the iova range */
995 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
996 domain->pgd, 0, start_pfn, last_pfn);
999 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1000 free_pgtable_page(domain->pgd);
1005 /* When a page at a given level is being unlinked from its parent, we don't
1006 need to *modify* it at all. All we need to do is make a list of all the
1007 pages which can be freed just as soon as we've flushed the IOTLB and we
1008 know the hardware page-walk will no longer touch them.
1009 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
1011 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1012 int level, struct dma_pte *pte,
1013 struct page *freelist)
1017 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1018 pg->freelist = freelist;
1024 pte = page_address(pg);
1026 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1027 freelist = dma_pte_list_pagetables(domain, level - 1,
1030 } while (!first_pte_in_page(pte));
1035 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1036 struct dma_pte *pte, unsigned long pfn,
1037 unsigned long start_pfn,
1038 unsigned long last_pfn,
1039 struct page *freelist)
1041 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1043 pfn = max(start_pfn, pfn);
1044 pte = &pte[pfn_level_offset(pfn, level)];
1047 unsigned long level_pfn;
1049 if (!dma_pte_present(pte))
1052 level_pfn = pfn & level_mask(level);
1054 /* If range covers entire pagetable, free it */
1055 if (start_pfn <= level_pfn &&
1056 last_pfn >= level_pfn + level_size(level) - 1) {
1057 /* These subordinate page tables are going away entirely. Don't
1058 bother to clear them; we're just going to *free* them. */
1059 if (level > 1 && !dma_pte_superpage(pte))
1060 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1066 } else if (level > 1) {
1067 /* Recurse down into a level that isn't *entirely* obsolete */
1068 freelist = dma_pte_clear_level(domain, level - 1,
1069 phys_to_virt(dma_pte_addr(pte)),
1070 level_pfn, start_pfn, last_pfn,
1074 pfn += level_size(level);
1075 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1078 domain_flush_cache(domain, first_pte,
1079 (void *)++last_pte - (void *)first_pte);
1084 /* We can't just free the pages because the IOMMU may still be walking
1085 the page tables, and may have cached the intermediate levels. The
1086 pages can only be freed after the IOTLB flush has been done. */
1087 struct page *domain_unmap(struct dmar_domain *domain,
1088 unsigned long start_pfn,
1089 unsigned long last_pfn)
1091 struct page *freelist = NULL;
1093 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1094 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1095 BUG_ON(start_pfn > last_pfn);
1097 /* we don't need lock here; nobody else touches the iova range */
1098 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1099 domain->pgd, 0, start_pfn, last_pfn, NULL);
1102 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1103 struct page *pgd_page = virt_to_page(domain->pgd);
1104 pgd_page->freelist = freelist;
1105 freelist = pgd_page;
1113 void dma_free_pagelist(struct page *freelist)
1117 while ((pg = freelist)) {
1118 freelist = pg->freelist;
1119 free_pgtable_page(page_address(pg));
1123 /* iommu handling */
1124 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1126 struct root_entry *root;
1127 unsigned long flags;
1129 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1131 pr_err("IOMMU: allocating root entry for %s failed\n",
1136 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1138 spin_lock_irqsave(&iommu->lock, flags);
1139 iommu->root_entry = root;
1140 spin_unlock_irqrestore(&iommu->lock, flags);
1145 static void iommu_set_root_entry(struct intel_iommu *iommu)
1151 addr = iommu->root_entry;
1153 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1154 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1156 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1158 /* Make sure hardware completes it */
1159 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1160 readl, (sts & DMA_GSTS_RTPS), sts);
1162 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1165 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1170 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1173 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1174 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1176 /* Make sure hardware completes it */
1177 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1178 readl, (!(val & DMA_GSTS_WBFS)), val);
1180 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1183 /* return value determines whether we need a write buffer flush */
1184 static void __iommu_flush_context(struct intel_iommu *iommu,
1185 u16 did, u16 source_id, u8 function_mask,
1192 case DMA_CCMD_GLOBAL_INVL:
1193 val = DMA_CCMD_GLOBAL_INVL;
1195 case DMA_CCMD_DOMAIN_INVL:
1196 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1198 case DMA_CCMD_DEVICE_INVL:
1199 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1200 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1205 val |= DMA_CCMD_ICC;
1207 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1208 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1210 /* Make sure hardware completes it */
1211 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1212 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1214 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1217 /* return value determines whether we need a write buffer flush */
1218 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1219 u64 addr, unsigned int size_order, u64 type)
1221 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1222 u64 val = 0, val_iva = 0;
1226 case DMA_TLB_GLOBAL_FLUSH:
1227 /* global flush doesn't need to set IVA_REG */
1228 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1230 case DMA_TLB_DSI_FLUSH:
1231 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1233 case DMA_TLB_PSI_FLUSH:
1234 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1235 /* IH bit is passed in as part of address */
1236 val_iva = size_order | addr;
1241 /* Note: set drain read/write */
1244 * This is probably meant to be super secure. Looks like we can
1245 * ignore it without any impact.
1247 if (cap_read_drain(iommu->cap))
1248 val |= DMA_TLB_READ_DRAIN;
1250 if (cap_write_drain(iommu->cap))
1251 val |= DMA_TLB_WRITE_DRAIN;
1253 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1254 /* Note: Only uses first TLB reg currently */
1256 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1257 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1259 /* Make sure hardware completes it */
1260 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1261 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1263 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1265 /* check IOTLB invalidation granularity */
1266 if (DMA_TLB_IAIG(val) == 0)
1267 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1268 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1269 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1270 (unsigned long long)DMA_TLB_IIRG(type),
1271 (unsigned long long)DMA_TLB_IAIG(val));
1274 static struct device_domain_info *
1275 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1279 unsigned long flags;
1280 struct device_domain_info *info;
1281 struct pci_dev *pdev;
1283 if (!ecap_dev_iotlb_support(iommu->ecap))
1289 spin_lock_irqsave(&device_domain_lock, flags);
1290 list_for_each_entry(info, &domain->devices, link)
1291 if (info->iommu == iommu && info->bus == bus &&
1292 info->devfn == devfn) {
1296 spin_unlock_irqrestore(&device_domain_lock, flags);
1298 if (!found || !info->dev || !dev_is_pci(info->dev))
1301 pdev = to_pci_dev(info->dev);
1303 if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1306 if (!dmar_find_matched_atsr_unit(pdev))
1312 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1314 if (!info || !dev_is_pci(info->dev))
1317 pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1320 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1322 if (!info->dev || !dev_is_pci(info->dev) ||
1323 !pci_ats_enabled(to_pci_dev(info->dev)))
1326 pci_disable_ats(to_pci_dev(info->dev));
1329 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1330 u64 addr, unsigned mask)
1333 unsigned long flags;
1334 struct device_domain_info *info;
1336 spin_lock_irqsave(&device_domain_lock, flags);
1337 list_for_each_entry(info, &domain->devices, link) {
1338 struct pci_dev *pdev;
1339 if (!info->dev || !dev_is_pci(info->dev))
1342 pdev = to_pci_dev(info->dev);
1343 if (!pci_ats_enabled(pdev))
1346 sid = info->bus << 8 | info->devfn;
1347 qdep = pci_ats_queue_depth(pdev);
1348 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1350 spin_unlock_irqrestore(&device_domain_lock, flags);
1353 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1354 unsigned long pfn, unsigned int pages, int ih, int map)
1356 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1357 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1364 * Fall back to domain-selective flush if there is no PSI support or the size is too big.
1366 * PSI requires page size to be 2 ^ x, and the base address is naturally
1367 * aligned to the size
1369 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1370 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1373 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1377 * In caching mode, changes of pages from non-present to present require
1378 * flush. However, device IOTLB doesn't need to be flushed in this case.
1380 if (!cap_caching_mode(iommu->cap) || !map)
1381 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
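/*
 * Worked example (illustrative only): flushing 3 pages gives
 * mask == ilog2(__roundup_pow_of_two(3)) == 2, i.e. a page-selective
 * invalidation covering 4 pages naturally aligned to addr; a single page
 * gives mask 0. If mask exceeds the capability's MAMV field the code
 * above falls back to a domain-selective flush instead.
 */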
1384 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1387 unsigned long flags;
1389 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1390 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1391 pmen &= ~DMA_PMEN_EPM;
1392 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1394 /* wait for the protected region status bit to clear */
1395 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1396 readl, !(pmen & DMA_PMEN_PRS), pmen);
1398 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1401 static void iommu_enable_translation(struct intel_iommu *iommu)
1404 unsigned long flags;
1406 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1407 iommu->gcmd |= DMA_GCMD_TE;
1408 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1410 /* Make sure hardware completes it */
1411 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1412 readl, (sts & DMA_GSTS_TES), sts);
1414 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1417 static void iommu_disable_translation(struct intel_iommu *iommu)
1422 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423 iommu->gcmd &= ~DMA_GCMD_TE;
1424 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1426 /* Make sure hardware completes it */
1427 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1428 readl, (!(sts & DMA_GSTS_TES)), sts);
1430 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1434 static int iommu_init_domains(struct intel_iommu *iommu)
1436 unsigned long ndomains;
1437 unsigned long nlongs;
1439 ndomains = cap_ndoms(iommu->cap);
1440 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1441 iommu->seq_id, ndomains);
1442 nlongs = BITS_TO_LONGS(ndomains);
1444 spin_lock_init(&iommu->lock);
1446 /* TBD: there might be 64K domains,
1447 * consider other allocation for future chip
1449 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1450 if (!iommu->domain_ids) {
1451 pr_err("IOMMU%d: allocating domain id array failed\n",
1455 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1457 if (!iommu->domains) {
1458 pr_err("IOMMU%d: allocating domain array failed\n",
1460 kfree(iommu->domain_ids);
1461 iommu->domain_ids = NULL;
1466 * if Caching mode is set, then invalid translations are tagged
1467 * with domainid 0. Hence we need to pre-allocate it.
1469 if (cap_caching_mode(iommu->cap))
1470 set_bit(0, iommu->domain_ids);
1474 static void disable_dmar_iommu(struct intel_iommu *iommu)
1476 struct dmar_domain *domain;
1479 if ((iommu->domains) && (iommu->domain_ids)) {
1480 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1482 * Domain id 0 is reserved for invalid translation
1483 * if hardware supports caching mode.
1485 if (cap_caching_mode(iommu->cap) && i == 0)
1488 domain = iommu->domains[i];
1489 clear_bit(i, iommu->domain_ids);
1490 if (domain_detach_iommu(domain, iommu) == 0 &&
1491 !domain_type_is_vm(domain))
1492 domain_exit(domain);
1496 if (iommu->gcmd & DMA_GCMD_TE)
1497 iommu_disable_translation(iommu);
1500 static void free_dmar_iommu(struct intel_iommu *iommu)
1502 if ((iommu->domains) && (iommu->domain_ids)) {
1503 kfree(iommu->domains);
1504 kfree(iommu->domain_ids);
1505 iommu->domains = NULL;
1506 iommu->domain_ids = NULL;
1509 g_iommus[iommu->seq_id] = NULL;
1511 /* free context mapping */
1512 free_context_table(iommu);
1515 static struct dmar_domain *alloc_domain(int flags)
1517 /* domain id for virtual machine, it won't be set in context */
1518 static atomic_t vm_domid = ATOMIC_INIT(0);
1519 struct dmar_domain *domain;
1521 domain = alloc_domain_mem();
1525 memset(domain, 0, sizeof(*domain));
1527 domain->flags = flags;
1528 spin_lock_init(&domain->iommu_lock);
1529 INIT_LIST_HEAD(&domain->devices);
1530 if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1531 domain->id = atomic_inc_return(&vm_domid);
1536 static int __iommu_attach_domain(struct dmar_domain *domain,
1537 struct intel_iommu *iommu)
1540 unsigned long ndomains;
1542 ndomains = cap_ndoms(iommu->cap);
1543 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1544 if (num < ndomains) {
1545 set_bit(num, iommu->domain_ids);
1546 iommu->domains[num] = domain;
1554 static int iommu_attach_domain(struct dmar_domain *domain,
1555 struct intel_iommu *iommu)
1558 unsigned long flags;
1560 spin_lock_irqsave(&iommu->lock, flags);
1561 num = __iommu_attach_domain(domain, iommu);
1562 spin_unlock_irqrestore(&iommu->lock, flags);
1564 pr_err("IOMMU: no free domain ids\n");
1569 static int iommu_attach_vm_domain(struct dmar_domain *domain,
1570 struct intel_iommu *iommu)
1573 unsigned long ndomains;
1575 ndomains = cap_ndoms(iommu->cap);
1576 for_each_set_bit(num, iommu->domain_ids, ndomains)
1577 if (iommu->domains[num] == domain)
1580 return __iommu_attach_domain(domain, iommu);
1583 static void iommu_detach_domain(struct dmar_domain *domain,
1584 struct intel_iommu *iommu)
1586 unsigned long flags;
1589 spin_lock_irqsave(&iommu->lock, flags);
1590 if (domain_type_is_vm_or_si(domain)) {
1591 ndomains = cap_ndoms(iommu->cap);
1592 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1593 if (iommu->domains[num] == domain) {
1594 clear_bit(num, iommu->domain_ids);
1595 iommu->domains[num] = NULL;
1600 clear_bit(domain->id, iommu->domain_ids);
1601 iommu->domains[domain->id] = NULL;
1603 spin_unlock_irqrestore(&iommu->lock, flags);
1606 static void domain_attach_iommu(struct dmar_domain *domain,
1607 struct intel_iommu *iommu)
1609 unsigned long flags;
1611 spin_lock_irqsave(&domain->iommu_lock, flags);
1612 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1613 domain->iommu_count++;
1614 if (domain->iommu_count == 1)
1615 domain->nid = iommu->node;
1616 domain_update_iommu_cap(domain);
1618 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1621 static int domain_detach_iommu(struct dmar_domain *domain,
1622 struct intel_iommu *iommu)
1624 unsigned long flags;
1625 int count = INT_MAX;
1627 spin_lock_irqsave(&domain->iommu_lock, flags);
1628 if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1629 count = --domain->iommu_count;
1630 domain_update_iommu_cap(domain);
1632 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1637 static struct iova_domain reserved_iova_list;
1638 static struct lock_class_key reserved_rbtree_key;
1640 static int dmar_init_reserved_ranges(void)
1642 struct pci_dev *pdev = NULL;
1646 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1648 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1649 &reserved_rbtree_key);
1651 /* IOAPIC ranges shouldn't be accessed by DMA */
1652 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1653 IOVA_PFN(IOAPIC_RANGE_END));
1655 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1659 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1660 for_each_pci_dev(pdev) {
1663 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1664 r = &pdev->resource[i];
1665 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1667 iova = reserve_iova(&reserved_iova_list,
1671 printk(KERN_ERR "Reserve iova failed\n");
1679 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1681 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1684 static inline int guestwidth_to_adjustwidth(int gaw)
1687 int r = (gaw - 12) % 9;
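/*
 * Worked example (illustrative only): guest widths are rounded up to the
 * next level boundary (12 + N*9 bits). A 48-bit guest width is already
 * aligned ((48 - 12) % 9 == 0) and stays 48; a 40-bit request has r == 1
 * and is adjusted up to 48 bits, i.e. a 4-level page table.
 */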
1698 static int domain_init(struct dmar_domain *domain, int guest_width)
1700 struct intel_iommu *iommu;
1701 int adjust_width, agaw;
1702 unsigned long sagaw;
1704 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1705 domain_reserve_special_ranges(domain);
1707 /* calculate AGAW */
1708 iommu = domain_get_iommu(domain);
1709 if (guest_width > cap_mgaw(iommu->cap))
1710 guest_width = cap_mgaw(iommu->cap);
1711 domain->gaw = guest_width;
1712 adjust_width = guestwidth_to_adjustwidth(guest_width);
1713 agaw = width_to_agaw(adjust_width);
1714 sagaw = cap_sagaw(iommu->cap);
1715 if (!test_bit(agaw, &sagaw)) {
1716 /* hardware doesn't support it, choose a bigger one */
1717 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1718 agaw = find_next_bit(&sagaw, 5, agaw);
1722 domain->agaw = agaw;
1724 if (ecap_coherent(iommu->ecap))
1725 domain->iommu_coherency = 1;
1727 domain->iommu_coherency = 0;
1729 if (ecap_sc_support(iommu->ecap))
1730 domain->iommu_snooping = 1;
1732 domain->iommu_snooping = 0;
1734 if (intel_iommu_superpage)
1735 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1737 domain->iommu_superpage = 0;
1739 domain->nid = iommu->node;
1741 /* always allocate the top pgd */
1742 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1745 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1749 static void domain_exit(struct dmar_domain *domain)
1751 struct dmar_drhd_unit *drhd;
1752 struct intel_iommu *iommu;
1753 struct page *freelist = NULL;
1755 /* Domain 0 is reserved, so don't process it */
1759 /* Flush any lazy unmaps that may reference this domain */
1760 if (!intel_iommu_strict)
1761 flush_unmaps_timeout(0);
1763 /* remove associated devices */
1764 domain_remove_dev_info(domain);
1767 put_iova_domain(&domain->iovad);
1769 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1771 /* clear attached or cached domains */
1773 for_each_active_iommu(iommu, drhd)
1774 iommu_detach_domain(domain, iommu);
1777 dma_free_pagelist(freelist);
1779 free_domain_mem(domain);
1782 static int domain_context_mapping_one(struct dmar_domain *domain,
1783 struct intel_iommu *iommu,
1784 u8 bus, u8 devfn, int translation)
1786 struct context_entry *context;
1787 unsigned long flags;
1788 struct dma_pte *pgd;
1791 struct device_domain_info *info = NULL;
1793 pr_debug("Set context mapping for %02x:%02x.%d\n",
1794 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1796 BUG_ON(!domain->pgd);
1797 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1798 translation != CONTEXT_TT_MULTI_LEVEL);
1800 context = device_to_context_entry(iommu, bus, devfn);
1803 spin_lock_irqsave(&iommu->lock, flags);
1804 if (context_present(context)) {
1805 spin_unlock_irqrestore(&iommu->lock, flags);
1812 if (domain_type_is_vm_or_si(domain)) {
1813 if (domain_type_is_vm(domain)) {
1814 id = iommu_attach_vm_domain(domain, iommu);
1816 spin_unlock_irqrestore(&iommu->lock, flags);
1817 pr_err("IOMMU: no free domain ids\n");
1822 /* Skip top levels of page tables for
1823 * iommus which have a smaller agaw than the default.
1824 * Unnecessary for PT mode.
1826 if (translation != CONTEXT_TT_PASS_THROUGH) {
1827 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1828 pgd = phys_to_virt(dma_pte_addr(pgd));
1829 if (!dma_pte_present(pgd)) {
1830 spin_unlock_irqrestore(&iommu->lock, flags);
1837 context_set_domain_id(context, id);
1839 if (translation != CONTEXT_TT_PASS_THROUGH) {
1840 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1841 translation = info ? CONTEXT_TT_DEV_IOTLB :
1842 CONTEXT_TT_MULTI_LEVEL;
1845 * In pass through mode, AW must be programmed to indicate the largest
1846 * AGAW value supported by hardware. And ASR is ignored by hardware.
1848 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1849 context_set_address_width(context, iommu->msagaw);
1851 context_set_address_root(context, virt_to_phys(pgd));
1852 context_set_address_width(context, iommu->agaw);
1855 context_set_translation_type(context, translation);
1856 context_set_fault_enable(context);
1857 context_set_present(context);
1858 domain_flush_cache(domain, context, sizeof(*context));
1861 * It's a non-present to present mapping. If hardware doesn't cache
1862 * non-present entries, we only need to flush the write-buffer. If it
1863 * _does_ cache non-present entries, then it does so in the special
1864 * domain #0, which we have to flush:
1866 if (cap_caching_mode(iommu->cap)) {
1867 iommu->flush.flush_context(iommu, 0,
1868 (((u16)bus) << 8) | devfn,
1869 DMA_CCMD_MASK_NOBIT,
1870 DMA_CCMD_DEVICE_INVL);
1871 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
1873 iommu_flush_write_buffer(iommu);
1875 iommu_enable_dev_iotlb(info);
1876 spin_unlock_irqrestore(&iommu->lock, flags);
1878 domain_attach_iommu(domain, iommu);
1883 struct domain_context_mapping_data {
1884 struct dmar_domain *domain;
1885 struct intel_iommu *iommu;
1889 static int domain_context_mapping_cb(struct pci_dev *pdev,
1890 u16 alias, void *opaque)
1892 struct domain_context_mapping_data *data = opaque;
1894 return domain_context_mapping_one(data->domain, data->iommu,
1895 PCI_BUS_NUM(alias), alias & 0xff,
1900 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1903 struct intel_iommu *iommu;
1905 struct domain_context_mapping_data data;
1907 iommu = device_to_iommu(dev, &bus, &devfn);
1911 if (!dev_is_pci(dev))
1912 return domain_context_mapping_one(domain, iommu, bus, devfn,
1915 data.domain = domain;
1917 data.translation = translation;
1919 return pci_for_each_dma_alias(to_pci_dev(dev),
1920 &domain_context_mapping_cb, &data);
1923 static int domain_context_mapped_cb(struct pci_dev *pdev,
1924 u16 alias, void *opaque)
1926 struct intel_iommu *iommu = opaque;
1928 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
1931 static int domain_context_mapped(struct device *dev)
1933 struct intel_iommu *iommu;
1936 iommu = device_to_iommu(dev, &bus, &devfn);
1940 if (!dev_is_pci(dev))
1941 return device_context_mapped(iommu, bus, devfn);
1943 return !pci_for_each_dma_alias(to_pci_dev(dev),
1944 domain_context_mapped_cb, iommu);
1947 /* Returns a number of VTD pages, but aligned to MM page size */
1948 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1951 host_addr &= ~PAGE_MASK;
1952 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
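/*
 * Worked example (illustrative only): with 4KiB MM pages, a buffer at
 * host offset 0x800 with size 0x1000 straddles two pages, so
 * aligned_nrpages(0x800, 0x1000) == PAGE_ALIGN(0x1800) >> 12 == 2,
 * even though the size alone would fit in a single VT-d page.
 */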
1955 /* Return largest possible superpage level for a given mapping */
1956 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1957 unsigned long iov_pfn,
1958 unsigned long phy_pfn,
1959 unsigned long pages)
1961 int support, level = 1;
1962 unsigned long pfnmerge;
1964 support = domain->iommu_superpage;
1966 /* To use a large page, the virtual *and* physical addresses
1967 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1968 of them will mean we have to use smaller pages. So just
1969 merge them and check both at once. */
1970 pfnmerge = iov_pfn | phy_pfn;
1972 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1973 pages >>= VTD_STRIDE_SHIFT;
1976 pfnmerge >>= VTD_STRIDE_SHIFT;
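/*
 * Worked example (illustrative only): for iov_pfn 0x200, phy_pfn 0x400 and
 * 512 pages, pfnmerge == 0x600 has its low nine bits clear, so a level-2
 * (2MiB) superpage can be used, assuming the hardware advertises 2MiB
 * superpage support. If either pfn were 0x201 instead, the merged value
 * would have low bits set and only 4KiB pages would be used.
 */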
1983 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1984 struct scatterlist *sg, unsigned long phys_pfn,
1985 unsigned long nr_pages, int prot)
1987 struct dma_pte *first_pte = NULL, *pte = NULL;
1988 phys_addr_t uninitialized_var(pteval);
1989 unsigned long sg_res;
1990 unsigned int largepage_lvl = 0;
1991 unsigned long lvl_pages = 0;
1993 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
1995 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1998 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2003 sg_res = nr_pages + 1;
2004 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2007 while (nr_pages > 0) {
2011 sg_res = aligned_nrpages(sg->offset, sg->length);
2012 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2013 sg->dma_length = sg->length;
2014 pteval = page_to_phys(sg_page(sg)) | prot;
2015 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2019 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2021 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2024 /* It is a large page */
2025 if (largepage_lvl > 1) {
2026 pteval |= DMA_PTE_LARGE_PAGE;
2027 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2029 * Ensure that old small page tables are
2030 * removed to make room for superpage,
2033 dma_pte_free_pagetable(domain, iov_pfn,
2034 iov_pfn + lvl_pages - 1);
2036 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2040 /* We don't need lock here, nobody else
2041 * touches the iova range
2043 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2045 static int dumps = 5;
2046 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2047 iov_pfn, tmp, (unsigned long long)pteval);
2050 debug_dma_dump_mappings(NULL);
2055 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2057 BUG_ON(nr_pages < lvl_pages);
2058 BUG_ON(sg_res < lvl_pages);
2060 nr_pages -= lvl_pages;
2061 iov_pfn += lvl_pages;
2062 phys_pfn += lvl_pages;
2063 pteval += lvl_pages * VTD_PAGE_SIZE;
2064 sg_res -= lvl_pages;
2066 /* If the next PTE would be the first in a new page, then we
2067 need to flush the cache on the entries we've just written.
2068 And then we'll need to recalculate 'pte', so clear it and
2069 let it get set again in the if (!pte) block above.
2071 If we're done (!nr_pages) we need to flush the cache too.
2073 Also if we've been setting superpages, we may need to
2074 recalculate 'pte' and switch back to smaller pages for the
2075 end of the mapping, if the trailing size is not enough to
2076 use another superpage (i.e. sg_res < lvl_pages). */
2078 if (!nr_pages || first_pte_in_page(pte) ||
2079 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2080 domain_flush_cache(domain, first_pte,
2081 (void *)pte - (void *)first_pte);
2085 if (!sg_res && nr_pages)
2091 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2092 struct scatterlist *sg, unsigned long nr_pages,
2095 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2098 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2099 unsigned long phys_pfn, unsigned long nr_pages,
2102 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2105 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2110 clear_context_table(iommu, bus, devfn);
2111 iommu->flush.flush_context(iommu, 0, 0, 0,
2112 DMA_CCMD_GLOBAL_INVL);
2113 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2116 static inline void unlink_domain_info(struct device_domain_info *info)
2118 assert_spin_locked(&device_domain_lock);
2119 list_del(&info->link);
2120 list_del(&info->global);
2122 info->dev->archdata.iommu = NULL;
2125 static void domain_remove_dev_info(struct dmar_domain *domain)
2127 struct device_domain_info *info, *tmp;
2128 unsigned long flags;
2130 spin_lock_irqsave(&device_domain_lock, flags);
2131 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2132 unlink_domain_info(info);
2133 spin_unlock_irqrestore(&device_domain_lock, flags);
2135 iommu_disable_dev_iotlb(info);
2136 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2138 if (domain_type_is_vm(domain)) {
2139 iommu_detach_dependent_devices(info->iommu, info->dev);
2140 domain_detach_iommu(domain, info->iommu);
2143 free_devinfo_mem(info);
2144 spin_lock_irqsave(&device_domain_lock, flags);
2146 spin_unlock_irqrestore(&device_domain_lock, flags);
2151 * Note: we use struct device->archdata.iommu to store the info
2153 static struct dmar_domain *find_domain(struct device *dev)
2155 struct device_domain_info *info;
2157 /* No lock here, assumes no domain exit in normal case */
2158 info = dev->archdata.iommu;
2160 return info->domain;
2164 static inline struct device_domain_info *
2165 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2167 struct device_domain_info *info;
2169 list_for_each_entry(info, &device_domain_list, global)
2170 if (info->iommu->segment == segment && info->bus == bus &&
2171 info->devfn == devfn)
2177 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2180 struct dmar_domain *domain)
2182 struct dmar_domain *found = NULL;
2183 struct device_domain_info *info;
2184 unsigned long flags;
2186 info = alloc_devinfo_mem();
2191 info->devfn = devfn;
2193 info->domain = domain;
2194 info->iommu = iommu;
2196 spin_lock_irqsave(&device_domain_lock, flags);
2198 found = find_domain(dev);
2200 struct device_domain_info *info2;
2201 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2203 found = info2->domain;
2206 spin_unlock_irqrestore(&device_domain_lock, flags);
2207 free_devinfo_mem(info);
2208 /* Caller must free the original domain */
2212 list_add(&info->link, &domain->devices);
2213 list_add(&info->global, &device_domain_list);
2215 dev->archdata.iommu = info;
2216 spin_unlock_irqrestore(&device_domain_lock, flags);
2221 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2223 *(u16 *)opaque = alias;
2227 /* domain is initialized */
2228 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2230 struct dmar_domain *domain, *tmp;
2231 struct intel_iommu *iommu;
2232 struct device_domain_info *info;
2234 unsigned long flags;
2237 domain = find_domain(dev);
2241 iommu = device_to_iommu(dev, &bus, &devfn);
2245 if (dev_is_pci(dev)) {
2246 struct pci_dev *pdev = to_pci_dev(dev);
2248 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2250 spin_lock_irqsave(&device_domain_lock, flags);
2251 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2252 PCI_BUS_NUM(dma_alias),
2255 iommu = info->iommu;
2256 domain = info->domain;
2258 spin_unlock_irqrestore(&device_domain_lock, flags);
2260 /* DMA alias already has a domain, use it */
2265 /* Allocate and initialize new domain for the device */
2266 domain = alloc_domain(0);
2269 domain->id = iommu_attach_domain(domain, iommu);
2270 if (domain->id < 0) {
2271 free_domain_mem(domain);
2274 domain_attach_iommu(domain, iommu);
2275 if (domain_init(domain, gaw)) {
2276 domain_exit(domain);
2280 /* register PCI DMA alias device */
2281 if (dev_is_pci(dev)) {
2282 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2283 dma_alias & 0xff, NULL, domain);
2285 if (!tmp || tmp != domain) {
2286 domain_exit(domain);
2295 tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2297 if (!tmp || tmp != domain) {
2298 domain_exit(domain);
2305 static int iommu_identity_mapping;
2306 #define IDENTMAP_ALL 1
2307 #define IDENTMAP_GFX 2
2308 #define IDENTMAP_AZALIA 4
2310 static int iommu_domain_identity_map(struct dmar_domain *domain,
2311 unsigned long long start,
2312 unsigned long long end)
2314 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2315 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2317 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2318 dma_to_mm_pfn(last_vpfn))) {
2319 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2323 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2324 start, end, domain->id);
2326 /* RMRR range might have overlap with physical memory range, clear it first. */
2329 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2331 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2332 last_vpfn - first_vpfn + 1,
2333 DMA_PTE_READ|DMA_PTE_WRITE);
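/*
 * Worked example (illustrative only): for the 0-16MiB ISA mapping set up
 * by iommu_prepare_isa() below, start == 0 and end == 16MiB - 1, so
 * first_vpfn == 0, last_vpfn == 0xfff and 4096 identity (vpfn == pfn)
 * page-sized PTEs are installed.
 */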
2336 static int iommu_prepare_identity_map(struct device *dev,
2337 unsigned long long start,
2338 unsigned long long end)
2340 struct dmar_domain *domain;
2343 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2347 /* For _hardware_ passthrough, don't bother. But for software
2348 passthrough, we do it anyway -- it may indicate a memory
2349 range which is reserved in E820 and so didn't get set
2350 up to start with in si_domain */
2351 if (domain == si_domain && hw_pass_through) {
2352 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2353 dev_name(dev), start, end);
2358 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2359 dev_name(dev), start, end);
2362 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2363 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2364 dmi_get_system_info(DMI_BIOS_VENDOR),
2365 dmi_get_system_info(DMI_BIOS_VERSION),
2366 dmi_get_system_info(DMI_PRODUCT_VERSION));
2371 if (end >> agaw_to_width(domain->agaw)) {
2372 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2373 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2374 agaw_to_width(domain->agaw),
2375 dmi_get_system_info(DMI_BIOS_VENDOR),
2376 dmi_get_system_info(DMI_BIOS_VERSION),
2377 dmi_get_system_info(DMI_PRODUCT_VERSION));
2382 ret = iommu_domain_identity_map(domain, start, end);
2386 /* context entry init */
2387 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2394 domain_exit(domain);
2398 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2401 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2403 return iommu_prepare_identity_map(dev, rmrr->base_address,
2407 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2408 static inline void iommu_prepare_isa(void)
2410 struct pci_dev *pdev;
2413 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2417 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2418 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2421 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2422 "floppy might not work\n");
2427 static inline void iommu_prepare_isa(void)
2431 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2433 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2435 static int __init si_domain_init(int hw)
2437 struct dmar_drhd_unit *drhd;
2438 struct intel_iommu *iommu;
2442 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2446 for_each_active_iommu(iommu, drhd) {
2447 ret = iommu_attach_domain(si_domain, iommu);
2449 domain_exit(si_domain);
2452 si_domain->id = ret;
2454 } else if (si_domain->id != ret) {
2455 domain_exit(si_domain);
2458 domain_attach_iommu(si_domain, iommu);
2461 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2462 domain_exit(si_domain);
2466 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2472 for_each_online_node(nid) {
2473 unsigned long start_pfn, end_pfn;
2476 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2477 ret = iommu_domain_identity_map(si_domain,
2478 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2487 static int identity_mapping(struct device *dev)
2489 struct device_domain_info *info;
2491 if (likely(!iommu_identity_mapping))
2494 info = dev->archdata.iommu;
2495 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2496 return (info->domain == si_domain);
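/*
 * Attach @dev to @domain: record the device_domain_info and program the
 * context entry with the requested translation type.  The association is
 * torn down again if context mapping fails.
 */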
2501 static int domain_add_dev_info(struct dmar_domain *domain,
2502 struct device *dev, int translation)
2504 struct dmar_domain *ndomain;
2505 struct intel_iommu *iommu;
2509 iommu = device_to_iommu(dev, &bus, &devfn);
2513 ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2514 if (ndomain != domain)
2517 ret = domain_context_mapping(domain, dev, translation);
2519 domain_remove_one_dev_info(domain, dev);
2526 static bool device_has_rmrr(struct device *dev)
2528 struct dmar_rmrr_unit *rmrr;
2533 for_each_rmrr_units(rmrr) {
2535 * Return TRUE if this RMRR contains the device that
2538 for_each_active_dev_scope(rmrr->devices,
2539 rmrr->devices_cnt, i, tmp)
2550 * There are a couple cases where we need to restrict the functionality of
2551 * devices associated with RMRRs. The first is when evaluating a device for
2552 * identity mapping because problems exist when devices are moved in and out
2553 * of domains and their respective RMRR information is lost. This means that
2554 * a device with associated RMRRs will never be in a "passthrough" domain.
2555 * The second is use of the device through the IOMMU API. This interface
2556 * expects to have full control of the IOVA space for the device. We cannot
2557 * satisfy both the requirement that RMRR access is maintained and have an
2558 * unencumbered IOVA space. We also have no ability to quiesce the device's
2559 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2560 * We therefore prevent devices associated with an RMRR from participating in
2561 * the IOMMU API, which eliminates them from device assignment.
2563 * In both cases we assume that PCI USB devices with RMRRs have them largely
2564 * for historical reasons and that the RMRR space is not actively used post
2565 * boot. This exclusion may change if vendors begin to abuse it.
2567 static bool device_is_rmrr_locked(struct device *dev)
2569 if (!device_has_rmrr(dev))
2572 if (dev_is_pci(dev)) {
2573 struct pci_dev *pdev = to_pci_dev(dev);
2575 if ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
2582 static int iommu_should_identity_map(struct device *dev, int startup)
2585 if (dev_is_pci(dev)) {
2586 struct pci_dev *pdev = to_pci_dev(dev);
2588 if (device_is_rmrr_locked(dev))
2591 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2594 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2597 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2601 * We want to start off with all devices in the 1:1 domain, and
2602 * take them out later if we find they can't access all of memory.
2604 * However, we can't do this for PCI devices behind bridges,
2605 * because all PCI devices behind the same bridge will end up
2606 * with the same source-id on their transactions.
2608 * Practically speaking, we can't change things around for these
2609 * devices at run-time, because we can't be sure there'll be no
2610 * DMA transactions in flight for any of their siblings.
2612 * So PCI devices (unless they're on the root bus) as well as
2613 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2614 * the 1:1 domain, just in _case_ one of their siblings turns out
2615 * not to be able to map all of memory.
2617 if (!pci_is_pcie(pdev)) {
2618 if (!pci_is_root_bus(pdev->bus))
2620 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2622 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2625 if (device_has_rmrr(dev))
2630 * At boot time, we don't yet know if devices will be 64-bit capable.
2631 * Assume that they will -- if they turn out not to be, then we can
2632 * take them out of the 1:1 domain later.
2636 * If the device's dma_mask is less than the system's memory
2637 * size then this is not a candidate for identity mapping.
2639 u64 dma_mask = *dev->dma_mask;
2641 if (dev->coherent_dma_mask &&
2642 dev->coherent_dma_mask < dma_mask)
2643 dma_mask = dev->coherent_dma_mask;
2645 return dma_mask >= dma_get_required_mask(dev);
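/*
 * Add the device to si_domain, using hardware pass-through context
 * entries when @hw is set and multi-level translation otherwise.
 */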
2651 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2655 if (!iommu_should_identity_map(dev, 1))
2658 ret = domain_add_dev_info(si_domain, dev,
2659 hw ? CONTEXT_TT_PASS_THROUGH :
2660 CONTEXT_TT_MULTI_LEVEL);
2662 pr_info("IOMMU: %s identity mapping for device %s\n",
2663 hw ? "hardware" : "software", dev_name(dev));
2664 else if (ret == -ENODEV)
2665 /* device not associated with an iommu */
2672 static int __init iommu_prepare_static_identity_mapping(int hw)
2674 struct pci_dev *pdev = NULL;
2675 struct dmar_drhd_unit *drhd;
2676 struct intel_iommu *iommu;
2681 ret = si_domain_init(hw);
2685 for_each_pci_dev(pdev) {
2686 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2691 for_each_active_iommu(iommu, drhd)
2692 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2693 struct acpi_device_physical_node *pn;
2694 struct acpi_device *adev;
2696 if (dev->bus != &acpi_bus_type)
2699 adev = to_acpi_device(dev);
2700 mutex_lock(&adev->physical_node_lock);
2701 list_for_each_entry(pn, &adev->physical_node_list, node) {
2702 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2706 mutex_unlock(&adev->physical_node_lock);
2714 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2717 * Start from a sane IOMMU hardware state.
2718 * If queued invalidation was already initialized by us
2719 * (for example, while enabling interrupt remapping), then
2720 * things are already rolling from a sane state.
2724 * Clear any previous faults.
2726 dmar_fault(-1, iommu);
2728 * Disable queued invalidation if supported and already enabled
2729 * before OS handover.
2731 dmar_disable_qi(iommu);
2734 if (dmar_enable_qi(iommu)) {
2736 * Queued invalidation is not enabled; use register-based invalidation
2738 iommu->flush.flush_context = __iommu_flush_context;
2739 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2740 pr_info("IOMMU: %s using Register based invalidation\n",
2743 iommu->flush.flush_context = qi_flush_context;
2744 iommu->flush.flush_iotlb = qi_flush_iotlb;
2745 pr_info("IOMMU: %s using Queued invalidation\n", iommu->name);
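/*
 * Boot-time setup: allocate the global IOMMU and deferred-flush arrays,
 * initialize domains and root entries for each unit, establish the
 * identity / RMRR / ISA mappings, and finally enable translation on
 * every active IOMMU.
 */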
2749 static int __init init_dmars(void)
2751 struct dmar_drhd_unit *drhd;
2752 struct dmar_rmrr_unit *rmrr;
2754 struct intel_iommu *iommu;
2760 * initialize and program root entry to not present
2763 for_each_drhd_unit(drhd) {
2765 * No lock needed: this is only incremented in the single-threaded
2766 * kernel __init code path; all other accesses are reads
2769 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2773 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2774 DMAR_UNITS_SUPPORTED);
2777 /* Preallocate enough resources for IOMMU hot-addition */
2778 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2779 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2781 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2784 printk(KERN_ERR "Allocating global iommu array failed\n");
2789 deferred_flush = kzalloc(g_num_of_iommus *
2790 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2791 if (!deferred_flush) {
2796 for_each_active_iommu(iommu, drhd) {
2797 g_iommus[iommu->seq_id] = iommu;
2799 ret = iommu_init_domains(iommu);
2805 * we could share the same root & context tables
2806 * among all IOMMUs. Needs to be split later.
2808 ret = iommu_alloc_root_entry(iommu);
2811 if (!ecap_pass_through(iommu->ecap))
2812 hw_pass_through = 0;
2815 for_each_active_iommu(iommu, drhd)
2816 intel_iommu_init_qi(iommu);
2818 if (iommu_pass_through)
2819 iommu_identity_mapping |= IDENTMAP_ALL;
2821 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2822 iommu_identity_mapping |= IDENTMAP_GFX;
2825 check_tylersburg_isoch();
2828 * If pass through is not set or not enabled, set up context entries for
2829 * identity mappings for rmrr, gfx, and isa, possibly falling back to static
2830 * identity mapping if iommu_identity_mapping is set.
2832 if (iommu_identity_mapping) {
2833 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2835 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2841 * for each dev attached to rmrr
2843 * locate drhd for dev, alloc domain for dev
2844 * allocate free domain
2845 * allocate page table entries for rmrr
2846 * if context not allocated for bus
2847 * allocate and init context
2848 * set present in root table for this bus
2849 * init context with domain, translation etc
2853 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2854 for_each_rmrr_units(rmrr) {
2855 /* some BIOSes list non-existent devices in the DMAR table. */
2856 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2858 ret = iommu_prepare_rmrr_dev(rmrr, dev);
2861 "IOMMU: mapping reserved region failed\n");
2865 iommu_prepare_isa();
2870 * global invalidate context cache
2871 * global invalidate iotlb
2872 * enable translation
2874 for_each_iommu(iommu, drhd) {
2875 if (drhd->ignored) {
2877 * we always have to disable PMRs or DMA may fail on
2881 iommu_disable_protect_mem_regions(iommu);
2885 iommu_flush_write_buffer(iommu);
2887 ret = dmar_set_interrupt(iommu);
2891 iommu_set_root_entry(iommu);
2893 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2894 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2895 iommu_enable_translation(iommu);
2896 iommu_disable_protect_mem_regions(iommu);
2902 for_each_active_iommu(iommu, drhd) {
2903 disable_dmar_iommu(iommu);
2904 free_dmar_iommu(iommu);
2906 kfree(deferred_flush);
2913 /* This takes a number of _MM_ pages, not VTD pages */
2914 static struct iova *intel_alloc_iova(struct device *dev,
2915 struct dmar_domain *domain,
2916 unsigned long nrpages, uint64_t dma_mask)
2918 struct iova *iova = NULL;
2920 /* Restrict dma_mask to the width that the iommu can handle */
2921 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2923 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2925 * First try to allocate an io virtual address in
2926 * DMA_BIT_MASK(32) and if that fails then try allocating
2929 iova = alloc_iova(&domain->iovad, nrpages,
2930 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2934 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2935 if (unlikely(!iova)) {
2936 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2937 nrpages, dev_name(dev));
2944 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2946 struct dmar_domain *domain;
2949 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2951 printk(KERN_ERR "Allocating domain for %s failed",
2956 /* make sure context mapping is ok */
2957 if (unlikely(!domain_context_mapped(dev))) {
2958 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2960 printk(KERN_ERR "Domain context map for %s failed",
2969 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2971 struct device_domain_info *info;
2973 /* No lock here, assumes no domain exit in normal case */
2974 info = dev->archdata.iommu;
2976 return info->domain;
2978 return __get_valid_domain_for_dev(dev);
2981 static int iommu_dummy(struct device *dev)
2983 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2986 /* Check if the device needs to go through the non-identity map/unmap process. */
2987 static int iommu_no_mapping(struct device *dev)
2991 if (iommu_dummy(dev))
2994 if (!iommu_identity_mapping)
2997 found = identity_mapping(dev);
2999 if (iommu_should_identity_map(dev, 0))
3003 * The 32-bit DMA device is removed from si_domain and falls back
3004 * to non-identity mapping.
3006 domain_remove_one_dev_info(si_domain, dev);
3007 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
3013 * If a 64-bit DMA device was detached from a VM, the device
3014 * is put back into si_domain for identity mapping.
3016 if (iommu_should_identity_map(dev, 0)) {
3018 ret = domain_add_dev_info(si_domain, dev,
3020 CONTEXT_TT_PASS_THROUGH :
3021 CONTEXT_TT_MULTI_LEVEL);
3023 printk(KERN_INFO "64bit %s uses identity mapping\n",
3033 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3034 size_t size, int dir, u64 dma_mask)
3036 struct dmar_domain *domain;
3037 phys_addr_t start_paddr;
3041 struct intel_iommu *iommu;
3042 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3044 BUG_ON(dir == DMA_NONE);
3046 if (iommu_no_mapping(dev))
3049 domain = get_valid_domain_for_dev(dev);
3053 iommu = domain_get_iommu(domain);
3054 size = aligned_nrpages(paddr, size);
3056 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3061 * Check if DMAR supports zero-length reads on write only
3064 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3065 !cap_zlr(iommu->cap))
3066 prot |= DMA_PTE_READ;
3067 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3068 prot |= DMA_PTE_WRITE;
3070 * paddr - (paddr + size) might span a partial page; map the whole
3071 * page. Note: if two parts of one page are mapped separately, we
3072 * might have two guest_addr mappings to the same host paddr, but this
3073 * is not a big problem
3075 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3076 mm_to_dma_pfn(paddr_pfn), size, prot);
3080 /* it's a non-present to present mapping. Only flush if caching mode */
3081 if (cap_caching_mode(iommu->cap))
3082 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3084 iommu_flush_write_buffer(iommu);
3086 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3087 start_paddr += paddr & ~PAGE_MASK;
3092 __free_iova(&domain->iovad, iova);
3093 printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3094 dev_name(dev), size, (unsigned long long)paddr, dir);
3098 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3099 unsigned long offset, size_t size,
3100 enum dma_data_direction dir,
3101 struct dma_attrs *attrs)
3103 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3104 dir, *dev->dma_mask);
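/*
 * Deferred unmapping: freed IOVAs are queued per IOMMU in deferred_flush[]
 * and released in batches by flush_unmaps(), either when a queue reaches
 * HIGH_WATER_MARK or when the 10ms unmap_timer fires, so that one IOTLB
 * flush can cover many unmaps.
 */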
3107 static void flush_unmaps(void)
3113 /* just flush them all */
3114 for (i = 0; i < g_num_of_iommus; i++) {
3115 struct intel_iommu *iommu = g_iommus[i];
3119 if (!deferred_flush[i].next)
3122 /* In caching mode, global flushes turn emulation expensive */
3123 if (!cap_caching_mode(iommu->cap))
3124 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3125 DMA_TLB_GLOBAL_FLUSH);
3126 for (j = 0; j < deferred_flush[i].next; j++) {
3128 struct iova *iova = deferred_flush[i].iova[j];
3129 struct dmar_domain *domain = deferred_flush[i].domain[j];
3131 /* On real hardware multiple invalidations are expensive */
3132 if (cap_caching_mode(iommu->cap))
3133 iommu_flush_iotlb_psi(iommu, domain->id,
3134 iova->pfn_lo, iova_size(iova),
3135 !deferred_flush[i].freelist[j], 0);
3137 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3138 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3139 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3141 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3142 if (deferred_flush[i].freelist[j])
3143 dma_free_pagelist(deferred_flush[i].freelist[j]);
3145 deferred_flush[i].next = 0;
3151 static void flush_unmaps_timeout(unsigned long data)
3153 unsigned long flags;
3155 spin_lock_irqsave(&async_umap_flush_lock, flags);
3157 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3160 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3162 unsigned long flags;
3164 struct intel_iommu *iommu;
3166 spin_lock_irqsave(&async_umap_flush_lock, flags);
3167 if (list_size == HIGH_WATER_MARK)
3170 iommu = domain_get_iommu(dom);
3171 iommu_id = iommu->seq_id;
3173 next = deferred_flush[iommu_id].next;
3174 deferred_flush[iommu_id].domain[next] = dom;
3175 deferred_flush[iommu_id].iova[next] = iova;
3176 deferred_flush[iommu_id].freelist[next] = freelist;
3177 deferred_flush[iommu_id].next++;
3180 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3184 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3187 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3189 struct dmar_domain *domain;
3190 unsigned long start_pfn, last_pfn;
3192 struct intel_iommu *iommu;
3193 struct page *freelist;
3195 if (iommu_no_mapping(dev))
3198 domain = find_domain(dev);
3201 iommu = domain_get_iommu(domain);
3203 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3204 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3205 (unsigned long long)dev_addr))
3208 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3209 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3211 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3212 dev_name(dev), start_pfn, last_pfn);
3214 freelist = domain_unmap(domain, start_pfn, last_pfn);
3216 if (intel_iommu_strict) {
3217 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3218 last_pfn - start_pfn + 1, !freelist, 0);
3220 __free_iova(&domain->iovad, iova);
3221 dma_free_pagelist(freelist);
3223 add_unmap(domain, iova, freelist);
3225 * queue up the release of the unmap to save roughly 1/6 of the
3226 * CPU time used up by the IOTLB flush operation...
3231 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3232 size_t size, enum dma_data_direction dir,
3233 struct dma_attrs *attrs)
3235 intel_unmap(dev, dev_addr);
3238 static void *intel_alloc_coherent(struct device *dev, size_t size,
3239 dma_addr_t *dma_handle, gfp_t flags,
3240 struct dma_attrs *attrs)
3242 struct page *page = NULL;
3245 size = PAGE_ALIGN(size);
3246 order = get_order(size);
3248 if (!iommu_no_mapping(dev))
3249 flags &= ~(GFP_DMA | GFP_DMA32);
3250 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3251 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3257 if (flags & __GFP_WAIT) {
3258 unsigned int count = size >> PAGE_SHIFT;
3260 page = dma_alloc_from_contiguous(dev, count, order);
3261 if (page && iommu_no_mapping(dev) &&
3262 page_to_phys(page) + size > dev->coherent_dma_mask) {
3263 dma_release_from_contiguous(dev, page, count);
3269 page = alloc_pages(flags, order);
3272 memset(page_address(page), 0, size);
3274 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3276 dev->coherent_dma_mask);
3278 return page_address(page);
3279 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3280 __free_pages(page, order);
3285 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3286 dma_addr_t dma_handle, struct dma_attrs *attrs)
3289 struct page *page = virt_to_page(vaddr);
3291 size = PAGE_ALIGN(size);
3292 order = get_order(size);
3294 intel_unmap(dev, dma_handle);
3295 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3296 __free_pages(page, order);
3299 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3300 int nelems, enum dma_data_direction dir,
3301 struct dma_attrs *attrs)
3303 intel_unmap(dev, sglist[0].dma_address);
3306 static int intel_nontranslate_map_sg(struct device *hddev,
3307 struct scatterlist *sglist, int nelems, int dir)
3310 struct scatterlist *sg;
3312 for_each_sg(sglist, sg, nelems, i) {
3313 BUG_ON(!sg_page(sg));
3314 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3315 sg->dma_length = sg->length;
3320 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3321 enum dma_data_direction dir, struct dma_attrs *attrs)
3324 struct dmar_domain *domain;
3327 struct iova *iova = NULL;
3329 struct scatterlist *sg;
3330 unsigned long start_vpfn;
3331 struct intel_iommu *iommu;
3333 BUG_ON(dir == DMA_NONE);
3334 if (iommu_no_mapping(dev))
3335 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3337 domain = get_valid_domain_for_dev(dev);
3341 iommu = domain_get_iommu(domain);
3343 for_each_sg(sglist, sg, nelems, i)
3344 size += aligned_nrpages(sg->offset, sg->length);
3346 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3349 sglist->dma_length = 0;
3354 * Check if DMAR supports zero-length reads on write only
3357 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3358 !cap_zlr(iommu->cap))
3359 prot |= DMA_PTE_READ;
3360 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3361 prot |= DMA_PTE_WRITE;
3363 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3365 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3366 if (unlikely(ret)) {
3367 dma_pte_free_pagetable(domain, start_vpfn,
3368 start_vpfn + size - 1);
3369 __free_iova(&domain->iovad, iova);
3373 /* it's a non-present to present mapping. Only flush if caching mode */
3374 if (cap_caching_mode(iommu->cap))
3375 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3377 iommu_flush_write_buffer(iommu);
3382 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3387 struct dma_map_ops intel_dma_ops = {
3388 .alloc = intel_alloc_coherent,
3389 .free = intel_free_coherent,
3390 .map_sg = intel_map_sg,
3391 .unmap_sg = intel_unmap_sg,
3392 .map_page = intel_map_page,
3393 .unmap_page = intel_unmap_page,
3394 .mapping_error = intel_mapping_error,
3397 static inline int iommu_domain_cache_init(void)
3401 iommu_domain_cache = kmem_cache_create("iommu_domain",
3402 sizeof(struct dmar_domain),
3407 if (!iommu_domain_cache) {
3408 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3415 static inline int iommu_devinfo_cache_init(void)
3419 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3420 sizeof(struct device_domain_info),
3424 if (!iommu_devinfo_cache) {
3425 printk(KERN_ERR "Couldn't create devinfo cache\n");
3432 static inline int iommu_iova_cache_init(void)
3436 iommu_iova_cache = kmem_cache_create("iommu_iova",
3437 sizeof(struct iova),
3441 if (!iommu_iova_cache) {
3442 printk(KERN_ERR "Couldn't create iova cache\n");
3449 static int __init iommu_init_mempool(void)
3452 ret = iommu_iova_cache_init();
3456 ret = iommu_domain_cache_init();
3460 ret = iommu_devinfo_cache_init();
3464 kmem_cache_destroy(iommu_domain_cache);
3466 kmem_cache_destroy(iommu_iova_cache);
3471 static void __init iommu_exit_mempool(void)
3473 kmem_cache_destroy(iommu_devinfo_cache);
3474 kmem_cache_destroy(iommu_domain_cache);
3475 kmem_cache_destroy(iommu_iova_cache);
3479 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3481 struct dmar_drhd_unit *drhd;
3485 /* We know that this device on this chipset has its own IOMMU.
3486 * If we find it under a different IOMMU, then the BIOS is lying
3487 * to us. Hope that the IOMMU for this device is actually
3488 * disabled, and it needs no translation...
3490 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3492 /* "can't" happen */
3493 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3496 vtbar &= 0xffff0000;
3498 /* we know that this iommu should be at offset 0xa000 from vtbar */
3499 drhd = dmar_find_matched_drhd_unit(pdev);
3500 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3501 TAINT_FIRMWARE_WORKAROUND,
3502 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3503 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3505 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
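/*
 * Mark devices that need no translation: DMAR units with an empty device
 * scope are ignored, and units covering only graphics devices either set
 * intel_iommu_gfx_mapped or have their devices flagged as dummy, depending
 * on whether graphics mapping is enabled.
 */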
3507 static void __init init_no_remapping_devices(void)
3509 struct dmar_drhd_unit *drhd;
3513 for_each_drhd_unit(drhd) {
3514 if (!drhd->include_all) {
3515 for_each_active_dev_scope(drhd->devices,
3516 drhd->devices_cnt, i, dev)
3518 /* ignore DMAR unit if no devices exist */
3519 if (i == drhd->devices_cnt)
3524 for_each_active_drhd_unit(drhd) {
3525 if (drhd->include_all)
3528 for_each_active_dev_scope(drhd->devices,
3529 drhd->devices_cnt, i, dev)
3530 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3532 if (i < drhd->devices_cnt)
3535 /* This IOMMU has *only* gfx devices. Either bypass it or
3536 set the gfx_mapped flag, as appropriate */
3538 intel_iommu_gfx_mapped = 1;
3541 for_each_active_dev_scope(drhd->devices,
3542 drhd->devices_cnt, i, dev)
3543 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3548 #ifdef CONFIG_SUSPEND
3549 static int init_iommu_hw(void)
3551 struct dmar_drhd_unit *drhd;
3552 struct intel_iommu *iommu = NULL;
3554 for_each_active_iommu(iommu, drhd)
3556 dmar_reenable_qi(iommu);
3558 for_each_iommu(iommu, drhd) {
3559 if (drhd->ignored) {
3561 * we always have to disable PMRs or DMA may fail on
3565 iommu_disable_protect_mem_regions(iommu);
3569 iommu_flush_write_buffer(iommu);
3571 iommu_set_root_entry(iommu);
3573 iommu->flush.flush_context(iommu, 0, 0, 0,
3574 DMA_CCMD_GLOBAL_INVL);
3575 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3576 iommu_enable_translation(iommu);
3577 iommu_disable_protect_mem_regions(iommu);
3583 static void iommu_flush_all(void)
3585 struct dmar_drhd_unit *drhd;
3586 struct intel_iommu *iommu;
3588 for_each_active_iommu(iommu, drhd) {
3589 iommu->flush.flush_context(iommu, 0, 0, 0,
3590 DMA_CCMD_GLOBAL_INVL);
3591 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3592 DMA_TLB_GLOBAL_FLUSH);
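/*
 * Suspend path: disable translation and save the fault-event registers
 * (FECTL, FEDATA, FEADDR, FEUADDR) of each active IOMMU; the resume path
 * re-initializes the hardware and writes them back.
 */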
3596 static int iommu_suspend(void)
3598 struct dmar_drhd_unit *drhd;
3599 struct intel_iommu *iommu = NULL;
3602 for_each_active_iommu(iommu, drhd) {
3603 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3605 if (!iommu->iommu_state)
3611 for_each_active_iommu(iommu, drhd) {
3612 iommu_disable_translation(iommu);
3614 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3616 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3617 readl(iommu->reg + DMAR_FECTL_REG);
3618 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3619 readl(iommu->reg + DMAR_FEDATA_REG);
3620 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3621 readl(iommu->reg + DMAR_FEADDR_REG);
3622 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3623 readl(iommu->reg + DMAR_FEUADDR_REG);
3625 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3630 for_each_active_iommu(iommu, drhd)
3631 kfree(iommu->iommu_state);
3636 static void iommu_resume(void)
3638 struct dmar_drhd_unit *drhd;
3639 struct intel_iommu *iommu = NULL;
3642 if (init_iommu_hw()) {
3644 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3646 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3650 for_each_active_iommu(iommu, drhd) {
3652 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3654 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3655 iommu->reg + DMAR_FECTL_REG);
3656 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3657 iommu->reg + DMAR_FEDATA_REG);
3658 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3659 iommu->reg + DMAR_FEADDR_REG);
3660 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3661 iommu->reg + DMAR_FEUADDR_REG);
3663 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3666 for_each_active_iommu(iommu, drhd)
3667 kfree(iommu->iommu_state);
3670 static struct syscore_ops iommu_syscore_ops = {
3671 .resume = iommu_resume,
3672 .suspend = iommu_suspend,
3675 static void __init init_iommu_pm_ops(void)
3677 register_syscore_ops(&iommu_syscore_ops);
3681 static inline void init_iommu_pm_ops(void) {}
3682 #endif /* CONFIG_PM */
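/*
 * Parse an ACPI RMRR (Reserved Memory Region Reporting) structure into a
 * dmar_rmrr_unit, recording its address range and device scope, and add
 * it to the dmar_rmrr_units list.
 */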
3685 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3687 struct acpi_dmar_reserved_memory *rmrr;
3688 struct dmar_rmrr_unit *rmrru;
3690 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3694 rmrru->hdr = header;
3695 rmrr = (struct acpi_dmar_reserved_memory *)header;
3696 rmrru->base_address = rmrr->base_address;
3697 rmrru->end_address = rmrr->end_address;
3698 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3699 ((void *)rmrr) + rmrr->header.length,
3700 &rmrru->devices_cnt);
3701 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3706 list_add(&rmrru->list, &dmar_rmrr_units);
3711 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3713 struct dmar_atsr_unit *atsru;
3714 struct acpi_dmar_atsr *tmp;
3716 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3717 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3718 if (atsr->segment != tmp->segment)
3720 if (atsr->header.length != tmp->header.length)
3722 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3729 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3731 struct acpi_dmar_atsr *atsr;
3732 struct dmar_atsr_unit *atsru;
3734 if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3737 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3738 atsru = dmar_find_atsr(atsr);
3742 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3747 * If memory is allocated from slab by ACPI _DSM method, we need to
3748 * copy the memory content because the memory buffer will be freed
3751 atsru->hdr = (void *)(atsru + 1);
3752 memcpy(atsru->hdr, hdr, hdr->length);
3753 atsru->include_all = atsr->flags & 0x1;
3754 if (!atsru->include_all) {
3755 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3756 (void *)atsr + atsr->header.length,
3757 &atsru->devices_cnt);
3758 if (atsru->devices_cnt && atsru->devices == NULL) {
3764 list_add_rcu(&atsru->list, &dmar_atsr_units);
3769 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3771 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3775 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3777 struct acpi_dmar_atsr *atsr;
3778 struct dmar_atsr_unit *atsru;
3780 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3781 atsru = dmar_find_atsr(atsr);
3783 list_del_rcu(&atsru->list);
3785 intel_iommu_free_atsr(atsru);
3791 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3795 struct acpi_dmar_atsr *atsr;
3796 struct dmar_atsr_unit *atsru;
3798 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3799 atsru = dmar_find_atsr(atsr);
3803 if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
3804 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3811 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3814 struct intel_iommu *iommu = dmaru->iommu;
3816 if (g_iommus[iommu->seq_id])
3819 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3820 pr_warn("IOMMU: %s doesn't support hardware pass through.\n",
3824 if (!ecap_sc_support(iommu->ecap) &&
3825 domain_update_iommu_snooping(iommu)) {
3826 pr_warn("IOMMU: %s doesn't support snooping.\n",
3830 sp = domain_update_iommu_superpage(iommu) - 1;
3831 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3832 pr_warn("IOMMU: %s doesn't support large page.\n",
3838 * Disable translation if already enabled prior to OS handover.
3840 if (iommu->gcmd & DMA_GCMD_TE)
3841 iommu_disable_translation(iommu);
3843 g_iommus[iommu->seq_id] = iommu;
3844 ret = iommu_init_domains(iommu);
3846 ret = iommu_alloc_root_entry(iommu);
3850 if (dmaru->ignored) {
3852 * we always have to disable PMRs or DMA may fail on this device
3855 iommu_disable_protect_mem_regions(iommu);
3859 intel_iommu_init_qi(iommu);
3860 iommu_flush_write_buffer(iommu);
3861 ret = dmar_set_interrupt(iommu);
3865 iommu_set_root_entry(iommu);
3866 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3867 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3868 iommu_enable_translation(iommu);
3871 ret = iommu_attach_domain(si_domain, iommu);
3872 if (ret < 0 || si_domain->id != ret)
3874 domain_attach_iommu(si_domain, iommu);
3877 iommu_disable_protect_mem_regions(iommu);
3881 disable_dmar_iommu(iommu);
3883 free_dmar_iommu(iommu);
3887 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3890 struct intel_iommu *iommu = dmaru->iommu;
3892 if (!intel_iommu_enabled)
3898 ret = intel_iommu_add(dmaru);
3900 disable_dmar_iommu(iommu);
3901 free_dmar_iommu(iommu);
3907 static void intel_iommu_free_dmars(void)
3909 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3910 struct dmar_atsr_unit *atsru, *atsr_n;
3912 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3913 list_del(&rmrru->list);
3914 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3918 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3919 list_del(&atsru->list);
3920 intel_iommu_free_atsr(atsru);
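/*
 * Walk up from @dev to its PCIe root port and report whether any ATSR
 * unit (or an include_all ATSR for the segment) covers that bridge,
 * i.e. whether Address Translation Services may be used for the device.
 */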
3924 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3927 struct pci_bus *bus;
3928 struct pci_dev *bridge = NULL;
3930 struct acpi_dmar_atsr *atsr;
3931 struct dmar_atsr_unit *atsru;
3933 dev = pci_physfn(dev);
3934 for (bus = dev->bus; bus; bus = bus->parent) {
3936 if (!bridge || !pci_is_pcie(bridge) ||
3937 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3939 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3946 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3947 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3948 if (atsr->segment != pci_domain_nr(dev->bus))
3951 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3952 if (tmp == &bridge->dev)
3955 if (atsru->include_all)
3965 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3968 struct dmar_rmrr_unit *rmrru;
3969 struct dmar_atsr_unit *atsru;
3970 struct acpi_dmar_atsr *atsr;
3971 struct acpi_dmar_reserved_memory *rmrr;
3973 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3976 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3977 rmrr = container_of(rmrru->hdr,
3978 struct acpi_dmar_reserved_memory, header);
3979 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3980 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3981 ((void *)rmrr) + rmrr->header.length,
3982 rmrr->segment, rmrru->devices,
3983 rmrru->devices_cnt);
3986 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3987 dmar_remove_dev_scope(info, rmrr->segment,
3988 rmrru->devices, rmrru->devices_cnt);
3992 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3993 if (atsru->include_all)
3996 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3997 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3998 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3999 (void *)atsr + atsr->header.length,
4000 atsr->segment, atsru->devices,
4001 atsru->devices_cnt);
4006 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4007 if (dmar_remove_dev_scope(info, atsr->segment,
4008 atsru->devices, atsru->devices_cnt))
4017 * Here we only respond to a device being unbound from its driver.
4019 * A newly added device is not attached to its DMAR domain here yet. That will
4020 * happen when the device is mapped to an iova.
4022 static int device_notifier(struct notifier_block *nb,
4023 unsigned long action, void *data)
4025 struct device *dev = data;
4026 struct dmar_domain *domain;
4028 if (iommu_dummy(dev))
4031 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4035 * If the device is still attached to a device driver we can't
4036 * tear down the domain yet as DMA mappings may still be in use.
4037 * Wait for the BUS_NOTIFY_UNBOUND_DRIVER event to do that.
4039 if (action == BUS_NOTIFY_DEL_DEVICE && dev->driver != NULL)
4042 domain = find_domain(dev);
4046 down_read(&dmar_global_lock);
4047 domain_remove_one_dev_info(domain, dev);
4048 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4049 domain_exit(domain);
4050 up_read(&dmar_global_lock);
4055 static struct notifier_block device_nb = {
4056 .notifier_call = device_notifier,
4059 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4060 unsigned long val, void *v)
4062 struct memory_notify *mhp = v;
4063 unsigned long long start, end;
4064 unsigned long start_vpfn, last_vpfn;
4067 case MEM_GOING_ONLINE:
4068 start = mhp->start_pfn << PAGE_SHIFT;
4069 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4070 if (iommu_domain_identity_map(si_domain, start, end)) {
4071 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
4078 case MEM_CANCEL_ONLINE:
4079 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4080 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4081 while (start_vpfn <= last_vpfn) {
4083 struct dmar_drhd_unit *drhd;
4084 struct intel_iommu *iommu;
4085 struct page *freelist;
4087 iova = find_iova(&si_domain->iovad, start_vpfn);
4089 pr_debug("dmar: failed get IOVA for PFN %lx\n",
4094 iova = split_and_remove_iova(&si_domain->iovad, iova,
4095 start_vpfn, last_vpfn);
4097 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
4098 start_vpfn, last_vpfn);
4102 freelist = domain_unmap(si_domain, iova->pfn_lo,
4106 for_each_active_iommu(iommu, drhd)
4107 iommu_flush_iotlb_psi(iommu, si_domain->id,
4108 iova->pfn_lo, iova_size(iova),
4111 dma_free_pagelist(freelist);
4113 start_vpfn = iova->pfn_hi + 1;
4114 free_iova_mem(iova);
4122 static struct notifier_block intel_iommu_memory_nb = {
4123 .notifier_call = intel_iommu_memory_notifier,
4128 static ssize_t intel_iommu_show_version(struct device *dev,
4129 struct device_attribute *attr,
4132 struct intel_iommu *iommu = dev_get_drvdata(dev);
4133 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4134 return sprintf(buf, "%d:%d\n",
4135 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4137 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4139 static ssize_t intel_iommu_show_address(struct device *dev,
4140 struct device_attribute *attr,
4143 struct intel_iommu *iommu = dev_get_drvdata(dev);
4144 return sprintf(buf, "%llx\n", iommu->reg_phys);
4146 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4148 static ssize_t intel_iommu_show_cap(struct device *dev,
4149 struct device_attribute *attr,
4152 struct intel_iommu *iommu = dev_get_drvdata(dev);
4153 return sprintf(buf, "%llx\n", iommu->cap);
4155 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4157 static ssize_t intel_iommu_show_ecap(struct device *dev,
4158 struct device_attribute *attr,
4161 struct intel_iommu *iommu = dev_get_drvdata(dev);
4162 return sprintf(buf, "%llx\n", iommu->ecap);
4164 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
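/*
 * The attributes above are collected into the "intel-iommu" sysfs group
 * that is exposed for each hardware unit.
 */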
4166 static struct attribute *intel_iommu_attrs[] = {
4167 &dev_attr_version.attr,
4168 &dev_attr_address.attr,
4170 &dev_attr_ecap.attr,
4174 static struct attribute_group intel_iommu_group = {
4175 .name = "intel-iommu",
4176 .attrs = intel_iommu_attrs,
4179 const struct attribute_group *intel_iommu_groups[] = {
4184 int __init intel_iommu_init(void)
4187 struct dmar_drhd_unit *drhd;
4188 struct intel_iommu *iommu;
4190 /* VT-d is required for a TXT/tboot launch, so enforce that */
4191 force_on = tboot_force_iommu();
4193 if (iommu_init_mempool()) {
4195 panic("tboot: Failed to initialize iommu memory\n");
4199 down_write(&dmar_global_lock);
4200 if (dmar_table_init()) {
4202 panic("tboot: Failed to initialize DMAR table\n");
4207 * Disable translation if already enabled prior to OS handover.
4209 for_each_active_iommu(iommu, drhd)
4210 if (iommu->gcmd & DMA_GCMD_TE)
4211 iommu_disable_translation(iommu);
4213 if (dmar_dev_scope_init() < 0) {
4215 panic("tboot: Failed to initialize DMAR device scope\n");
4219 if (no_iommu || dmar_disabled)
4222 if (list_empty(&dmar_rmrr_units))
4223 printk(KERN_INFO "DMAR: No RMRR found\n");
4225 if (list_empty(&dmar_atsr_units))
4226 printk(KERN_INFO "DMAR: No ATSR found\n");
4228 if (dmar_init_reserved_ranges()) {
4230 panic("tboot: Failed to reserve iommu ranges\n");
4231 goto out_free_reserved_range;
4234 init_no_remapping_devices();
4239 panic("tboot: Failed to initialize DMARs\n");
4240 printk(KERN_ERR "IOMMU: dmar init failed\n");
4241 goto out_free_reserved_range;
4243 up_write(&dmar_global_lock);
4245 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4247 init_timer(&unmap_timer);
4248 #ifdef CONFIG_SWIOTLB
4251 dma_ops = &intel_dma_ops;
4253 init_iommu_pm_ops();
4255 for_each_active_iommu(iommu, drhd)
4256 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4260 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4261 bus_register_notifier(&pci_bus_type, &device_nb);
4262 if (si_domain && !hw_pass_through)
4263 register_memory_notifier(&intel_iommu_memory_nb);
4265 intel_iommu_enabled = 1;
4269 out_free_reserved_range:
4270 put_iova_domain(&reserved_iova_list);
4272 intel_iommu_free_dmars();
4273 up_write(&dmar_global_lock);
4274 iommu_exit_mempool();
4278 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4280 struct intel_iommu *iommu = opaque;
4282 iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4287 * NB - intel-iommu lacks any sort of reference counting for the users of
4288 * dependent devices. If multiple endpoints have intersecting dependent
4289 * devices, unbinding the driver from any one of them will possibly leave
4290 * the others unable to operate.
4292 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4295 if (!iommu || !dev || !dev_is_pci(dev))
4298 pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4301 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4304 struct device_domain_info *info, *tmp;
4305 struct intel_iommu *iommu;
4306 unsigned long flags;
4310 iommu = device_to_iommu(dev, &bus, &devfn);
4314 spin_lock_irqsave(&device_domain_lock, flags);
4315 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4316 if (info->iommu == iommu && info->bus == bus &&
4317 info->devfn == devfn) {
4318 unlink_domain_info(info);
4319 spin_unlock_irqrestore(&device_domain_lock, flags);
4321 iommu_disable_dev_iotlb(info);
4322 iommu_detach_dev(iommu, info->bus, info->devfn);
4323 iommu_detach_dependent_devices(iommu, dev);
4324 free_devinfo_mem(info);
4326 spin_lock_irqsave(&device_domain_lock, flags);
4334 /* if there are no other devices under the same iommu
4335 * owned by this domain, clear this iommu in iommu_bmp,
4336 * update iommu count and coherency
4338 if (info->iommu == iommu)
4342 spin_unlock_irqrestore(&device_domain_lock, flags);
4345 domain_detach_iommu(domain, iommu);
4346 if (!domain_type_is_vm_or_si(domain))
4347 iommu_detach_domain(domain, iommu);
4351 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4355 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4356 domain_reserve_special_ranges(domain);
4358 /* calculate AGAW */
4359 domain->gaw = guest_width;
4360 adjust_width = guestwidth_to_adjustwidth(guest_width);
4361 domain->agaw = width_to_agaw(adjust_width);
4363 domain->iommu_coherency = 0;
4364 domain->iommu_snooping = 0;
4365 domain->iommu_superpage = 0;
4366 domain->max_addr = 0;
4368 /* always allocate the top pgd */
4369 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4372 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4376 static int intel_iommu_domain_init(struct iommu_domain *domain)
4378 struct dmar_domain *dmar_domain;
4380 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4383 "intel_iommu_domain_init: dmar_domain == NULL\n");
4386 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4388 "intel_iommu_domain_init() failed\n");
4389 domain_exit(dmar_domain);
4392 domain_update_iommu_cap(dmar_domain);
4393 domain->priv = dmar_domain;
4395 domain->geometry.aperture_start = 0;
4396 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4397 domain->geometry.force_aperture = true;
4402 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4404 struct dmar_domain *dmar_domain = domain->priv;
4406 domain->priv = NULL;
4407 domain_exit(dmar_domain);
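/*
 * Attach a device to an IOMMU-API domain: tear down any existing private
 * domain for the device, check that the IOMMU's address width covers the
 * domain's max_addr (trimming page-table levels if needed), then add the
 * device with multi-level translation.
 */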
4410 static int intel_iommu_attach_device(struct iommu_domain *domain,
4413 struct dmar_domain *dmar_domain = domain->priv;
4414 struct intel_iommu *iommu;
4418 if (device_is_rmrr_locked(dev)) {
4419 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4423 /* normally dev is not mapped */
4424 if (unlikely(domain_context_mapped(dev))) {
4425 struct dmar_domain *old_domain;
4427 old_domain = find_domain(dev);
4429 if (domain_type_is_vm_or_si(dmar_domain))
4430 domain_remove_one_dev_info(old_domain, dev);
4432 domain_remove_dev_info(old_domain);
4436 iommu = device_to_iommu(dev, &bus, &devfn);
4440 /* check if this iommu agaw is sufficient for max mapped address */
4441 addr_width = agaw_to_width(iommu->agaw);
4442 if (addr_width > cap_mgaw(iommu->cap))
4443 addr_width = cap_mgaw(iommu->cap);
4445 if (dmar_domain->max_addr > (1LL << addr_width)) {
4446 printk(KERN_ERR "%s: iommu width (%d) is not "
4447 "sufficient for the mapped address (%llx)\n",
4448 __func__, addr_width, dmar_domain->max_addr);
4451 dmar_domain->gaw = addr_width;
4454 * Knock out extra levels of page tables if necessary
4456 while (iommu->agaw < dmar_domain->agaw) {
4457 struct dma_pte *pte;
4459 pte = dmar_domain->pgd;
4460 if (dma_pte_present(pte)) {
4461 dmar_domain->pgd = (struct dma_pte *)
4462 phys_to_virt(dma_pte_addr(pte));
4463 free_pgtable_page(pte);
4465 dmar_domain->agaw--;
4468 return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4471 static void intel_iommu_detach_device(struct iommu_domain *domain,
4474 struct dmar_domain *dmar_domain = domain->priv;
4476 domain_remove_one_dev_info(dmar_domain, dev);
4479 static int intel_iommu_map(struct iommu_domain *domain,
4480 unsigned long iova, phys_addr_t hpa,
4481 size_t size, int iommu_prot)
4483 struct dmar_domain *dmar_domain = domain->priv;
4488 if (iommu_prot & IOMMU_READ)
4489 prot |= DMA_PTE_READ;
4490 if (iommu_prot & IOMMU_WRITE)
4491 prot |= DMA_PTE_WRITE;
4492 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4493 prot |= DMA_PTE_SNP;
4495 max_addr = iova + size;
4496 if (dmar_domain->max_addr < max_addr) {
4499 /* check if minimum agaw is sufficient for mapped address */
4500 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4501 if (end < max_addr) {
4502 printk(KERN_ERR "%s: iommu width (%d) is not "
4503 "sufficient for the mapped address (%llx)\n",
4504 __func__, dmar_domain->gaw, max_addr);
4507 dmar_domain->max_addr = max_addr;
4509 /* Round up size to next multiple of PAGE_SIZE, if it and
4510 the low bits of hpa would take us onto the next page */
4511 size = aligned_nrpages(hpa, size);
4512 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4513 hpa >> VTD_PAGE_SHIFT, size, prot);
4517 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4518 unsigned long iova, size_t size)
4520 struct dmar_domain *dmar_domain = domain->priv;
4521 struct page *freelist = NULL;
4522 struct intel_iommu *iommu;
4523 unsigned long start_pfn, last_pfn;
4524 unsigned int npages;
4525 int iommu_id, num, ndomains, level = 0;
4527 /* Cope with horrid API which requires us to unmap more than the
4528 size argument if it happens to be a large-page mapping. */
4529 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4532 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4533 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4535 start_pfn = iova >> VTD_PAGE_SHIFT;
4536 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4538 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4540 npages = last_pfn - start_pfn + 1;
4542 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4543 iommu = g_iommus[iommu_id];
4546 * find bit position of dmar_domain
4548 ndomains = cap_ndoms(iommu->cap);
4549 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4550 if (iommu->domains[num] == dmar_domain)
4551 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4552 npages, !freelist, 0);
4557 dma_free_pagelist(freelist);
4559 if (dmar_domain->max_addr == iova + size)
4560 dmar_domain->max_addr = iova;
4565 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4568 struct dmar_domain *dmar_domain = domain->priv;
4569 struct dma_pte *pte;
4573 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4575 phys = dma_pte_addr(pte);
4580 static bool intel_iommu_capable(enum iommu_cap cap)
4582 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4583 return domain_update_iommu_snooping(NULL) == 1;
4584 if (cap == IOMMU_CAP_INTR_REMAP)
4585 return irq_remapping_enabled == 1;
4590 static int intel_iommu_add_device(struct device *dev)
4592 struct intel_iommu *iommu;
4593 struct iommu_group *group;
4596 iommu = device_to_iommu(dev, &bus, &devfn);
4600 iommu_device_link(iommu->iommu_dev, dev);
4602 group = iommu_group_get_for_dev(dev);
4605 return PTR_ERR(group);
4607 iommu_group_put(group);
4611 static void intel_iommu_remove_device(struct device *dev)
4613 struct intel_iommu *iommu;
4616 iommu = device_to_iommu(dev, &bus, &devfn);
4620 iommu_group_remove_device(dev);
4622 iommu_device_unlink(iommu->iommu_dev, dev);
4625 static const struct iommu_ops intel_iommu_ops = {
4626 .capable = intel_iommu_capable,
4627 .domain_init = intel_iommu_domain_init,
4628 .domain_destroy = intel_iommu_domain_destroy,
4629 .attach_dev = intel_iommu_attach_device,
4630 .detach_dev = intel_iommu_detach_device,
4631 .map = intel_iommu_map,
4632 .unmap = intel_iommu_unmap,
4633 .map_sg = default_iommu_map_sg,
4634 .iova_to_phys = intel_iommu_iova_to_phys,
4635 .add_device = intel_iommu_add_device,
4636 .remove_device = intel_iommu_remove_device,
4637 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4640 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4642 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4643 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4655 static void quirk_iommu_rwbf(struct pci_dev *dev)
4658 * Mobile 4 Series Chipset neglects to set RWBF capability,
4659 * but needs it. Same seems to hold for the desktop versions.
4661 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4674 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4675 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4676 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4677 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4678 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4679 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4680 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4681 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
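/*
 * GGC is the graphics control register in the IGD's PCI config space; the
 * masks above decode how much stolen memory the BIOS reserved for the GTT
 * and whether a VT-d (shadow GTT) allocation was selected.  Without such
 * an allocation, the quirk below must disable the IOMMU for graphics.
 */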
4683 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4687 if (pci_read_config_word(dev, GGC, &ggc))
4690 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4691 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4693 } else if (dmar_map_gfx) {
4694 /* we have to ensure the gfx device is idle before we flush */
4695 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4696 intel_iommu_strict = 1;
4699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4704 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4705 ISOCH DMAR unit for the Azalia sound device, but not give it any
4706 TLB entries, which causes it to deadlock. Check for that. We do
4707 this in a function called from init_dmars(), instead of in a PCI
4708 quirk, because we don't want to print the obnoxious "BIOS broken"
4709 message if VT-d is actually disabled.
4711 static void __init check_tylersburg_isoch(void)
4713 struct pci_dev *pdev;
4714 uint32_t vtisochctrl;
4716 /* If there's no Azalia in the system anyway, forget it. */
4717 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4722 /* System Management Registers. Might be hidden, in which case
4723 we can't do the sanity check. But that's OK, because the
4724 known-broken BIOSes _don't_ actually hide it, so far. */
4725 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4729 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4736 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4737 if (vtisochctrl & 1)
4740 /* Drop all bits other than the number of TLB entries */
4741 vtisochctrl &= 0x1c;
4743 /* If we have the recommended number of TLB entries (16), fine. */
4744 if (vtisochctrl == 0x10)
4747 /* Zero TLB entries? You get to ride the short bus to school. */
4749 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4750 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4751 dmi_get_system_info(DMI_BIOS_VENDOR),
4752 dmi_get_system_info(DMI_BIOS_VERSION),
4753 dmi_get_system_info(DMI_PRODUCT_VERSION));
4754 iommu_identity_mapping |= IDENTMAP_AZALIA;
4758 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",