/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

const struct dma_mapping_ops* dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/* this loop can take a while with 256 GB and 4k pages
			   so update the NMI watchdog */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
				touch_nmi_watchdog();
			}
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n", total);
	printk(KERN_INFO "%lu reserved pages\n", reserved);
	printk(KERN_INFO "%lu pages shared\n", shared);
	printk(KERN_INFO "%lu pages swap cached\n", cached);
}

int after_bootmem;

static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}

static __init void set_pte_phys(unsigned long vaddr,
				unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}
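
/*
 * Example (illustrative sketch only, not a caller in this file): the fixmap
 * helpers in <asm/fixmap.h>, such as set_fixmap() and set_fixmap_nocache(),
 * reduce to __set_fixmap() with a compile-time slot index.  A hypothetical
 * boot-time user that wants a permanent kernel mapping of one page of device
 * registers could look like the sketch below; the slot name FIX_EXAMPLE and
 * the function are made up for illustration.
 */
#if 0
static void __init map_example_registers(unsigned long regs_phys)
{
	/* Install an uncached kernel mapping of one MMIO page. */
	__set_fixmap(FIX_EXAMPLE, regs_phys & PAGE_MASK, PAGE_KERNEL_NOCACHE);

	/* The page is now reachable at the compile-time constant address
	   fix_to_virt(FIX_EXAMPLE). */
}
#endif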

unsigned long __initdata table_start, table_end;

static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

/* Must run before zap_low_mappings */
__init void *early_ioremap(unsigned long addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd, *last_pmd;
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto next;
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;
		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd + i, __pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
		__flush_tlb();
		return (void *)vaddr;
	next:
		;
	}
	printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
	return NULL;
}

/* To avoid virtual aliases later */
__init void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);
	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);
	__flush_tlb();
}
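
/*
 * Example (illustrative sketch only): early_ioremap()/early_iounmap() provide
 * a short-lived, 2MB-granular mapping of physical memory before the direct
 * mapping and the regular ioremap() are available.  alloc_low_page() above is
 * the real user in this file; a hypothetical early caller peeking at a
 * firmware table at table_phys would follow the same map/use/unmap pattern.
 * The function name and table_phys parameter are made up for illustration.
 */
#if 0
static void __init peek_firmware_table(unsigned long table_phys)
{
	void *map;
	u32 signature;

	map = early_ioremap(table_phys, PAGE_SIZE);
	if (!map)
		return;
	memcpy(&signature, map, sizeof(signature));
	early_printk("example table signature: %08x\n", signature);
	early_iounmap(map, PAGE_SIZE);		/* avoid stale aliases later */
}
#endif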

static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long entry;
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem)
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			break;
		}

		if (pmd_val(*pmd))
			continue;

		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
		entry &= __supported_pte_mask;
		set_pmd(pmd, __pmd(entry));
	}
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	spin_lock(&init_mm.page_table_lock);
	phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
}

static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			phys_pmd_update(pud, addr, end);
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);
		unmap_low_page(pmd);
	}
	__flush_tlb();
}

static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	/* RED-PEN putting page tables only on node 0 could
	   cause a hotspot and fill up ZONE_DMA. The page tables
	   need roughly 0.5KB per GB. */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}
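
/*
 * Worked example of the sizing above (illustrative arithmetic only): with
 * end = 4GB, puds = 4 (one pud entry per 1GB) and pmds = 2048 (one pmd entry
 * per 2MB large page), so tables = round_up(4 * 8, 4096) +
 * round_up(2048 * 8, 4096) = 4KB + 16KB = 20KB.  The pmd entries dominate:
 * 512 entries * 8 bytes = 4KB of page table per GB of direct mapping, plus
 * one pud page shared across every 512GB.
 */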

/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the memory
	 * mapped. Unfortunately this is done currently before the nodes are
	 * discovered.
	 */
	if (!after_bootmem)
		find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		unsigned long pud_phys;
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
	__flush_tlb_all();
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;
		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen currently. */
			printk(KERN_ERR
	       "clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}
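
/*
 * Example (illustrative sketch only): a hypothetical caller that wants the
 * CPU to stop speculatively fetching through the direct mapping of a
 * 2MB-aligned device aperture could drop those large-page mappings as below.
 * The function name and the aperture_base/aperture_size values are made up
 * for illustration and must already be LARGE_PAGE_SIZE aligned, or the
 * BUG_ON()s above will fire.
 */
#if 0
static void __init hide_example_aperture(unsigned long aperture_base,
					 unsigned long aperture_size)
{
	clear_kernel_mapping((unsigned long)__va(aperture_base),
			     aperture_size);
}
#endif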

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	init_memory_mapping(start, (start + size - 1));

	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		goto error;

	return ret;
error:
	printk("%s: Problem encountered in __add_pages!\n", __func__);
	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
 * just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
	int err = -EIO;
	unsigned long pfn;
	unsigned long total = 0, mem = 0;
	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		if (pfn_valid(pfn)) {
			online_page(pfn_to_page(pfn));
			err = 0;
			mem++;
		}
		total++;
	}
	if (!err) {
		z->spanned_pages += total;
		z->present_pages += mem;
		z->zone_pgdat->node_spanned_pages += total;
		z->zone_pgdat->node_present_pages += mem;
	}
	return err;
}
#endif

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear the zero-page */
	memset(empty_zero_page, 0, PAGE_SIZE);

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);

	after_bootmem = 1;

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr;

	if (begin >= end)
		return;

	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
		ClearPageReserved(page);
		init_page_count(page);
		memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
		if (addr >= __START_KERNEL_map)
			change_page_attr_addr(addr, 1, __pgprot(0));
		__free_page(page);
		totalram_pages++;
	}
	if (addr > __START_KERNEL_map)
		global_flush_tlb();
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			__pa_symbol(&__init_begin),
			__pa_symbol(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size;

#ifdef CONFIG_HOTPLUG_CPU
	/* It must still be possible to apply SMP alternatives. */
	if (num_possible_cpus() > 1)
		start = PFN_ALIGN(__va(__pa_symbol(&_etext)));
#endif
	size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start;
	change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       size >> 10);

	/*
	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", __pa(start), __pa(end));
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;
	if (pfn >= end_pfn) {
		/* This can happen with kdump kernels when accessing firmware
		   tables. */
		if (pfn < end_pfn_map)
			return;
		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}

int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}
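
/*
 * Example (illustrative sketch only): callers like the /proc/kcore read path
 * use kern_addr_valid() to decide whether a kernel virtual address is backed
 * by a present mapping before touching it.  A hypothetical debugging helper
 * (name and signature made up for illustration) would check first and only
 * then dereference:
 */
#if 0
static int example_peek_kernel_byte(unsigned long addr, unsigned char *val)
{
	if (!kern_addr_valid(addr))
		return -EFAULT;		/* no mapping, don't fault */
	*val = *(unsigned char *)addr;
	return 0;
}
#endif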

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
	{
		.ctl_name	= 99,
		.procname	= "exception-trace",
		.data		= &exception_trace,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{}
};

static ctl_table debug_root_table2[] = {
	{
		.ctl_name = CTL_DEBUG,
		.procname = "debug",
		.mode = 0555,
		.child = debug_table2
	},
	{}
};

static __init int x8664_sysctl_init(void)
{
	register_sysctl_table(debug_root_table2);
	return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
	.vm_page_prot = PAGE_READONLY_EXEC,
	.vm_flags = VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
	if (!vma)
		return 0;
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context.  It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
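
/*
 * Example (illustrative sketch only): the two checks above split along
 * whether a task pointer is available.  A hypothetical helper (made up for
 * illustration) that classifies an address as belonging to the vsyscall page
 * would prefer the task-aware check and fall back to the address-only one
 * from interrupt or NMI context:
 */
#if 0
static int example_addr_in_vsyscall(struct task_struct *tsk, unsigned long addr)
{
	/* Task at hand: respect the per-task gate VMA (absent for ia32 tasks). */
	if (tsk)
		return in_gate_area(tsk, addr);
	/* No reliable task: fall back to the raw address range check. */
	return in_gate_area_no_task(addr);
}
#endif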