/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave   Allocate memory interleaved over a set of nodes,
 *              with normal fallback if it fails.
 *              For VMA based allocations this interleaves based on the
 *              offset into the backing object or offset into the mapping
 *              for anonymous memory. For process policy a process counter
 *              is used.
 * bind         Only allocate memory on a specific set of nodes,
 *              no fallback.
 * preferred    Try a specific node first before normal fallback.
 *              As a special case node -1 here means do the allocation
 *              on the local CPU. This is normally identical to default,
 *              but useful to set in a VMA when you have a non default
 *              process policy.
 * default      Allocate on the local node first, or when on a VMA
 *              use the process policy. This is what Linux always did
 *              in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

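/*
 * For illustration, a minimal userspace sketch of these policies (not part
 * of this file; assumes a machine with nodes 0 and 1 online and the
 * mbind()/set_mempolicy() syscall wrappers from libnuma's <numaif.h>):
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long interleave_mask = 0x3;	// nodes 0 and 1
 *	// interleave the pages of this mapping across nodes 0 and 1
 *	mbind(buf, 1 << 20, MPOL_INTERLEAVE, &interleave_mask,
 *	      8 * sizeof(interleave_mask), 0);
 *	unsigned long bind_mask = 0x1;		// node 0 only
 *	// restrict all further allocations of this process to node 0
 *	set_mempolicy(MPOL_BIND, &bind_mask, 8 * sizeof(bind_mask));
 */
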
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
        int empty = nodes_empty(*nodes);

        switch (mode) {
        case MPOL_DEFAULT:
                if (!empty)
                        return -EINVAL;
                break;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                /* Preferred will only use the first bit, but allow
                   more for now. */
                if (empty)
                        return -EINVAL;
                break;
        }
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
                     unsigned long maxnode, int mode)
{
        unsigned long k;
        unsigned long nlongs;
        unsigned long endmask;

        --maxnode;
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;

        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /* When the user specified more nodes than supported just check
           if the non supported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                if (nlongs > PAGE_SIZE/sizeof(long))
                        return -EINVAL;
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        nodes_addr(*nodes)[nlongs-1] &= endmask;
        /* Update current mems_allowed */
        cpuset_update_current_mems_allowed();
        /* Ignore nodes not set in current->mems_allowed */
        /* AK: shouldn't this error out instead? */
        cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
        return mpol_check_policy(mode, nodes);
}

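/*
 * Worked example (assuming a 64-bit kernel): a caller passing maxnode == 5
 * is left with maxnode == 4 after the decrement above, so nlongs == 1 and
 * endmask == (1UL << 4) - 1 == 0xf; only the bits for nodes 0-3 survive
 * the final masking.
 */
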
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
        struct zonelist *zl;
        int num, max, nd;

        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
        for_each_node_mask(nd, *nodes) {
                int k;
                for (k = MAX_NR_ZONES-1; k >= 0; k--) {
                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
                        if (!z->present_pages)
                                continue;
                        zl->zones[num++] = z;
                        if (k > policy_zone)
                                policy_zone = k;
                }
        }
        zl->zones[num] = NULL;
        return zl;
}

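/*
 * Example: for nodes = {0,2}, where node 0 has ZONE_DMA and ZONE_NORMAL
 * populated and node 2 only ZONE_NORMAL, the generated list is
 * node0/NORMAL, node0/DMA, node2/NORMAL, NULL: each node contributes its
 * zones from highest to lowest before the next node in the mask.
 */
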
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
        struct mempolicy *policy;

        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
        if (mode == MPOL_DEFAULT)
                return NULL;
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        switch (mode) {
        case MPOL_INTERLEAVE:
                policy->v.nodes = *nodes;
                break;
        case MPOL_PREFERRED:
                policy->v.preferred_node = first_node(*nodes);
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                break;
        case MPOL_BIND:
                policy->v.zonelist = bind_zonelist(nodes);
                if (policy->v.zonelist == NULL) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-ENOMEM);
                }
                break;
        }
        policy->policy = mode;
        return policy;
}

/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pte_t *orig_pte, *pte;
        spinlock_t *ptl;

        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                unsigned long pfn;
                unsigned int nid;

                if (!pte_present(*pte))
                        continue;
                pfn = pte_pfn(*pte);
                if (!pfn_valid(pfn)) {
                        print_bad_pte(vma, *pte, addr);
                        continue;
                }
                nid = pfn_to_nid(pfn);
                if (!node_isset(nid, *nodes))
                        break;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(orig_pte, ptl);
        return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                if (check_pmd_range(vma, pud, addr, next, nodes))
                        return -EIO;
        } while (pud++, addr = next, addr != end);
        return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pgd_t *pgd;
        unsigned long next;

        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                if (check_pud_range(vma, pgd, addr, next, nodes))
                        return -EIO;
        } while (pgd++, addr = next, addr != end);
        return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
            nodemask_t *nodes, unsigned long flags)
{
        int err;
        struct vm_area_struct *first, *vma, *prev;

        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
        if (first->vm_flags & VM_RESERVED)
                return ERR_PTR(-EACCES);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!vma->vm_next && vma->vm_end < end)
                        return ERR_PTR(-EFAULT);
                if (prev && prev->vm_end < vma->vm_start)
                        return ERR_PTR(-EFAULT);
                if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
                        unsigned long endvma = vma->vm_end;
                        if (endvma > end)
                                endvma = end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
                        err = check_pgd_range(vma, start, endvma, nodes);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
                        }
                }
                prev = vma;
        }
        return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
        int err = 0;
        struct mempolicy *old = vma->vm_policy;

        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        if (vma->vm_ops && vma->vm_ops->set_policy)
                err = vma->vm_ops->set_policy(vma, new);
        if (!err) {
                mpol_get(new);
                vma->vm_policy = new;
                mpol_free(old);
        }
        return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, struct mempolicy *new)
{
        struct vm_area_struct *next;
        int err;

        err = 0;
        for (; vma && vma->vm_start < end; vma = next) {
                next = vma->vm_next;
                if (vma->vm_start < start)
                        err = split_vma(vma->vm_mm, vma, start, 1);
                if (!err && vma->vm_end > end)
                        err = split_vma(vma->vm_mm, vma, end, 0);
                if (!err)
                        err = policy_vma(vma, new);
                if (err)
                        break;
        }
        return err;
}

/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                          unsigned long mode,
                          unsigned long __user *nmask, unsigned long maxnode,
                          unsigned flags)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
        nodemask_t nodes;
        int err;

        if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
                return -EINVAL;
        if (start & ~PAGE_MASK)
                return -EINVAL;
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;
        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        err = get_nodes(&nodes, nmask, maxnode, mode);
        if (err)
                return err;

        new = mpol_new(mode, &nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);

        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
                        mode, nodes_addr(nodes)[0]);

        down_write(&mm->mmap_sem);
        vma = check_range(mm, start, end, &nodes, flags);
        err = PTR_ERR(vma);
        if (!IS_ERR(vma))
                err = mbind_range(vma, start, end, new);
        up_write(&mm->mmap_sem);
        mpol_free(new);
        return err;
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
                                  unsigned long maxnode)
{
        int err;
        struct mempolicy *new;
        nodemask_t nodes;

        if (mode < 0 || mode > MPOL_MAX)
                return -EINVAL;
        err = get_nodes(&nodes, nmask, maxnode, mode);
        if (err)
                return err;
        new = mpol_new(mode, &nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
        if (new && new->policy == MPOL_INTERLEAVE)
                current->il_next = first_node(new->v.nodes);
        return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
        int i;

        nodes_clear(*nodes);
        switch (p->policy) {
        case MPOL_BIND:
                for (i = 0; p->v.zonelist->zones[i]; i++)
                        node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
                break;
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                *nodes = p->v.nodes;
                break;
        case MPOL_PREFERRED:
                /* or use current node instead of online map? */
                if (p->v.preferred_node < 0)
                        *nodes = node_online_map;
                else
                        node_set(p->v.preferred_node, *nodes);
                break;
        default:
                BUG();
        }
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *p;
        int err;

        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
        if (err >= 0) {
                err = page_to_nid(p);
                put_page(p);
        }
        return err;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

        if (copy > nbytes) {
                if (copy > PAGE_SIZE)
                        return -EINVAL;
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
                        return -EFAULT;
                copy = nbytes;
        }
        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

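/*
 * Example: with MAX_NUMNODES == 64 on a 64-bit kernel, nbytes == 8; a
 * caller passing maxnode == 1024 requests copy == ALIGN(1023, 64)/8 ==
 * 128 bytes, so the 120 bytes past the kernel's mask are cleared with
 * clear_user() and only the first 8 bytes are copied from the nodemask.
 */
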
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
                                  unsigned long __user *nmask,
                                  unsigned long maxnode,
                                  unsigned long addr, unsigned long flags)
{
        int err, pval;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;

        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                return -EINVAL;
        if (nmask != NULL && maxnode < MAX_NUMNODES)
                return -EINVAL;
        if (flags & MPOL_F_ADDR) {
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        pval = err;
                } else if (pol == current->mempolicy &&
                                pol->policy == MPOL_INTERLEAVE) {
                        pval = current->il_next;
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else
                pval = pol->policy;

        if (vma) {
                up_read(&current->mm->mmap_sem);
                vma = NULL;
        }

        if (policy && put_user(pval, policy))
                return -EFAULT;

        err = 0;
        if (nmask) {
                nodemask_t nodes;
                get_zonemask(pol, &nodes);
                err = copy_nodes_to_user(nmask, maxnode, &nodes);
        }

out:
        if (vma)
                up_read(&current->mm->mmap_sem);
        return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                                     compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode,
                                     compat_ulong_t addr, compat_ulong_t flags)
{
        long err;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask)
                nm = compat_alloc_user_space(alloc_size);

        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

        if (!err && nmask) {
                err = copy_from_user(bm, nm, alloc_size);
                /* ensure entire bitmap is zeroed */
                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
                err |= compat_put_bitmap(nmask, bm, nr_bits);
        }

        return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(bm, nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, bm, alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                             compat_ulong_t mode, compat_ulong_t __user *nmask,
                             compat_ulong_t maxnode, compat_ulong_t flags)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        nodemask_t bm;

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma,
                unsigned long addr)
{
        struct mempolicy *pol = task->mempolicy;

        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else if (vma->vm_policy &&
                                vma->vm_policy->policy != MPOL_DEFAULT)
                        pol = vma->vm_policy;
        }
        if (!pol)
                pol = &default_policy;
        return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
        int nd;

        switch (policy->policy) {
        case MPOL_PREFERRED:
                nd = policy->v.preferred_node;
                if (nd < 0)
                        nd = numa_node_id();
                break;
        case MPOL_BIND:
                /* Lower zones don't get a policy applied */
                /* Careful: current->mems_allowed might have moved */
                if (gfp_zone(gfp) >= policy_zone)
                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
                                return policy->v.zonelist;
                /*FALL THROUGH*/
        case MPOL_INTERLEAVE: /* should not happen */
        case MPOL_DEFAULT:
                nd = numa_node_id();
                break;
        default:
                nd = 0;
                BUG();
        }
        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

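/*
 * Example: a GFP_HIGHUSER allocation under MPOL_PREFERRED with
 * preferred_node == 3 resolves to node 3's zonelist for
 * gfp_zone(GFP_HIGHUSER); under MPOL_BIND the custom zonelist built by
 * bind_zonelist() is used instead, but only for zones at or above
 * policy_zone.
 */
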
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
        unsigned nid, next;
        struct task_struct *me = current;

        nid = me->il_next;
        next = next_node(nid, policy->v.nodes);
        if (next >= MAX_NUMNODES)
                next = first_node(policy->v.nodes);
        me->il_next = next;
        return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
                struct vm_area_struct *vma, unsigned long off)
{
        unsigned nnodes = nodes_weight(pol->v.nodes);
        unsigned target = (unsigned)off % nnodes;
        int c = 0;
        int nid = -1;

        do {
                nid = next_node(nid, pol->v.nodes);
                c++;
        } while (c <= target);
        return nid;
}

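/*
 * Example: with pol->v.nodes = {1,3,5} and off == 10, nnodes == 3 and
 * target == 10 % 3 == 1, so the walk stops on the second set node and
 * this offset always maps to node 3, no matter which CPU faults it in.
 */
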
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                                        unsigned nid)
{
        struct zonelist *zl;
        struct page *page;

        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
        page = __alloc_pages(gfp, order, zl);
        if (page && page_zone(page) == zl->zones[0]) {
                zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
                put_cpu();
        }
        return page;
}

/**
 * alloc_page_vma	- Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of
 * the mm_struct of the VMA to prevent it from going away. Should be used
 * for all allocations for pages that will be mapped into user space.
 * Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        cpuset_update_current_mems_allowed();

        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                unsigned nid;
                if (vma) {
                        unsigned long off = vma->vm_pgoff;
                        off += (addr - vma->vm_start) >> PAGE_SHIFT;
                        nid = offset_il_node(pol, vma, off);
                } else {
                        /* fall back to process interleaving */
                        nid = interleave_nodes(pol);
                }
                return alloc_page_interleave(gfp, 0, nid);
        }
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

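/*
 * Typical use is in a fault handler, e.g. (sketch; mmap_sem is already
 * held for read on every fault path):
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */
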
/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in interrupt
 * context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol = current->mempolicy;

        if ((gfp & __GFP_WAIT) && !in_interrupt())
                cpuset_update_current_mems_allowed();
        if (!pol || in_interrupt())
                pol = &default_policy;
        if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);
        *new = *old;
        atomic_set(&new->refcnt, 1);
        if (new->policy == MPOL_BIND) {
                int sz = ksize(old->v.zonelist);
                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
                if (!new->v.zonelist) {
                        kmem_cache_free(policy_cache, new);
                        return ERR_PTR(-ENOMEM);
                }
                memcpy(new->v.zonelist, old->v.zonelist, sz);
        }
        return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->policy != b->policy)
                return 0;
        switch (a->policy) {
        case MPOL_DEFAULT:
                return 1;
        case MPOL_INTERLEAVE:
                return nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND: {
                int i;
                for (i = 0; a->v.zonelist->zones[i]; i++)
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                                return 0;
                return b->v.zonelist->zones[i] == NULL;
        }
        default:
                BUG();
                return 0;
        }
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
        if (!atomic_dec_and_test(&p->refcnt))
                return;
        if (p->policy == MPOL_BIND)
                kfree(p->v.zonelist);
        p->policy = MPOL_DEFAULT;
        kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        switch (pol->policy) {
        case MPOL_DEFAULT:
                return numa_node_id();
        case MPOL_BIND:
                return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
        case MPOL_INTERLEAVE:
                return interleave_nodes(pol);
        case MPOL_PREFERRED:
                return pol->v.preferred_node >= 0 ?
                                pol->v.preferred_node : numa_node_id();
        }
        BUG();
        return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        switch (pol->policy) {
        case MPOL_PREFERRED:
        case MPOL_DEFAULT:
        case MPOL_INTERLEAVE:
                return 1;
        case MPOL_BIND: {
                struct zone **z;
                for (z = pol->v.zonelist->zones; *z; z++)
                        if ((*z)->zone_pgdat->node_id == nid)
                                return 1;
                return 0;
        }
        default:
                BUG();
                return 0;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                        n = n->rb_right;
                else if (end <= p->start)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}

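/*
 * Example: with [0,4) and [4,8) in the tree, sp_lookup(sp, 3, 5) first
 * breaks out of the descent on a node overlapping [3,5) and then walks
 * rb_prev() until no earlier node still overlaps, returning the leftmost
 * match [0,4).
 */
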
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
                return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        PDprintk("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
}

static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        n->policy = pol;
        return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}

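/*
 * Example: if an existing node covers [0,10) and the new range is [3,6),
 * the old node is trimmed to [0,3), a copy of its policy is re-inserted
 * as [6,10) via new2, and only then is the new node for [3,6) inserted.
 */
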
int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol ? npol->policy : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL, NULL);

        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */

        if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
                              MAX_NUMNODES) < 0)
                printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
        sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}