mm: introduce VM_LOCKONFAULT
author Eric B Munson <emunson@akamai.com>
Wed, 21 Oct 2015 22:03:25 +0000 (09:03 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Wed, 21 Oct 2015 22:03:25 +0000 (09:03 +1100)
The cost of faulting in all memory to be locked can be very high when
working with large mappings.  If only portions of the mapping will be used,
faulting in the entire range incurs a high penalty for locking.

For the example of a large file, this is the usage pattern for a large
statistical language model (and probably applies to other statistical or
graphical models as well).  For the security example, consider any
application transacting in data that cannot be swapped out (credit card
data, medical records, etc.).

This patch introduces the ability to request that pages are not
pre-faulted, but are placed on the unevictable LRU when they are finally
faulted in.  The VM_LOCKONFAULT flag will be used together with VM_LOCKED
and has no effect when set without VM_LOCKED.  Setting the VM_LOCKONFAULT
flag for a VMA will cause pages in that VMA to be added to the unevictable
LRU as they are faulted in (or immediately, if they are already present),
but it will not fault in any pages that are missing.
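
The user-visible entry points for this state, mlock2() and the
MLOCK_ONFAULT flag, are added by later patches in this series.  A minimal
userspace sketch assuming that interface (glibc exposes an mlock2()
wrapper since 2.27) looks like this:

        #define _GNU_SOURCE
        #include <string.h>
        #include <sys/mman.h>

        int main(void)
        {
                size_t len = 64UL << 20;        /* large, sparsely used mapping */
                char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                if (buf == MAP_FAILED)
                        return 1;

                /*
                 * Lock on fault: nothing is pre-faulted here, but every page
                 * touched later is faulted in and moved to the unevictable
                 * LRU (VM_LOCKED | VM_LOCKONFAULT on the VMA).
                 */
                if (mlock2(buf, len, MLOCK_ONFAULT))
                        return 1;

                /* Only this one page is faulted in and locked. */
                memset(buf, 0, 4096);
                return 0;
        }

A plain mlock() on the same range would instead fault in and lock the full
64MiB up front.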

Exposing this new lock state means that we cannot overload the meaning of
the FOLL_POPULATE flag any longer.  Prior to this patch it was used to
mean that the VMA for a fault was locked.  This means we need the new
FOLL_MLOCK flag to communicate the locked state of a VMA.  FOLL_POPULATE
will now only control whether the VMA should be populated, and in the case
of VM_LOCKONFAULT it will not be set.
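
In other words, the gup flag selection done in populate_vma_page_range()
below boils down to the following decision.  This is a standalone sketch,
not kernel code, and the flag values are only illustrative:

        #include <stdio.h>

        #define FOLL_POPULATE   0x40            /* fault in missing pages */
        #define FOLL_MLOCK      0x1000          /* mlock pages that are present */
        #define VM_LOCKED       0x00002000
        #define VM_LOCKONFAULT  0x00080000

        /* Mirrors the gup_flags selection in populate_vma_page_range(). */
        static unsigned int gup_flags_for(unsigned long vm_flags)
        {
                unsigned int gup_flags = FOLL_POPULATE | FOLL_MLOCK;

                /* Lock-on-fault: mlock what is present, fault in nothing new. */
                if (vm_flags & VM_LOCKONFAULT)
                        gup_flags &= ~FOLL_POPULATE;

                return gup_flags;
        }

        int main(void)
        {
                printf("VM_LOCKED:                %#x\n",
                       gup_flags_for(VM_LOCKED));
                printf("VM_LOCKED|VM_LOCKONFAULT: %#x\n",
                       gup_flags_for(VM_LOCKED | VM_LOCKONFAULT));
                return 0;
        }

With only FOLL_MLOCK set, faultin_page() now bails out with -ENOENT for
pages that are not present, so the range is mlocked without being
populated.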

Signed-off-by: Eric B Munson <emunson@akamai.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
12 files changed:
Documentation/filesystems/proc.txt
drivers/gpu/drm/drm_vm.c
fs/proc/task_mmu.c
include/linux/mm.h
kernel/fork.c
mm/debug.c
mm/gup.c
mm/huge_memory.c
mm/hugetlb.c
mm/mlock.c
mm/mmap.c
mm/rmap.c

index d6f259eaa5efb384cb503d312808ec06660efba5..937c9d3c4449a88f6bd9ea664f37576636ceb9bb 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -481,6 +481,7 @@ manner. The codes are the following:
     rr  - random read advise provided
     dc  - do not copy area on fork
     de  - do not expand area on remapping
+    lf  - mark area to lock pages when faulted in, do not pre-populate
     ac  - area is accountable
     nr  - swap space is not reserved for the area
     ht  - area uses huge tlb pages
index aab49ee4ed40d2ce5b525554209fd3ffe40340b4..103a5f6b969a87e6bfd0be346554809adb2443b2 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -699,9 +699,15 @@ int drm_vma_info(struct seq_file *m, void *data)
                   (void *)(unsigned long)virt_to_phys(high_memory));
 
        list_for_each_entry(pt, &dev->vmalist, head) {
+               char lock_flag = '-';
+
                vma = pt->vma;
                if (!vma)
                        continue;
+               if (vma->vm_flags & VM_LOCKONFAULT)
+                       lock_flag = 'f';
+               else if (vma->vm_flags & VM_LOCKED)
+                       lock_flag = 'l';
                seq_printf(m,
                           "\n%5d 0x%pK-0x%pK %c%c%c%c%c%c 0x%08lx000",
                           pt->pid,
@@ -710,7 +716,7 @@ int drm_vma_info(struct seq_file *m, void *data)
                           vma->vm_flags & VM_WRITE ? 'w' : '-',
                           vma->vm_flags & VM_EXEC ? 'x' : '-',
                           vma->vm_flags & VM_MAYSHARE ? 's' : 'p',
-                          vma->vm_flags & VM_LOCKED ? 'l' : '-',
+                          lock_flag,
                           vma->vm_flags & VM_IO ? 'i' : '-',
                           vma->vm_pgoff);
 
index a323b6258bf3ea508f23fa48702c5df068dcc376..8ad0116e225b8a997c2792a73817d4412ada0911 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -601,6 +601,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
                [ilog2(VM_RAND_READ)]   = "rr",
                [ilog2(VM_DONTCOPY)]    = "dc",
                [ilog2(VM_DONTEXPAND)]  = "de",
+               [ilog2(VM_LOCKONFAULT)] = "lf",
                [ilog2(VM_ACCOUNT)]     = "ac",
                [ilog2(VM_NORESERVE)]   = "nr",
                [ilog2(VM_HUGETLB)]     = "ht",
index 3c258f8eb9ae76017124af4892cb1305634006c2..505e19c45f6aa67d920692be90367d7851242d99 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -139,6 +139,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY    0x00020000      /* Do not copy this vma on fork */
 #define VM_DONTEXPAND  0x00040000      /* Cannot expand with mremap() */
+#define VM_LOCKONFAULT 0x00080000      /* Lock the pages covered when they are faulted in */
 #define VM_ACCOUNT     0x00100000      /* Is a VM accounted object */
 #define VM_NORESERVE   0x00200000      /* should the VM suppress accounting */
 #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
@@ -2137,6 +2138,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_NUMA      0x200   /* force NUMA hinting page fault */
 #define FOLL_MIGRATION 0x400   /* wait for page to replace migration entry */
 #define FOLL_TRIED     0x800   /* a retry, previous pass started an IO */
+#define FOLL_MLOCK     0x1000  /* lock present pages */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
                        void *data);
index 2845623fb58264eec28a8b99b48d4511856e718d..69504920965780a67eee85284c103ed84cd46e56 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,7 +454,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                tmp->vm_mm = mm;
                if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
-               tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
+               tmp->vm_flags &= ~(VM_LOCKED | VM_UFFD_MISSING | VM_UFFD_WP |
+                                       VM_LOCKONFAULT);
                tmp->vm_next = tmp->vm_prev = NULL;
                tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                file = tmp->vm_file;
index 6c1b3ea61bfddfe4f042a6ef067e53e34f82792b..e784110fb51d1056e28ada2bd5bd26b859e68778 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -125,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = {
        {VM_GROWSDOWN,                  "growsdown"     },
        {VM_PFNMAP,                     "pfnmap"        },
        {VM_DENYWRITE,                  "denywrite"     },
+       {VM_LOCKONFAULT,                "lockonfault"   },
        {VM_LOCKED,                     "locked"        },
        {VM_IO,                         "io"            },
        {VM_SEQ_READ,                   "seqread"       },
index a798293fc6486bac215ecb58ed071263a5f775f0..deafa2c91b362206b7ef56aec918509a9cc24dd2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -129,7 +129,7 @@ retry:
                 */
                mark_page_accessed(page);
        }
-       if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                /*
                 * The preliminary mapping check is mainly to avoid the
                 * pointless overhead of lock_page on the ZERO_PAGE
@@ -299,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
        unsigned int fault_flags = 0;
        int ret;
 
+       /* mlock all present pages, but do not fault in new pages */
+       if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+               return -ENOENT;
        /* For mm_populate(), just skip the stack guard page. */
        if ((*flags & FOLL_POPULATE) &&
                        (stack_guard_page_start(vma, address) ||
@@ -890,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
        VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
 
-       gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+       gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+       if (vma->vm_flags & VM_LOCKONFAULT)
+               gup_flags &= ~FOLL_POPULATE;
+
        /*
         * We want to touch writable mappings with a write fault in order
         * to break COW, except for shared mappings because these don't COW
index 8e426175fb7efac22e9ebd2ba4f1d85ae3eb6d7a..d901400ccfab6304a25078af560191688ea045f0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1365,7 +1365,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                          pmd, _pmd,  1))
                        update_mmu_cache_pmd(vma, addr, pmd);
        }
-       if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                if (page->mapping && trylock_page(page)) {
                        lru_add_drain();
                        if (page->mapping)
index 06580fdf5e306825b961f794b36fcf9544361abc..c3e78521c44a4224e0b28d38f9fac1e00ea06cfb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4171,8 +4171,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
        unsigned long s_end = sbase + PUD_SIZE;
 
        /* Allow segments to share if only one is marked locked */
-       unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
-       unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+       unsigned long vm_flags = vma->vm_flags & ~(VM_LOCKED|VM_LOCKONFAULT);
+       unsigned long svm_flags = svma->vm_flags & ~(VM_LOCKED|VM_LOCKONFAULT);
 
        /*
         * match the virtual addresses, permission and the alignment of the
index 35dcf8fa7195f4803314122e01a2f3e3f03bc316..15c94d8ae05c0b1727c61b2f36213020877651f7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
 void munlock_vma_pages_range(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
 {
-       vma->vm_flags &= ~VM_LOCKED;
+       vma->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
 
        while (start < end) {
                struct page *page = NULL;
index 220effde8ea3ecde2d5afafdc2bac6d0099de31e..03b654ecf5fb99d7a5ee27fe320f36a8ec0ec971 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1661,7 +1661,7 @@ out:
                                        vma == get_gate_vma(current->mm)))
                        mm->locked_vm += (len >> PAGE_SHIFT);
                else
-                       vma->vm_flags &= ~VM_LOCKED;
+                       vma->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
        }
 
        if (file)
index b577fbb98d4baf352fa5e51cc536d02356b8392e..50664c54df982424398858fc48dc6ce6358cec60 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -848,7 +848,8 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 
                if (vma->vm_flags & VM_LOCKED) {
                        spin_unlock(ptl);
-                       pra->vm_flags |= VM_LOCKED;
+                       pra->vm_flags |=
+                               (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT));
                        return SWAP_FAIL; /* To break the loop */
                }
 
@@ -869,7 +870,8 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 
                if (vma->vm_flags & VM_LOCKED) {
                        pte_unmap_unlock(pte, ptl);
-                       pra->vm_flags |= VM_LOCKED;
+                       pra->vm_flags |=
+                               (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT));
                        return SWAP_FAIL; /* To break the loop */
                }