mm: limit growth of 3% hardcoded other user reserve

author Andrew Shewmaker <agshew@gmail.com>

Mon, 29 Apr 2013 22:08:10 +0000 (15:08 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 29 Apr 2013 22:54:36 +0000 (15:54 -0700)
author Andrew Shewmaker <agshew@gmail.com>
Mon, 29 Apr 2013 22:08:10 +0000 (15:08 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 29 Apr 2013 22:54:36 +0000 (15:54 -0700)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt

index 078701fdbd4dd936dd485737abe7b5b60e554604..f6989573835796b845eb5d8715af2893f637200b 100644 (file)
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -53,6 +53,7 @@ Currently, these files are in /proc/sys/vm:
  - percpu_pagelist_fraction
  - stat_interval
  - swappiness
+- user_reserve_kbytes
  - vfs_cache_pressure
  - zone_reclaim_mode
  
@@ -542,6 +543,7 @@ memory until it actually runs out.
  
  When this flag is 2, the kernel uses a "never overcommit"
  policy that attempts to prevent any overcommit of memory.
+Note that user_reserve_kbytes affects this policy.
  
  This feature can be very useful because there are a lot of
  programs that malloc() huge amounts of memory "just-in-case"
@@ -645,6 +647,24 @@ The default value is 60.
  
  ==============================================================
  
+- user_reserve_kbytes
+
+When overcommit_memory is set to 2, "never overcommit" mode, reserve
+min(3% of current process size, user_reserve_kbytes) of free memory.
+This is intended to prevent a user from starting a single memory hogging
+process, such that they cannot recover (kill the hog).
+
+user_reserve_kbytes defaults to min(3% of free memory at initialisation, 128MB).
+
+If this is reduced to zero, then the user will be allowed to allocate
+all free memory with a single process, minus admin_reserve_kbytes.
+Any subsequent attempts to execute a command will result in
+"fork: Cannot allocate memory".
+
+Changing this takes effect whenever an application requests memory.
+
+==============================================================
+
  vfs_cache_pressure
  ------------------
  
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting

index 706d7ed9d8d2a7b6a4d91aacf2e4bf449f14b713..8eaa2fc4b8fae253930a798f38394438198dbf5a 100644 (file)
--- a/Documentation/vm/overcommit-accounting
+++ b/Documentation/vm/overcommit-accounting
@@ -8,7 +8,9 @@ The Linux kernel supports the following overcommit handling modes
                 default.
  
  1      -       Always overcommit. Appropriate for some scientific
-               applications.
+               applications. Classic example is code using sparse arrays
+               and just relying on the virtual memory consisting almost
+               entirely of zero pages.
  
  2      -       Don't overcommit. The total address space commit
                 for the system is not permitted to exceed swap + a
@@ -18,6 +20,10 @@ The Linux kernel supports the following overcommit handling modes
                 pages but will receive errors on memory allocation as
                 appropriate.
  
+               Useful for applications that want to guarantee their
+               memory allocations will be available in the future
+               without having to initialize every page.
+
  The overcommit policy is set via the sysctl `vm.overcommit_memory'.
  
  The overcommit percentage is set via `vm.overcommit_ratio'.
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 7aa11a6736ebe1ae60a2ded220193251396d374a..43cfaabbde40870c307ad24f0a383b7bb580a31f 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -44,6 +44,8 @@ extern int sysctl_legacy_va_layout;
  #include <asm/pgtable.h>
  #include <asm/processor.h>
  
+extern unsigned long sysctl_user_reserve_kbytes;
+
  #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
  
  /* to align the pointer to the (next) page boundary */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 3dadde52253cfa4d385ce0ce9ff3498427b72885..6daabb72bdb5445f0985fc7aa25ff1c101e767e7 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1429,6 +1429,13 @@ static struct ctl_table vm_table[] = {
                 .extra2         = &one,
         },
  #endif
+       {
+               .procname       = "user_reserve_kbytes",
+               .data           = &sysctl_user_reserve_kbytes,
+               .maxlen         = sizeof(sysctl_user_reserve_kbytes),
+               .mode           = 0644,
+               .proc_handler   = proc_doulongvec_minmax,
+       },
         { }
  };
  
diff --git a/mm/mmap.c b/mm/mmap.c

index 081e6da8e1a46e8149b660ad7056ace8bca1158c..80a965f352513b35bef2463051ab13476eff6a42 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -84,6 +84,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
  int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;  /* heuristic overcommit */
  int sysctl_overcommit_ratio __read_mostly = 50;        /* default is 50% */
  int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
  /*
   * Make sure vm_committed_as in one cacheline and not cacheline shared with
   * other variables. It can be updated by several CPUs frequently.
@@ -122,7 +123,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
   */
  int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
  {
-       unsigned long free, allowed;
+       unsigned long free, allowed, reserve;
  
         vm_acct_memory(pages);
  
@@ -183,10 +184,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                 allowed -= allowed / 32;
         allowed += total_swap_pages;
  
-       /* Don't let a single process grow too big:
-          leave 3% of the size of this process for other processes */
-       if (mm)
-               allowed -= mm->total_vm / 32;
+       /*
+        * Don't let a single process grow so big a user can't recover
+        */
+       if (mm) {
+               reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+               allowed -= min(mm->total_vm / 32, reserve);
+       }
  
         if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                 return 0;
@@ -3094,3 +3098,24 @@ void __init mmap_init(void)
         ret = percpu_counter_init(&vm_committed_as, 0);
         VM_BUG_ON(ret);
  }
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int __meminit init_user_reserve(void)
+{
+       unsigned long free_kbytes;
+
+       free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+       sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+       return 0;
+}
+module_init(init_user_reserve)
diff --git a/mm/nommu.c b/mm/nommu.c

index 2f1c75ed468e030929354b07226dbc3df1e59ffb..58e4a0a5125fc61841bf5aa60ac5830977675249 100644 (file)
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -63,6 +63,7 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
  int sysctl_overcommit_ratio = 50; /* default is 50% */
  int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
  int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
  int heap_stack_gap = 0;
  
  atomic_long_t mmap_pages_allocated;
@@ -1897,7 +1898,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
   */
  int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
  {
-       unsigned long free, allowed;
+       unsigned long free, allowed, reserve;
  
         vm_acct_memory(pages);
  
@@ -1957,10 +1958,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                 allowed -= allowed / 32;
         allowed += total_swap_pages;
  
-       /* Don't let a single process grow too big:
-          leave 3% of the size of this process for other processes */
-       if (mm)
-               allowed -= mm->total_vm / 32;
+       /*
+        * Don't let a single process grow so big a user can't recover
+        */
+       if (mm) {
+               reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+               allowed -= min(mm->total_vm / 32, reserve);
+       }
  
         if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                 return 0;
@@ -2122,3 +2126,24 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
         up_write(&nommu_region_sem);
         return 0;
  }
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int __meminit init_user_reserve(void)
+{
+       unsigned long free_kbytes;
+
+       free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+       sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+       return 0;
+}
+module_init(init_user_reserve)
author	Andrew Shewmaker <agshew@gmail.com>
	Mon, 29 Apr 2013 22:08:10 +0000 (15:08 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 29 Apr 2013 22:54:36 +0000 (15:54 -0700)
Documentation/sysctl/vm.txt		patch \| blob \| history
Documentation/vm/overcommit-accounting		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history
mm/mmap.c		patch \| blob \| history
mm/nommu.c		patch \| blob \| history