thp: reduce usage of huge zero page's atomic counter

[karo-tx-linux.git] / include / linux / sched.h
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 98fe95fea30cb0656c4a90e60a71028e7059c536..348f51b0ec92ed02e72a2060eedb03f37cd0995f 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -448,6 +448,8 @@ static inline void io_schedule(void)
         io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
  }
  
+void __noreturn do_task_dead(void);
+
  struct nsproxy;
  struct user_namespace;
  
@@ -522,8 +524,9 @@ static inline int get_dumpable(struct mm_struct *mm)
  
  #define MMF_HAS_UPROBES                19      /* has uprobes */
  #define MMF_RECALC_UPROBES     20      /* MMF_HAS_UPROBES can be wrong */
-#define MMF_OOM_REAPED         21      /* mm has been already reaped */
-#define MMF_OOM_NOT_REAPABLE   22      /* mm couldn't be reaped */
+#define MMF_OOM_SKIP           21      /* mm is of no interest for the OOM killer */
+#define MMF_UNSTABLE           22      /* mm is unstable for copy_from_user */
+#define MMF_HUGE_ZERO_PAGE     23      /* mm has ever used the global huge zero page */
  
  #define MMF_INIT_MASK          (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
  
@@ -671,7 +674,6 @@ struct signal_struct {
         atomic_t                sigcnt;
         atomic_t                live;
         int                     nr_threads;
-       atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */
         struct list_head        thread_head;
  
         wait_queue_head_t       wait_chldexit;  /* for wait4() */
@@ -804,6 +806,8 @@ struct signal_struct {
         short oom_score_adj;            /* OOM kill score adjustment */
         short oom_score_adj_min;        /* OOM kill score adjustment min value.
                                          * Only settable by CAP_SYS_RESOURCE. */
+       struct mm_struct *oom_mm;       /* recorded mm when the thread group got
+                                        * killed by the oom killer */
  
         struct mutex cred_guard_mutex;  /* guard against foreign influences on
                                          * credential calculations
@@ -1022,7 +1026,8 @@ extern void wake_up_q(struct wake_q_head *head);
  #define SD_BALANCE_FORK                0x0008  /* Balance on fork, clone */
  #define SD_BALANCE_WAKE                0x0010  /* Balance on wakeup */
  #define SD_WAKE_AFFINE         0x0020  /* Wake task to waking CPU */
-#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share cpu power */
+#define SD_ASYM_CPUCAPACITY    0x0040  /* Groups have different max cpu capacities */
+#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share cpu capacity */
  #define SD_SHARE_POWERDOMAIN   0x0100  /* Domain members share power domain */
  #define SD_SHARE_PKG_RESOURCES 0x0200  /* Domain members share cpu pkg resources */
  #define SD_SERIALIZE           0x0400  /* Only a single load balancing instance */
@@ -1064,6 +1069,12 @@ extern int sched_domain_level_max;
  
  struct sched_group;
  
+struct sched_domain_shared {   /* referenced through sched_domain's 'shared' pointer */
+       atomic_t        ref;    /* presumably a reference count -- TODO confirm */
+       atomic_t        nr_busy_cpus;   /* NOTE(review): name suggests a busy-CPU count -- confirm */
+       int             has_idle_cores; /* NOTE(review): name suggests a flag -- confirm */
+};
+
  struct sched_domain {
         /* These fields must be setup */
         struct sched_domain *parent;    /* top domain must be null terminated */
@@ -1094,6 +1105,8 @@ struct sched_domain {
         u64 max_newidle_lb_cost;
         unsigned long next_decay_max_lb_cost;
  
+       u64 avg_scan_cost;              /* select_idle_sibling */
+
  #ifdef CONFIG_SCHEDSTATS
         /* load_balance() stats */
         unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -1132,6 +1145,7 @@ struct sched_domain {
                 void *private;          /* used during construction */
                 struct rcu_head rcu;    /* used during destruction */
         };
+       struct sched_domain_shared *shared;
  
         unsigned int span_weight;
         /*
@@ -1165,6 +1179,7 @@ typedef int (*sched_domain_flags_f)(void);
  
  struct sd_data {
         struct sched_domain **__percpu sd;
+       struct sched_domain_shared **__percpu sds;      /* __percpu like sd, sg and sgc */
         struct sched_group **__percpu sg;
         struct sched_group_capacity **__percpu sgc;
  };
@@ -1458,6 +1473,13 @@ struct tlbflush_unmap_batch {
  };
  
  struct task_struct {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       /*
+        * For reasons of header soup (see current_thread_info()), this
+        * must be the first element of task_struct.
+        */
+       struct thread_info thread_info;
+#endif
         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
         void *stack;
         atomic_t usage;
@@ -1467,6 +1489,9 @@ struct task_struct {
  #ifdef CONFIG_SMP
         struct llist_node wake_entry;
         int on_cpu;
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       unsigned int cpu;       /* current CPU */
+#endif
         unsigned int wakee_flips;
         unsigned long wakee_flip_decay_ts;
         struct task_struct *last_wakee;
@@ -1923,6 +1948,13 @@ struct task_struct {
  #ifdef CONFIG_MMU
         struct task_struct *oom_reaper_list;
  #endif
+#ifdef CONFIG_VMAP_STACK
+       struct vm_struct *stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       /* A live task holds one reference. */
+       atomic_t stack_refcount;
+#endif
  /* CPU-specific state of this task */
         struct thread_struct thread;
  /*
@@ -1939,6 +1971,18 @@ extern int arch_task_struct_size __read_mostly;
  # define arch_task_struct_size (sizeof(struct task_struct))
  #endif
  
+#ifdef CONFIG_VMAP_STACK       /* task_struct declares stack_vm_area under this option */
+static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
+{
+       return t->stack_vm_area;        /* plain field read; no reference is taken */
+}
+#else /* !CONFIG_VMAP_STACK */
+static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
+{
+       return NULL;    /* stub: the field is not declared in this configuration */
+}
+#endif /* CONFIG_VMAP_STACK */
+
  /* Future-safe accessor for struct task_struct's cpus_allowed. */
  #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
  
@@ -2568,12 +2612,14 @@ static inline bool is_idle_task(const struct task_struct *p)
         return p->pid == 0;
  }
  extern struct task_struct *curr_task(int cpu);
-extern void set_curr_task(int cpu, struct task_struct *p);
+extern void ia64_set_curr_task(int cpu, struct task_struct *p);
  
  void yield(void);
  
  union thread_union {
+#ifndef CONFIG_THREAD_INFO_IN_TASK     /* with the option set, thread_info is a task_struct member instead */
         struct thread_info thread_info;
+#endif /* !CONFIG_THREAD_INFO_IN_TASK */
         unsigned long stack[THREAD_SIZE/sizeof(long)];
  };
  
@@ -2832,6 +2878,20 @@ static inline void mmdrop(struct mm_struct *mm)
                 __mmdrop(mm);
  }
  
+static inline void mmdrop_async_fn(struct work_struct *work)   /* work handler installed by mmdrop_async() */
+{
+       struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);    /* @work is embedded in the mm */
+       __mmdrop(mm);   /* the teardown that mmdrop_async() deferred */
+}
+
+static inline void mmdrop_async(struct mm_struct *mm)  /* drop one mm_count ref; final __mmdrop() goes to a work item */
+{
+       if (unlikely(atomic_dec_and_test(&mm->mm_count))) {     /* true only when mm_count reached zero */
+               INIT_WORK(&mm->async_put_work, mmdrop_async_fn);        /* work item lives inside the mm itself */
+               schedule_work(&mm->async_put_work);     /* mmdrop_async_fn() will call __mmdrop() */
+       }
+}
+
  static inline bool mmget_not_zero(struct mm_struct *mm)
  {
         return atomic_inc_not_zero(&mm->mm_users);
@@ -3061,10 +3121,34 @@ static inline void threadgroup_change_end(struct task_struct *tsk)
         cgroup_threadgroup_change_end(tsk);
  }
  
-#ifndef __HAVE_THREAD_FUNCTIONS
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+
+static inline struct thread_info *task_thread_info(struct task_struct *task)
+{
+       return &task->thread_info;      /* member of task_struct here; the other variant casts task->stack */
+}
+
+/*
+ * When accessing the stack of a non-current task that might exit, use
+ * try_get_task_stack() instead.  task_stack_page will return a pointer
+ * that could get freed out from under you.
+ */
+static inline void *task_stack_page(const struct task_struct *task)
+{
+       return task->stack;     /* raw pointer; stack_refcount is not touched */
+}
+
+#define setup_thread_stack(new,old)    do { } while(0) /* no-op when CONFIG_THREAD_INFO_IN_TASK is set */
+
+static inline unsigned long *end_of_stack(const struct task_struct *task)
+{
+       return task->stack;     /* NOTE(review): returns the stack pointer unchanged -- confirm this is the right end for CONFIG_STACK_GROWSUP */
+}
+
+#elif !defined(__HAVE_THREAD_FUNCTIONS)
  
  #define task_thread_info(task) ((struct thread_info *)(task)->stack)
-#define task_stack_page(task)  ((task)->stack)
+#define task_stack_page(task)  ((void *)(task)->stack)
  
  static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
  {
@@ -3091,6 +3175,24 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
  }
  
  #endif
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+static inline void *try_get_task_stack(struct task_struct *tsk)
+{
+       return atomic_inc_not_zero(&tsk->stack_refcount) ?      /* take a reference unless the count is already zero */
+               task_stack_page(tsk) : NULL;    /* NULL means no reference was taken */
+}
+
+extern void put_task_stack(struct task_struct *tsk);   /* presumably releases that reference; defined elsewhere -- confirm */
+#else /* !CONFIG_THREAD_INFO_IN_TASK */
+static inline void *try_get_task_stack(struct task_struct *tsk)
+{
+       return task_stack_page(tsk);    /* no stack_refcount in this configuration */
+}
+
+static inline void put_task_stack(struct task_struct *tsk) {}  /* empty stub */
+#endif /* CONFIG_THREAD_INFO_IN_TASK */
+
  #define task_stack_end_corrupted(task) \
                 (*(end_of_stack(task)) != STACK_END_MAGIC)
  
@@ -3206,7 +3308,11 @@ static inline int signal_pending_state(long state, struct task_struct *p)
   * cond_resched_lock() will drop the spinlock before scheduling,
   * cond_resched_softirq() will enable bhs before scheduling.
   */
+#ifndef CONFIG_PREEMPT
  extern int _cond_resched(void);
+#else /* CONFIG_PREEMPT */
+static inline int _cond_resched(void) { return 0; }    /* inline stub: does nothing and returns 0 */
+#endif /* CONFIG_PREEMPT */
  
  #define cond_resched() ({                      \
         ___might_sleep(__FILE__, __LINE__, 0);  \
@@ -3236,6 +3342,15 @@ static inline void cond_resched_rcu(void)
  #endif
  }
  
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+       return p->preempt_disable_ip;   /* plain read of the task's recorded value */
+#else /* !CONFIG_DEBUG_PREEMPT */
+       return 0;       /* the field is only read under CONFIG_DEBUG_PREEMPT */
+#endif /* CONFIG_DEBUG_PREEMPT */
+}
+
  /*
   * Does a critical section need to be broken due to another
   * task waiting?: (technically does not depend on CONFIG_PREEMPT,
@@ -3364,7 +3479,11 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
  
  static inline unsigned int task_cpu(const struct task_struct *p)
  {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       return p->cpu;  /* task_struct gains its own cpu field under this option */
+#else /* !CONFIG_THREAD_INFO_IN_TASK */
         return task_thread_info(p)->cpu;
+#endif /* CONFIG_THREAD_INFO_IN_TASK */
  }
  
  static inline int task_node(const struct task_struct *p)