Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Mon, 3 Jul 2017 20:08:04 +0000 (13:08 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 3 Jul 2017 20:08:04 +0000 (13:08 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 3 Jul 2017 20:08:04 +0000 (13:08 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 3 Jul 2017 20:08:04 +0000 (13:08 -0700)
diff --combined arch/x86/events/core.c

index e6f5e4b163ac693dd81e48fda94338866445a9d5,d3990462582c5d69ffa26a3670236fb4227cfc36..628b8c556aabae93ec829445286b2bed844a0f38
--- 1/arch/x86/events/core.c
--- 2/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@@ -1750,8 -1750,6 +1750,8 @@@ ssize_t x86_event_sysfs_show(char *page
         return ret;
   }
   
+ +static struct attribute_group x86_pmu_attr_group;
+ +
   static int __init init_hw_perf_events(void)
   {
         struct x86_pmu_quirk *quirk;
@@@ -1815,14 -1813,6 +1815,14 @@@
                         x86_pmu_events_group.attrs = tmp;
         }
   
+ +      if (x86_pmu.attrs) {
+ +              struct attribute **tmp;
+ +
+ +              tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs);
+ +              if (!WARN_ON(!tmp))
+ +                      x86_pmu_attr_group.attrs = tmp;
+ +      }
+ +
         pr_info("... version:                %d\n",     x86_pmu.version);
         pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
         pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
@@@ -2265,7 -2255,7 +2265,7 @@@ static struct pmu pmu = 
   void arch_perf_update_userpage(struct perf_event *event,
                                struct perf_event_mmap_page *userpg, u64 now)
   {
-       struct cyc2ns_data *data;
+       struct cyc2ns_data data;
         u64 offset;
   
         userpg->cap_user_time = 0;
@@@ -2277,17 -2267,17 +2277,17 @@@
         if (!using_native_sched_clock() || !sched_clock_stable())
                 return;
   
-       data = cyc2ns_read_begin();
+       cyc2ns_read_begin(&data);
   
-       offset = data->cyc2ns_offset + __sched_clock_offset;
+       offset = data.cyc2ns_offset + __sched_clock_offset;
   
         /*
          * Internal timekeeping for enabled/running/stopped times
          * is always in the local_clock domain.
          */
         userpg->cap_user_time = 1;
-       userpg->time_mult = data->cyc2ns_mul;
-       userpg->time_shift = data->cyc2ns_shift;
+       userpg->time_mult = data.cyc2ns_mul;
+       userpg->time_shift = data.cyc2ns_shift;
         userpg->time_offset = offset - now;
   
         /*
@@@ -2299,7 -2289,7 +2299,7 @@@
                 userpg->time_zero = offset;
         }
   
-       cyc2ns_read_end(data);
+       cyc2ns_read_end();
   }
   
   void
diff --combined block/blk-mq.c

index 05dfa3f270ae6a9f3cbb767db61adfb393080bd6,07b0a03c46e6ac0b901d38c57529b953c325b6d0..ced2b000ca028c91a79ee959408f3f6ee84c9585
--- 1/block/blk-mq.c
--- 2/block/blk-mq.c
+++ b/block/blk-mq.c
@@@ -42,6 -42,7 +42,6 @@@ static LIST_HEAD(all_q_list)
   
   static void blk_mq_poll_stats_start(struct request_queue *q);
   static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
- -static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync);
   
   static int blk_mq_poll_stats_bkt(const struct request *rq)
   {
@@@ -153,28 -154,13 +153,28 @@@ void blk_mq_unfreeze_queue(struct reque
   }
   EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
   
+ +/*
+ + * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
+ + * mpt3sas driver such that this function can be removed.
+ + */
+ +void blk_mq_quiesce_queue_nowait(struct request_queue *q)
+ +{
+ +      unsigned long flags;
+ +
+ +      spin_lock_irqsave(q->queue_lock, flags);
+ +      queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+ +      spin_unlock_irqrestore(q->queue_lock, flags);
+ +}
+ +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
+ +
   /**
- - * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
+ + * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
    * @q: request queue.
    *
    * Note: this function does not prevent that the struct request end_io()
- - * callback function is invoked. Additionally, it is not prevented that
- - * new queue_rq() calls occur unless the queue has been stopped first.
+ + * callback function is invoked. Once this function has returned, no
+ + * dispatch can happen until the queue is unquiesced via
+ + * blk_mq_unquiesce_queue().
    */
   void blk_mq_quiesce_queue(struct request_queue *q)
   {
@@@ -182,11 -168,11 +182,11 @@@
         unsigned int i;
         bool rcu = false;
   
- -      __blk_mq_stop_hw_queues(q, true);
+ +      blk_mq_quiesce_queue_nowait(q);
   
         queue_for_each_hw_ctx(q, hctx, i) {
                 if (hctx->flags & BLK_MQ_F_BLOCKING)
- -                      synchronize_srcu(&hctx->queue_rq_srcu);
+ +                      synchronize_srcu(hctx->queue_rq_srcu);
                 else
                         rcu = true;
         }
@@@ -195,26 -181,6 +195,26 @@@
   }
   EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
   
+ +/*
+ + * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
+ + * @q: request queue.
+ + *
+ + * This function restores the queue to the state it was in before it
+ + * was quiesced by blk_mq_quiesce_queue().
+ + */
+ +void blk_mq_unquiesce_queue(struct request_queue *q)
+ +{
+ +      unsigned long flags;
+ +
+ +      spin_lock_irqsave(q->queue_lock, flags);
+ +      queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+ +      spin_unlock_irqrestore(q->queue_lock, flags);
+ +
+ +      /* dispatch requests which are inserted during quiescing */
+ +      blk_mq_run_hw_queues(q, true);
+ +}
+ +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
+ +
   void blk_mq_wake_waiters(struct request_queue *q)
   {
         struct blk_mq_hw_ctx *hctx;
@@@ -238,33 -204,15 +238,33 @@@ bool blk_mq_can_queue(struct blk_mq_hw_
   }
   EXPORT_SYMBOL(blk_mq_can_queue);
   
- -void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
- -                      struct request *rq, unsigned int op)
+ +static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
+ +              unsigned int tag, unsigned int op)
   {
+ +      struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ +      struct request *rq = tags->static_rqs[tag];
+ +
+ +      rq->rq_flags = 0;
+ +
+ +      if (data->flags & BLK_MQ_REQ_INTERNAL) {
+ +              rq->tag = -1;
+ +              rq->internal_tag = tag;
+ +      } else {
+ +              if (blk_mq_tag_busy(data->hctx)) {
+ +                      rq->rq_flags = RQF_MQ_INFLIGHT;
+ +                      atomic_inc(&data->hctx->nr_active);
+ +              }
+ +              rq->tag = tag;
+ +              rq->internal_tag = -1;
+ +              data->hctx->tags->rqs[rq->tag] = rq;
+ +      }
+ +
         INIT_LIST_HEAD(&rq->queuelist);
         /* csd/requeue_work/fifo_time is initialized before use */
- -      rq->q = q;
- -      rq->mq_ctx = ctx;
+ +      rq->q = data->q;
+ +      rq->mq_ctx = data->ctx;
         rq->cmd_flags = op;
- -      if (blk_queue_io_stat(q))
+ +      if (blk_queue_io_stat(data->q))
                 rq->rq_flags |= RQF_IO_STAT;
         /* do not touch atomic flags, it needs atomic ops against the timer */
         rq->cpu = -1;
@@@ -293,60 -241,44 +293,60 @@@
         rq->end_io_data = NULL;
         rq->next_rq = NULL;
   
- -      ctx->rq_dispatched[op_is_sync(op)]++;
+ +      data->ctx->rq_dispatched[op_is_sync(op)]++;
+ +      return rq;
   }
- -EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
   
- -struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
- -                                     unsigned int op)
+ +static struct request *blk_mq_get_request(struct request_queue *q,
+ +              struct bio *bio, unsigned int op,
+ +              struct blk_mq_alloc_data *data)
   {
+ +      struct elevator_queue *e = q->elevator;
         struct request *rq;
         unsigned int tag;
   
- -      tag = blk_mq_get_tag(data);
- -      if (tag != BLK_MQ_TAG_FAIL) {
- -              struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ +      blk_queue_enter_live(q);
+ +      data->q = q;
+ +      if (likely(!data->ctx))
+ +              data->ctx = blk_mq_get_ctx(q);
+ +      if (likely(!data->hctx))
+ +              data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
+ +      if (op & REQ_NOWAIT)
+ +              data->flags |= BLK_MQ_REQ_NOWAIT;
   
- -              rq = tags->static_rqs[tag];
+ +      if (e) {
+ +              data->flags |= BLK_MQ_REQ_INTERNAL;
   
- -              if (data->flags & BLK_MQ_REQ_INTERNAL) {
- -                      rq->tag = -1;
- -                      rq->internal_tag = tag;
- -              } else {
- -                      if (blk_mq_tag_busy(data->hctx)) {
- -                              rq->rq_flags = RQF_MQ_INFLIGHT;
- -                              atomic_inc(&data->hctx->nr_active);
- -                      }
- -                      rq->tag = tag;
- -                      rq->internal_tag = -1;
- -                      data->hctx->tags->rqs[rq->tag] = rq;
- -              }
+ +              /*
+ +               * Flush requests are special and go directly to the
+ +               * dispatch list.
+ +               */
+ +              if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
+ +                      e->type->ops.mq.limit_depth(op, data);
+ +      }
   
- -              blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
- -              return rq;
+ +      tag = blk_mq_get_tag(data);
+ +      if (tag == BLK_MQ_TAG_FAIL) {
+ +              blk_queue_exit(q);
+ +              return NULL;
         }
   
- -      return NULL;
+ +      rq = blk_mq_rq_ctx_init(data, tag, op);
+ +      if (!op_is_flush(op)) {
+ +              rq->elv.icq = NULL;
+ +              if (e && e->type->ops.mq.prepare_request) {
+ +                      if (e->type->icq_cache && rq_ioc(bio))
+ +                              blk_mq_sched_assign_ioc(rq, bio);
+ +
+ +                      e->type->ops.mq.prepare_request(rq, bio);
+ +                      rq->rq_flags |= RQF_ELVPRIV;
+ +              }
+ +      }
+ +      data->hctx->queued++;
+ +      return rq;
   }
- -EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
   
- -struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+ +struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
                 unsigned int flags)
   {
         struct blk_mq_alloc_data alloc_data = { .flags = flags };
@@@ -357,7 -289,7 +357,7 @@@
         if (ret)
                 return ERR_PTR(ret);
   
- -      rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
+ +      rq = blk_mq_get_request(q, NULL, op, &alloc_data);
   
         blk_mq_put_ctx(alloc_data.ctx);
         blk_queue_exit(q);
@@@ -372,8 -304,8 +372,8 @@@
   }
   EXPORT_SYMBOL(blk_mq_alloc_request);
   
- -struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
- -              unsigned int flags, unsigned int hctx_idx)
+ +struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
+ +              unsigned int op, unsigned int flags, unsigned int hctx_idx)
   {
         struct blk_mq_alloc_data alloc_data = { .flags = flags };
         struct request *rq;
@@@ -408,7 -340,7 +408,7 @@@
         cpu = cpumask_first(alloc_data.hctx->cpumask);
         alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
   
- -      rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
+ +      rq = blk_mq_get_request(q, NULL, op, &alloc_data);
   
         blk_queue_exit(q);
   
@@@ -419,28 -351,17 +419,28 @@@
   }
   EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
   
- -void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
- -                           struct request *rq)
+ +void blk_mq_free_request(struct request *rq)
   {
- -      const int sched_tag = rq->internal_tag;
         struct request_queue *q = rq->q;
+ +      struct elevator_queue *e = q->elevator;
+ +      struct blk_mq_ctx *ctx = rq->mq_ctx;
+ +      struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+ +      const int sched_tag = rq->internal_tag;
   
+ +      if (rq->rq_flags & RQF_ELVPRIV) {
+ +              if (e && e->type->ops.mq.finish_request)
+ +                      e->type->ops.mq.finish_request(rq);
+ +              if (rq->elv.icq) {
+ +                      put_io_context(rq->elv.icq->ioc);
+ +                      rq->elv.icq = NULL;
+ +              }
+ +      }
+ +
+ +      ctx->rq_completed[rq_is_sync(rq)]++;
         if (rq->rq_flags & RQF_MQ_INFLIGHT)
                 atomic_dec(&hctx->nr_active);
   
         wbt_done(q->rq_wb, &rq->issue_stat);
- -      rq->rq_flags = 0;
   
         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
         clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
@@@ -451,9 -372,29 +451,9 @@@
         blk_mq_sched_restart(hctx);
         blk_queue_exit(q);
   }
- -
- -static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
- -                                   struct request *rq)
- -{
- -      struct blk_mq_ctx *ctx = rq->mq_ctx;
- -
- -      ctx->rq_completed[rq_is_sync(rq)]++;
- -      __blk_mq_finish_request(hctx, ctx, rq);
- -}
- -
- -void blk_mq_finish_request(struct request *rq)
- -{
- -      blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
- -}
- -EXPORT_SYMBOL_GPL(blk_mq_finish_request);
- -
- -void blk_mq_free_request(struct request *rq)
- -{
- -      blk_mq_sched_put_request(rq);
- -}
   EXPORT_SYMBOL_GPL(blk_mq_free_request);
   
- -inline void __blk_mq_end_request(struct request *rq, int error)
+ +inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
   {
         blk_account_io_done(rq);
   
@@@ -468,7 -409,7 +468,7 @@@
   }
   EXPORT_SYMBOL(__blk_mq_end_request);
   
- -void blk_mq_end_request(struct request *rq, int error)
+ +void blk_mq_end_request(struct request *rq, blk_status_t error)
   {
         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
                 BUG();
@@@ -812,6 -753,50 +812,6 @@@ static void blk_mq_timeout_work(struct 
         blk_queue_exit(q);
   }
   
- -/*
- - * Reverse check our software queue for entries that we could potentially
- - * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- - * too much time checking for merges.
- - */
- -static bool blk_mq_attempt_merge(struct request_queue *q,
- -                               struct blk_mq_ctx *ctx, struct bio *bio)
- -{
- -      struct request *rq;
- -      int checked = 8;
- -
- -      list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
- -              bool merged = false;
- -
- -              if (!checked--)
- -                      break;
- -
- -              if (!blk_rq_merge_ok(rq, bio))
- -                      continue;
- -
- -              switch (blk_try_merge(rq, bio)) {
- -              case ELEVATOR_BACK_MERGE:
- -                      if (blk_mq_sched_allow_merge(q, rq, bio))
- -                              merged = bio_attempt_back_merge(q, rq, bio);
- -                      break;
- -              case ELEVATOR_FRONT_MERGE:
- -                      if (blk_mq_sched_allow_merge(q, rq, bio))
- -                              merged = bio_attempt_front_merge(q, rq, bio);
- -                      break;
- -              case ELEVATOR_DISCARD_MERGE:
- -                      merged = bio_attempt_discard_merge(q, rq, bio);
- -                      break;
- -              default:
- -                      continue;
- -              }
- -
- -              if (merged)
- -                      ctx->rq_merged++;
- -              return merged;
- -      }
- -
- -      return false;
- -}
- -
   struct flush_busy_ctx_data {
         struct blk_mq_hw_ctx *hctx;
         struct list_head *list;
@@@ -941,14 -926,14 +941,14 @@@ static bool reorder_tags_to_front(struc
         return first != NULL;
   }
   
- static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
                                 void *key)
   {
         struct blk_mq_hw_ctx *hctx;
   
         hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
   
-       list_del(&wait->task_list);
+       list_del(&wait->entry);
         clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
         blk_mq_run_hw_queue(hctx, true);
         return 1;
@@@ -983,7 -968,7 +983,7 @@@ bool blk_mq_dispatch_rq_list(struct req
   {
         struct blk_mq_hw_ctx *hctx;
         struct request *rq;
- -      int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
+ +      int errors, queued;
   
         if (list_empty(list))
                 return false;
@@@ -994,7 -979,6 +994,7 @@@
         errors = queued = 0;
         do {
                 struct blk_mq_queue_data bd;
+ +              blk_status_t ret;
   
                 rq = list_first_entry(list, struct request, queuelist);
                 if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
@@@ -1035,20 -1019,25 +1035,20 @@@
                 }
   
                 ret = q->mq_ops->queue_rq(hctx, &bd);
- -              switch (ret) {
- -              case BLK_MQ_RQ_QUEUE_OK:
- -                      queued++;
- -                      break;
- -              case BLK_MQ_RQ_QUEUE_BUSY:
+ +              if (ret == BLK_STS_RESOURCE) {
                         blk_mq_put_driver_tag_hctx(hctx, rq);
                         list_add(&rq->queuelist, list);
                         __blk_mq_requeue_request(rq);
                         break;
- -              default:
- -                      pr_err("blk-mq: bad return on queue: %d\n", ret);
- -              case BLK_MQ_RQ_QUEUE_ERROR:
+ +              }
+ +
+ +              if (unlikely(ret != BLK_STS_OK)) {
                         errors++;
- -                      blk_mq_end_request(rq, -EIO);
- -                      break;
+ +                      blk_mq_end_request(rq, BLK_STS_IOERR);
+ +                      continue;
                 }
   
- -              if (ret == BLK_MQ_RQ_QUEUE_BUSY)
- -                      break;
+ +              queued++;
         } while (!list_empty(list));
   
         hctx->dispatched[queued_to_index(queued)]++;
@@@ -1086,7 -1075,7 +1086,7 @@@
                  * - blk_mq_run_hw_queue() checks whether or not a queue has
                  *   been stopped before rerunning a queue.
                  * - Some but not all block drivers stop a queue before
- -               *   returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq
+ +               *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
                  *   and dm-rq.
                  */
                 if (!blk_mq_sched_needs_restart(hctx) &&
@@@ -1111,9 -1100,9 +1111,9 @@@ static void __blk_mq_run_hw_queue(struc
         } else {
                 might_sleep();
   
- -              srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+ +              srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
                 blk_mq_sched_dispatch_requests(hctx);
- -              srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+ +              srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
         }
   }
   
@@@ -1145,10 -1134,8 +1145,10 @@@ static int blk_mq_hctx_next_cpu(struct 
   static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
                                         unsigned long msecs)
   {
- -      if (unlikely(blk_mq_hctx_stopped(hctx) ||
- -                   !blk_mq_hw_queue_mapped(hctx)))
+ +      if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
+ +              return;
+ +
+ +      if (unlikely(blk_mq_hctx_stopped(hctx)))
                 return;
   
         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
@@@ -1214,39 -1201,34 +1214,39 @@@ bool blk_mq_queue_stopped(struct reques
   }
   EXPORT_SYMBOL(blk_mq_queue_stopped);
   
- -static void __blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx, bool sync)
+ +/*
+ + * This function is often used by a driver to pause .queue_rq() when
+ + * there are not enough resources or some conditions aren't satisfied, and
+ + * BLK_STS_RESOURCE is usually returned.
+ + *
+ + * We do not guarantee that dispatch can be drained or blocked
+ + * after blk_mq_stop_hw_queue() returns. Please use
+ + * blk_mq_quiesce_queue() for that requirement.
+ + */
+ +void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
   {
- -      if (sync)
- -              cancel_delayed_work_sync(&hctx->run_work);
- -      else
- -              cancel_delayed_work(&hctx->run_work);
+ +      cancel_delayed_work(&hctx->run_work);
   
         set_bit(BLK_MQ_S_STOPPED, &hctx->state);
   }
- -
- -void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
- -{
- -      __blk_mq_stop_hw_queue(hctx, false);
- -}
   EXPORT_SYMBOL(blk_mq_stop_hw_queue);
   
- -static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync)
+ +/*
+ + * This function is often used by a driver to pause .queue_rq() when
+ + * there are not enough resources or some conditions aren't satisfied, and
+ + * BLK_STS_RESOURCE is usually returned.
+ + *
+ + * We do not guarantee that dispatch can be drained or blocked
+ + * after blk_mq_stop_hw_queues() returns. Please use
+ + * blk_mq_quiesce_queue() for that requirement.
+ + */
+ +void blk_mq_stop_hw_queues(struct request_queue *q)
   {
         struct blk_mq_hw_ctx *hctx;
         int i;
   
         queue_for_each_hw_ctx(q, hctx, i)
- -              __blk_mq_stop_hw_queue(hctx, sync);
- -}
- -
- -void blk_mq_stop_hw_queues(struct request_queue *q)
- -{
- -      __blk_mq_stop_hw_queues(q, false);
+ +              blk_mq_stop_hw_queue(hctx);
   }
   EXPORT_SYMBOL(blk_mq_stop_hw_queues);
   
@@@ -1313,7 -1295,7 +1313,7 @@@ static void blk_mq_run_work_fn(struct w
   
   void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
   {
- -      if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
+ +      if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
                 return;
   
         /*
@@@ -1335,8 -1317,6 +1335,8 @@@ static inline void __blk_mq_insert_req_
   {
         struct blk_mq_ctx *ctx = rq->mq_ctx;
   
+ +      lockdep_assert_held(&ctx->lock);
+ +
         trace_block_rq_insert(hctx->queue, rq);
   
         if (at_head)
@@@ -1350,8 -1330,6 +1350,8 @@@ void __blk_mq_insert_request(struct blk
   {
         struct blk_mq_ctx *ctx = rq->mq_ctx;
   
+ +      lockdep_assert_held(&ctx->lock);
+ +
         __blk_mq_insert_req_list(hctx, rq, at_head);
         blk_mq_hctx_mark_pending(hctx, ctx);
   }
@@@ -1449,13 -1427,30 +1449,13 @@@ static inline bool hctx_allow_merges(st
                 !blk_queue_nomerges(hctx->queue);
   }
   
- -static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
- -                                       struct blk_mq_ctx *ctx,
- -                                       struct request *rq, struct bio *bio)
+ +static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
+ +                                 struct blk_mq_ctx *ctx,
+ +                                 struct request *rq)
   {
- -      if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) {
- -              blk_mq_bio_to_request(rq, bio);
- -              spin_lock(&ctx->lock);
- -insert_rq:
- -              __blk_mq_insert_request(hctx, rq, false);
- -              spin_unlock(&ctx->lock);
- -              return false;
- -      } else {
- -              struct request_queue *q = hctx->queue;
- -
- -              spin_lock(&ctx->lock);
- -              if (!blk_mq_attempt_merge(q, ctx, bio)) {
- -                      blk_mq_bio_to_request(rq, bio);
- -                      goto insert_rq;
- -              }
- -
- -              spin_unlock(&ctx->lock);
- -              __blk_mq_finish_request(hctx, ctx, rq);
- -              return true;
- -      }
+ +      spin_lock(&ctx->lock);
+ +      __blk_mq_insert_request(hctx, rq, false);
+ +      spin_unlock(&ctx->lock);
   }
   
   static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
@@@ -1476,11 -1471,10 +1476,11 @@@ static void __blk_mq_try_issue_directly
                 .last = true,
         };
         blk_qc_t new_cookie;
- -      int ret;
+ +      blk_status_t ret;
         bool run_queue = true;
   
- -      if (blk_mq_hctx_stopped(hctx)) {
+ +      /* RCU or SRCU read lock is needed before checking quiesced flag */
+ +      if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
                 run_queue = false;
                 goto insert;
         }
@@@ -1499,19 -1493,18 +1499,19 @@@
          * would have done
          */
         ret = q->mq_ops->queue_rq(hctx, &bd);
- -      if (ret == BLK_MQ_RQ_QUEUE_OK) {
+ +      switch (ret) {
+ +      case BLK_STS_OK:
                 *cookie = new_cookie;
                 return;
- -      }
- -
- -      if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+ +      case BLK_STS_RESOURCE:
+ +              __blk_mq_requeue_request(rq);
+ +              goto insert;
+ +      default:
                 *cookie = BLK_QC_T_NONE;
- -              blk_mq_end_request(rq, -EIO);
+ +              blk_mq_end_request(rq, ret);
                 return;
         }
   
- -      __blk_mq_requeue_request(rq);
   insert:
         blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
   }
@@@ -1528,9 -1521,9 +1528,9 @@@ static void blk_mq_try_issue_directly(s
   
                 might_sleep();
   
- -              srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+ +              srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
                 __blk_mq_try_issue_directly(hctx, rq, cookie, true);
- -              srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+ +              srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
         }
   }
   
@@@ -1548,7 -1541,7 +1548,7 @@@ static blk_qc_t blk_mq_make_request(str
   
         blk_queue_bounce(q, &bio);
   
- -      blk_queue_split(q, &bio, q->bio_split);
+ +      blk_queue_split(q, &bio);
   
         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
                 bio_io_error(bio);
@@@ -1566,11 -1559,9 +1566,11 @@@
   
         trace_block_getrq(q, bio, bio->bi_opf);
   
- -      rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
+ +      rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
         if (unlikely(!rq)) {
                 __wbt_done(q->rq_wb, wb_acct);
+ +              if (bio->bi_opf & REQ_NOWAIT)
+ +                      bio_wouldblock_error(bio);
                 return BLK_QC_T_NONE;
         }
   
@@@ -1648,12 -1639,11 +1648,12 @@@
                 blk_mq_put_ctx(data.ctx);
                 blk_mq_bio_to_request(rq, bio);
                 blk_mq_sched_insert_request(rq, false, true, true, true);
- -      } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+ +      } else {
                 blk_mq_put_ctx(data.ctx);
+ +              blk_mq_bio_to_request(rq, bio);
+ +              blk_mq_queue_io(data.hctx, data.ctx, rq);
                 blk_mq_run_hw_queue(data.hctx, true);
- -      } else
- -              blk_mq_put_ctx(data.ctx);
+ +      }
   
         return cookie;
   }
@@@ -1876,7 -1866,7 +1876,7 @@@ static void blk_mq_exit_hctx(struct req
                 set->ops->exit_hctx(hctx, hctx_idx);
   
         if (hctx->flags & BLK_MQ_F_BLOCKING)
- -              cleanup_srcu_struct(&hctx->queue_rq_srcu);
+ +              cleanup_srcu_struct(hctx->queue_rq_srcu);
   
         blk_mq_remove_cpuhp(hctx);
         blk_free_flush_queue(hctx->fq);
@@@ -1910,6 -1900,7 +1910,6 @@@ static int blk_mq_init_hctx(struct requ
         spin_lock_init(&hctx->lock);
         INIT_LIST_HEAD(&hctx->dispatch);
         hctx->queue = q;
- -      hctx->queue_num = hctx_idx;
         hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
   
         cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
@@@ -1948,7 -1939,7 +1948,7 @@@
                 goto free_fq;
   
         if (hctx->flags & BLK_MQ_F_BLOCKING)
- -              init_srcu_struct(&hctx->queue_rq_srcu);
+ +              init_srcu_struct(hctx->queue_rq_srcu);
   
         blk_mq_debugfs_register_hctx(q, hctx);
   
@@@ -2233,20 -2224,6 +2233,20 @@@ struct request_queue *blk_mq_init_queue
   }
   EXPORT_SYMBOL(blk_mq_init_queue);
   
+ +static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
+ +{
+ +      int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
+ +
+ +      BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
+ +                         __alignof__(struct blk_mq_hw_ctx)) !=
+ +                   sizeof(struct blk_mq_hw_ctx));
+ +
+ +      if (tag_set->flags & BLK_MQ_F_BLOCKING)
+ +              hw_ctx_size += sizeof(struct srcu_struct);
+ +
+ +      return hw_ctx_size;
+ +}
+ +
   static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                                                 struct request_queue *q)
   {
@@@ -2261,7 -2238,7 +2261,7 @@@
                         continue;
   
                 node = blk_mq_hw_queue_to_node(q->mq_map, i);
- -              hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
+ +              hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
                                         GFP_KERNEL, node);
                 if (!hctxs[i])
                         break;
diff --combined block/kyber-iosched.c

index a9f6fd3fab8e5d154933c11c5491098c0cce3aa2,9bf1484365b21e0dcd2527a90632f09a6c1eb803..f58cab82105ba66e859b08872c6f588512315746
--- 1/block/kyber-iosched.c
--- 2/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@@ -99,7 -99,7 +99,7 @@@ struct kyber_hctx_data 
         struct list_head rqs[KYBER_NUM_DOMAINS];
         unsigned int cur_domain;
         unsigned int batching;
-       wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
+       wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
         atomic_t wait_index[KYBER_NUM_DOMAINS];
   };
   
@@@ -385,7 -385,7 +385,7 @@@ static int kyber_init_hctx(struct blk_m
   
         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                 INIT_LIST_HEAD(&khd->rqs[i]);
-               INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
+               INIT_LIST_HEAD(&khd->domain_wait[i].entry);
                 atomic_set(&khd->wait_index[i], 0);
         }
   
@@@ -426,29 -426,33 +426,29 @@@ static void rq_clear_domain_token(struc
         }
   }
   
- -static struct request *kyber_get_request(struct request_queue *q,
- -                                       unsigned int op,
- -                                       struct blk_mq_alloc_data *data)
+ +static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
   {
- -      struct kyber_queue_data *kqd = q->elevator->elevator_data;
- -      struct request *rq;
- -
         /*
          * We use the scheduler tags as per-hardware queue queueing tokens.
          * Async requests can be limited at this stage.
          */
- -      if (!op_is_sync(op))
+ +      if (!op_is_sync(op)) {
+ +              struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
+ +
                 data->shallow_depth = kqd->async_depth;
+ +      }
+ +}
   
- -      rq = __blk_mq_alloc_request(data, op);
- -      if (rq)
- -              rq_set_domain_token(rq, -1);
- -      return rq;
+ +static void kyber_prepare_request(struct request *rq, struct bio *bio)
+ +{
+ +      rq_set_domain_token(rq, -1);
   }
   
- -static void kyber_put_request(struct request *rq)
+ +static void kyber_finish_request(struct request *rq)
   {
- -      struct request_queue *q = rq->q;
- -      struct kyber_queue_data *kqd = q->elevator->elevator_data;
+ +      struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
   
         rq_clear_domain_token(kqd, rq);
- -      blk_mq_finish_request(rq);
   }
   
   static void kyber_completed_request(struct request *rq)
@@@ -503,12 -507,12 +503,12 @@@ static void kyber_flush_busy_ctxs(struc
         }
   }
   
- static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
+ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
                              void *key)
   {
         struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
   
-       list_del_init(&wait->task_list);
+       list_del_init(&wait->entry);
         blk_mq_run_hw_queue(hctx, true);
         return 1;
   }
@@@ -519,7 -523,7 +519,7 @@@ static int kyber_get_domain_token(struc
   {
         unsigned int sched_domain = khd->cur_domain;
         struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
-       wait_queue_t *wait = &khd->domain_wait[sched_domain];
+       wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
         struct sbq_wait_state *ws;
         int nr;
   
@@@ -532,7 -536,7 +532,7 @@@
          * run when one becomes available. Note that this is serialized on
          * khd->lock, but we still need to be careful about the waker.
          */
-       if (list_empty_careful(&wait->task_list)) {
+       if (list_empty_careful(&wait->entry)) {
                 init_waitqueue_func_entry(wait, kyber_domain_wake);
                 wait->private = hctx;
                 ws = sbq_wait_ptr(domain_tokens,
@@@ -730,9 -734,9 +730,9 @@@ static int kyber_##name##_waiting_show(
   {                                                                     \
         struct blk_mq_hw_ctx *hctx = data;                              \
         struct kyber_hctx_data *khd = hctx->sched_data;                 \
-       wait_queue_t *wait = &khd->domain_wait[domain];                 \
+       wait_queue_entry_t *wait = &khd->domain_wait[domain];           \
                                                                         \
-       seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list));   \
+       seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));       \
         return 0;                                                       \
   }
   KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
@@@ -811,9 -815,8 +811,9 @@@ static struct elevator_type kyber_sche
                 .exit_sched = kyber_exit_sched,
                 .init_hctx = kyber_init_hctx,
                 .exit_hctx = kyber_exit_hctx,
- -              .get_request = kyber_get_request,
- -              .put_request = kyber_put_request,
+ +              .limit_depth = kyber_limit_depth,
+ +              .prepare_request = kyber_prepare_request,
+ +              .finish_request = kyber_finish_request,
                 .completed_request = kyber_completed_request,
                 .dispatch_request = kyber_dispatch_request,
                 .has_work = kyber_has_work,
diff --combined fs/inode.c

index f0e5fc77e6a4c0b6b571554dd897a5838be0de00,70761d6cafcd7ef97e172b2e84fb9c9412c809a6..ab3b9a795c0b73130968e04e388d8160017106fd
--- 1/fs/inode.c
--- 2/fs/inode.c
+++ b/fs/inode.c
@@@ -146,7 -146,6 +146,7 @@@ int inode_init_always(struct super_bloc
         i_gid_write(inode, 0);
         atomic_set(&inode->i_writecount, 0);
         inode->i_size = 0;
+ +      inode->i_write_hint = WRITE_LIFE_NOT_SET;
         inode->i_blocks = 0;
         inode->i_bytes = 0;
         inode->i_generation = 0;
@@@ -1892,11 -1891,11 +1892,11 @@@ static void __wait_on_freeing_inode(str
         wait_queue_head_t *wq;
         DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
         wq = bit_waitqueue(&inode->i_state, __I_NEW);
-       prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+       prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
         spin_unlock(&inode->i_lock);
         spin_unlock(&inode_hash_lock);
         schedule();
-       finish_wait(wq, &wait.wait);
+       finish_wait(wq, &wait.wq_entry);
         spin_lock(&inode_hash_lock);
   }
   
@@@ -2039,11 -2038,11 +2039,11 @@@ static void __inode_dio_wait(struct ino
         DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
   
         do {
-               prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
+               prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
                 if (atomic_read(&inode->i_dio_count))
                         schedule();
         } while (atomic_read(&inode->i_dio_count));
-       finish_wait(wq, &q.wait);
+       finish_wait(wq, &q.wq_entry);
   }
   
   /**
diff --combined fs/nfs/nfs4proc.c

index dbfa18900e25a38a0998a2d429644a860c559ac2,be5a8f84e5bb725d01bed5f2e7d08b68e709988a..98b0b662af0995918913a0eb5a4693e695a1f1a3
--- 1/fs/nfs/nfs4proc.c
--- 2/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@@ -2589,8 -2589,7 +2589,8 @@@ static inline void nfs4_exclusive_attrs
   
         /* Except MODE, it seems harmless of setting twice. */
         if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE &&
- -              attrset[1] & FATTR4_WORD1_MODE)
+ +              (attrset[1] & FATTR4_WORD1_MODE ||
+ +               attrset[2] & FATTR4_WORD2_MODE_UMASK))
                 sattr->ia_valid &= ~ATTR_MODE;
   
         if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
@@@ -6373,7 -6372,7 +6373,7 @@@ struct nfs4_lock_waiter 
   };
   
   static int
- nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key)
+ nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key)
   {
         int ret;
         struct cb_notify_lock_args *cbnl = key;
@@@ -6416,7 -6415,7 +6416,7 @@@ nfs4_retry_setlk(struct nfs4_state *sta
                                            .inode = state->inode,
                                            .owner = &owner,
                                            .notified = false };
-       wait_queue_t wait;
+       wait_queue_entry_t wait;
   
         /* Don't bother with waitqueue if we don't expect a callback */
         if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
@@@ -8417,7 -8416,6 +8417,7 @@@ static void nfs4_layoutget_release(voi
         size_t max_pages = max_response_pages(server);
   
         dprintk("--> %s\n", __func__);
+ +      nfs4_sequence_free_slot(&lgp->res.seq_res);
         nfs4_free_pages(lgp->args.layout.pages, max_pages);
         pnfs_put_layout_hdr(NFS_I(inode)->layout);
         put_nfs_open_context(lgp->args.ctx);
@@@ -8492,6 -8490,7 +8492,6 @@@ nfs4_proc_layoutget(struct nfs4_layoutg
         /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
         if (status == 0 && lgp->res.layoutp->len)
                 lseg = pnfs_layout_process(lgp);
- -      nfs4_sequence_free_slot(&lgp->res.seq_res);
         rpc_put_task(task);
         dprintk("<-- %s status=%d\n", __func__, status);
         if (status)
diff --combined include/linux/blk-mq.h

index 23d32ff0b4629f6ae441ca6f697ee17f490fd9f7,95ba83806c5d98cafae113136a24144bdae036c6..14542308d25bd90dd2c1ac5d7a6c283a86bbd0ba
--- 1/include/linux/blk-mq.h
--- 2/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@@ -33,12 -33,14 +33,12 @@@ struct blk_mq_hw_ctx 
         struct blk_mq_ctx       **ctxs;
         unsigned int            nr_ctx;
   
-       wait_queue_t            dispatch_wait;
+       wait_queue_entry_t              dispatch_wait;
         atomic_t                wait_index;
   
         struct blk_mq_tags      *tags;
         struct blk_mq_tags      *sched_tags;
   
- -      struct srcu_struct      queue_rq_srcu;
- -
         unsigned long           queued;
         unsigned long           run;
   #define BLK_MQ_MAX_DISPATCH_ORDER     7
@@@ -60,9 -62,6 +60,9 @@@
         struct dentry           *debugfs_dir;
         struct dentry           *sched_debugfs_dir;
   #endif
+ +
+ +      /* Must be the last member - see also blk_mq_hw_ctx_size(). */
+ +      struct srcu_struct      queue_rq_srcu[0];
   };
   
   struct blk_mq_tag_set {
@@@ -88,8 -87,7 +88,8 @@@ struct blk_mq_queue_data 
         bool last;
   };
   
- -typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
+ +typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
+ +              const struct blk_mq_queue_data *);
   typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
   typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
   typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
@@@ -144,8 -142,6 +144,8 @@@ struct blk_mq_ops 
         init_request_fn         *init_request;
         exit_request_fn         *exit_request;
         reinit_request_fn       *reinit_request;
+ +      /* Called from inside blk_get_request() */
+ +      void (*initialize_rq_fn)(struct request *rq);
   
         map_queues_fn           *map_queues;
   
@@@ -159,6 -155,10 +159,6 @@@
   };
   
   enum {
- -      BLK_MQ_RQ_QUEUE_OK      = 0,    /* queued fine */
- -      BLK_MQ_RQ_QUEUE_BUSY    = 1,    /* requeue IO for later */
- -      BLK_MQ_RQ_QUEUE_ERROR   = 2,    /* end IO with error */
- -
         BLK_MQ_F_SHOULD_MERGE   = 1 << 0,
         BLK_MQ_F_TAG_SHARED     = 1 << 1,
         BLK_MQ_F_SG_MERGE       = 1 << 2,
@@@ -204,10 -204,10 +204,10 @@@ enum 
         BLK_MQ_REQ_INTERNAL     = (1 << 2), /* allocate internal/sched tag */
   };
   
- -struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+ +struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
                 unsigned int flags);
- -struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op,
- -              unsigned int flags, unsigned int hctx_idx);
+ +struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
+ +              unsigned int op, unsigned int flags, unsigned int hctx_idx);
   struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
   
   enum {
@@@ -230,8 -230,8 +230,8 @@@ static inline u16 blk_mq_unique_tag_to_
   
   int blk_mq_request_started(struct request *rq);
   void blk_mq_start_request(struct request *rq);
- -void blk_mq_end_request(struct request *rq, int error);
- -void __blk_mq_end_request(struct request *rq, int error);
+ +void blk_mq_end_request(struct request *rq, blk_status_t error);
+ +void __blk_mq_end_request(struct request *rq, blk_status_t error);
   
   void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
   void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
@@@ -247,8 -247,6 +247,8 @@@ void blk_mq_stop_hw_queues(struct reque
   void blk_mq_start_hw_queues(struct request_queue *q);
   void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
   void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
+ +void blk_mq_quiesce_queue(struct request_queue *q);
+ +void blk_mq_unquiesce_queue(struct request_queue *q);
   void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
   void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
   void blk_mq_run_hw_queues(struct request_queue *q, bool async);
@@@ -266,8 -264,6 +266,8 @@@ int blk_mq_reinit_tagset(struct blk_mq_
   int blk_mq_map_queues(struct blk_mq_tag_set *set);
   void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
   
+ +void blk_mq_quiesce_queue_nowait(struct request_queue *q);
+ +
   /*
    * Driver command data is immediately after the request. So subtract request
    * size to get back to the original request, add request size to get the PDU.
diff --combined include/linux/fs.h

index 65adbddb31636559a1e6f65b585b7a6b733f0ae6,53f7e49d8fe5d1e0b5c9184498c13d8bb5fe4b0b..771fe11314672e7c7c26369a9054f96cdb66dc46
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -2,7 -2,7 +2,7 @@@
   #define _LINUX_FS_H
   
   #include <linux/linkage.h>
- #include <linux/wait.h>
+ #include <linux/wait_bit.h>
   #include <linux/kdev_t.h>
   #include <linux/dcache.h>
   #include <linux/path.h>
@@@ -20,7 -20,6 +20,7 @@@
   #include <linux/rwsem.h>
   #include <linux/capability.h>
   #include <linux/semaphore.h>
+ +#include <linux/fcntl.h>
   #include <linux/fiemap.h>
   #include <linux/rculist_bl.h>
   #include <linux/atomic.h>
@@@ -31,7 -30,6 +31,7 @@@
   #include <linux/percpu-rwsem.h>
   #include <linux/workqueue.h>
   #include <linux/delayed_call.h>
+ +#include <linux/uuid.h>
   
   #include <asm/byteorder.h>
   #include <uapi/linux/fs.h>
@@@ -144,9 -142,6 +144,9 @@@ typedef int (dio_iodone_t)(struct kioc
   /* File was opened by fanotify and shouldn't generate fanotify events */
   #define FMODE_NONOTIFY                ((__force fmode_t)0x4000000)
   
+ +/* File is capable of returning -EAGAIN if AIO will block */
+ +#define FMODE_AIO_NOWAIT      ((__force fmode_t)0x8000000)
+ +
   /*
    * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
    * that indicates that they should check the contents of the iovec are
@@@ -266,18 -261,6 +266,18 @@@ struct page
   struct address_space;
   struct writeback_control;
   
+ +/*
+ + * Write life time hint values.
+ + */
+ +enum rw_hint {
+ +      WRITE_LIFE_NOT_SET      = 0,
+ +      WRITE_LIFE_NONE         = RWH_WRITE_LIFE_NONE,
+ +      WRITE_LIFE_SHORT        = RWH_WRITE_LIFE_SHORT,
+ +      WRITE_LIFE_MEDIUM       = RWH_WRITE_LIFE_MEDIUM,
+ +      WRITE_LIFE_LONG         = RWH_WRITE_LIFE_LONG,
+ +      WRITE_LIFE_EXTREME      = RWH_WRITE_LIFE_EXTREME,
+ +};
+ +
   #define IOCB_EVENTFD          (1 << 0)
   #define IOCB_APPEND           (1 << 1)
   #define IOCB_DIRECT           (1 << 2)
@@@ -285,7 -268,6 +285,7 @@@
   #define IOCB_DSYNC            (1 << 4)
   #define IOCB_SYNC             (1 << 5)
   #define IOCB_WRITE            (1 << 6)
+ +#define IOCB_NOWAIT           (1 << 7)
   
   struct kiocb {
         struct file             *ki_filp;
@@@ -293,7 -275,6 +293,7 @@@
         void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
         void                    *private;
         int                     ki_flags;
+ +      enum rw_hint            ki_hint;
   };
   
   static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@@ -301,6 -282,16 +301,6 @@@
         return kiocb->ki_complete == NULL;
   }
   
- -static inline int iocb_flags(struct file *file);
- -
- -static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
- -{
- -      *kiocb = (struct kiocb) {
- -              .ki_filp = filp,
- -              .ki_flags = iocb_flags(filp),
- -      };
- -}
- -
   /*
    * "descriptor" for what we're up to with a read.
    * This allows us to use the same read code yet
@@@ -601,7 -592,6 +601,7 @@@ struct inode 
         spinlock_t              i_lock; /* i_blocks, i_bytes, maybe i_size */
         unsigned short          i_bytes;
         unsigned int            i_blkbits;
+ +      enum rw_hint            i_write_hint;
         blkcnt_t                i_blocks;
   
   #ifdef __NEED_I_SIZE_ORDERED
@@@ -856,7 -846,6 +856,7 @@@ struct file 
          * Must not be taken from IRQ context.
          */
         spinlock_t              f_lock;
+ +      enum rw_hint            f_write_hint;
         atomic_long_t           f_count;
         unsigned int            f_flags;
         fmode_t                 f_mode;
@@@ -1032,6 -1021,8 +1032,6 @@@ struct file_lock_context 
   #define OFFT_OFFSET_MAX       INT_LIMIT(off_t)
   #endif
   
- -#include <linux/fcntl.h>
- -
   extern void send_sigio(struct fown_struct *fown, int fd, int band);
   
   /*
@@@ -1337,8 -1328,8 +1337,8 @@@ struct super_block 
   
         struct sb_writers       s_writers;
   
- -      char s_id[32];                          /* Informational name */
- -      u8 s_uuid[16];                          /* UUID */
+ +      char                    s_id[32];       /* Informational name */
+ +      uuid_t                  s_uuid;         /* UUID */
   
         void                    *s_fs_info;     /* Filesystem private info */
         unsigned int            s_max_links;
@@@ -1882,25 -1873,6 +1882,25 @@@ static inline bool HAS_UNMAPPED_ID(stru
         return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
   }
   
+ +static inline enum rw_hint file_write_hint(struct file *file)
+ +{
+ +      if (file->f_write_hint != WRITE_LIFE_NOT_SET)
+ +              return file->f_write_hint;
+ +
+ +      return file_inode(file)->i_write_hint;
+ +}
+ +
+ +static inline int iocb_flags(struct file *file);
+ +
+ +static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
+ +{
+ +      *kiocb = (struct kiocb) {
+ +              .ki_filp = filp,
+ +              .ki_flags = iocb_flags(filp),
+ +              .ki_hint = file_write_hint(filp),
+ +      };
+ +}
+ +
   /*
    * Inode state bits.  Protected by inode->i_lock
    *
@@@ -2545,8 -2517,6 +2545,8 @@@ extern int filemap_fdatawait(struct add
   extern void filemap_fdatawait_keep_errors(struct address_space *);
   extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
                                    loff_t lend);
+ +extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
+ +                                loff_t lend);
   extern int filemap_write_and_wait(struct address_space *mapping);
   extern int filemap_write_and_wait_range(struct address_space *mapping,
                                         loff_t lstart, loff_t lend);
@@@ -2873,7 -2843,7 +2873,7 @@@ enum 
         DIO_SKIP_DIO_COUNT = 0x08,
   };
   
- -void dio_end_io(struct bio *bio, int error);
+ +void dio_end_io(struct bio *bio);
   
   ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                              struct block_device *bdev, struct iov_iter *iter,
@@@ -3086,25 -3056,6 +3086,25 @@@ static inline int iocb_flags(struct fil
         return res;
   }
   
+ +static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
+ +{
+ +      if (unlikely(flags & ~RWF_SUPPORTED))
+ +              return -EOPNOTSUPP;
+ +
+ +      if (flags & RWF_NOWAIT) {
+ +              if (!(ki->ki_filp->f_mode & FMODE_AIO_NOWAIT))
+ +                      return -EOPNOTSUPP;
+ +              ki->ki_flags |= IOCB_NOWAIT;
+ +      }
+ +      if (flags & RWF_HIPRI)
+ +              ki->ki_flags |= IOCB_HIPRI;
+ +      if (flags & RWF_DSYNC)
+ +              ki->ki_flags |= IOCB_DSYNC;
+ +      if (flags & RWF_SYNC)
+ +              ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+ +      return 0;
+ +}
+ +
   static inline ino_t parent_ino(struct dentry *dentry)
   {
         ino_t res;
diff --combined init/Kconfig

index bc4c180c66a58a3f115fa5dcaa7b03ef9852dd0b,c359038ebeed7bdd48a14ec4f26c6dcc7f195790..ee0f03b69d11ca60170309bffe93b56cd71548fa
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -472,7 -472,354 +472,7 @@@ config TASK_IO_ACCOUNTIN
   
   endmenu # "CPU/Task time and stats accounting"
   
- -menu "RCU Subsystem"
- -
- -config TREE_RCU
- -      bool
- -      default y if !PREEMPT && SMP
- -      help
- -        This option selects the RCU implementation that is
- -        designed for very large SMP system with hundreds or
- -        thousands of CPUs.  It also scales down nicely to
- -        smaller systems.
- -
- -config PREEMPT_RCU
- -      bool
- -      default y if PREEMPT
- -      help
- -        This option selects the RCU implementation that is
- -        designed for very large SMP systems with hundreds or
- -        thousands of CPUs, but for which real-time response
- -        is also required.  It also scales down nicely to
- -        smaller systems.
- -
- -        Select this option if you are unsure.
- -
- -config TINY_RCU
- -      bool
- -      default y if !PREEMPT && !SMP
- -      help
- -        This option selects the RCU implementation that is
- -        designed for UP systems from which real-time response
- -        is not required.  This option greatly reduces the
- -        memory footprint of RCU.
- -
- -config RCU_EXPERT
- -      bool "Make expert-level adjustments to RCU configuration"
- -      default n
- -      help
- -        This option needs to be enabled if you wish to make
- -        expert-level adjustments to RCU configuration.  By default,
- -        no such adjustments can be made, which has the often-beneficial
- -        side-effect of preventing "make oldconfig" from asking you all
- -        sorts of detailed questions about how you would like numerous
- -        obscure RCU options to be set up.
- -
- -        Say Y if you need to make expert-level adjustments to RCU.
- -
- -        Say N if you are unsure.
- -
- -config SRCU
- -      bool
- -      default y
- -      help
- -        This option selects the sleepable version of RCU. This version
- -        permits arbitrary sleeping or blocking within RCU read-side critical
- -        sections.
- -
- -config CLASSIC_SRCU
- -      bool "Use v4.11 classic SRCU implementation"
- -      default n
- -      depends on RCU_EXPERT && SRCU
- -      help
- -        This option selects the traditional well-tested classic SRCU
- -        implementation from v4.11, as might be desired for enterprise
- -        Linux distributions.  Without this option, the shiny new
- -        Tiny SRCU and Tree SRCU implementations are used instead.
- -        At some point, it is hoped that Tiny SRCU and Tree SRCU
- -        will accumulate enough test time and confidence to allow
- -        Classic SRCU to be dropped entirely.
- -
- -        Say Y if you need a rock-solid SRCU.
- -
- -        Say N if you would like help test Tree SRCU.
- -
- -config TINY_SRCU
- -      bool
- -      default y if SRCU && TINY_RCU && !CLASSIC_SRCU
- -      help
- -        This option selects the single-CPU non-preemptible version of SRCU.
- -
- -config TREE_SRCU
- -      bool
- -      default y if SRCU && !TINY_RCU && !CLASSIC_SRCU
- -      help
- -        This option selects the full-fledged version of SRCU.
- -
- -config TASKS_RCU
- -      bool
- -      default n
- -      select SRCU
- -      help
- -        This option enables a task-based RCU implementation that uses
- -        only voluntary context switch (not preemption!), idle, and
- -        user-mode execution as quiescent states.
- -
- -config RCU_STALL_COMMON
- -      def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE )
- -      help
- -        This option enables RCU CPU stall code that is common between
- -        the TINY and TREE variants of RCU.  The purpose is to allow
- -        the tiny variants to disable RCU CPU stall warnings, while
- -        making these warnings mandatory for the tree variants.
- -
- -config RCU_NEED_SEGCBLIST
- -      def_bool ( TREE_RCU || PREEMPT_RCU || TINY_SRCU || TREE_SRCU )
- -
- -config CONTEXT_TRACKING
- -       bool
- -
- -config CONTEXT_TRACKING_FORCE
- -      bool "Force context tracking"
- -      depends on CONTEXT_TRACKING
- -      default y if !NO_HZ_FULL
- -      help
- -        The major pre-requirement for full dynticks to work is to
- -        support the context tracking subsystem. But there are also
- -        other dependencies to provide in order to make the full
- -        dynticks working.
- -
- -        This option stands for testing when an arch implements the
- -        context tracking backend but doesn't yet fullfill all the
- -        requirements to make the full dynticks feature working.
- -        Without the full dynticks, there is no way to test the support
- -        for context tracking and the subsystems that rely on it: RCU
- -        userspace extended quiescent state and tickless cputime
- -        accounting. This option copes with the absence of the full
- -        dynticks subsystem by forcing the context tracking on all
- -        CPUs in the system.
- -
- -        Say Y only if you're working on the development of an
- -        architecture backend for the context tracking.
- -
- -        Say N otherwise, this option brings an overhead that you
- -        don't want in production.
- -
- -
- -config RCU_FANOUT
- -      int "Tree-based hierarchical RCU fanout value"
- -      range 2 64 if 64BIT
- -      range 2 32 if !64BIT
- -      depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
- -      default 64 if 64BIT
- -      default 32 if !64BIT
- -      help
- -        This option controls the fanout of hierarchical implementations
- -        of RCU, allowing RCU to work efficiently on machines with
- -        large numbers of CPUs.  This value must be at least the fourth
- -        root of NR_CPUS, which allows NR_CPUS to be insanely large.
- -        The default value of RCU_FANOUT should be used for production
- -        systems, but if you are stress-testing the RCU implementation
- -        itself, small RCU_FANOUT values allow you to test large-system
- -        code paths on small(er) systems.
- -
- -        Select a specific number if testing RCU itself.
- -        Take the default if unsure.
- -
- -config RCU_FANOUT_LEAF
- -      int "Tree-based hierarchical RCU leaf-level fanout value"
- -      range 2 64 if 64BIT
- -      range 2 32 if !64BIT
- -      depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
- -      default 16
- -      help
- -        This option controls the leaf-level fanout of hierarchical
- -        implementations of RCU, and allows trading off cache misses
- -        against lock contention.  Systems that synchronize their
- -        scheduling-clock interrupts for energy-efficiency reasons will
- -        want the default because the smaller leaf-level fanout keeps
- -        lock contention levels acceptably low.  Very large systems
- -        (hundreds or thousands of CPUs) will instead want to set this
- -        value to the maximum value possible in order to reduce the
- -        number of cache misses incurred during RCU's grace-period
- -        initialization.  These systems tend to run CPU-bound, and thus
- -        are not helped by synchronized interrupts, and thus tend to
- -        skew them, which reduces lock contention enough that large
- -        leaf-level fanouts work well.  That said, setting leaf-level
- -        fanout to a large number will likely cause problematic
- -        lock contention on the leaf-level rcu_node structures unless
- -        you boot with the skew_tick kernel parameter.
- -
- -        Select a specific number if testing RCU itself.
- -
- -        Select the maximum permissible value for large systems, but
- -        please understand that you may also need to set the skew_tick
- -        kernel boot parameter to avoid contention on the rcu_node
- -        structure's locks.
- -
- -        Take the default if unsure.
- -
- -config RCU_FAST_NO_HZ
- -      bool "Accelerate last non-dyntick-idle CPU's grace periods"
- -      depends on NO_HZ_COMMON && SMP && RCU_EXPERT
- -      default n
- -      help
- -        This option permits CPUs to enter dynticks-idle state even if
- -        they have RCU callbacks queued, and prevents RCU from waking
- -        these CPUs up more than roughly once every four jiffies (by
- -        default, you can adjust this using the rcutree.rcu_idle_gp_delay
- -        parameter), thus improving energy efficiency.  On the other
- -        hand, this option increases the duration of RCU grace periods,
- -        for example, slowing down synchronize_rcu().
- -
- -        Say Y if energy efficiency is critically important, and you
- -              don't care about increased grace-period durations.
- -
- -        Say N if you are unsure.
- -
- -config TREE_RCU_TRACE
- -      def_bool RCU_TRACE && ( TREE_RCU || PREEMPT_RCU )
- -      select DEBUG_FS
- -      help
- -        This option provides tracing for the TREE_RCU and
- -        PREEMPT_RCU implementations, permitting Makefile to
- -        trivially select kernel/rcutree_trace.c.
- -
- -config RCU_BOOST
- -      bool "Enable RCU priority boosting"
- -      depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
- -      default n
- -      help
- -        This option boosts the priority of preempted RCU readers that
- -        block the current preemptible RCU grace period for too long.
- -        This option also prevents heavy loads from blocking RCU
- -        callback invocation for all flavors of RCU.
- -
- -        Say Y here if you are working with real-time apps or heavy loads
- -        Say N here if you are unsure.
- -
- -config RCU_KTHREAD_PRIO
- -      int "Real-time priority to use for RCU worker threads"
- -      range 1 99 if RCU_BOOST
- -      range 0 99 if !RCU_BOOST
- -      default 1 if RCU_BOOST
- -      default 0 if !RCU_BOOST
- -      depends on RCU_EXPERT
- -      help
- -        This option specifies the SCHED_FIFO priority value that will be
- -        assigned to the rcuc/n and rcub/n threads and is also the value
- -        used for RCU_BOOST (if enabled). If you are working with a
- -        real-time application that has one or more CPU-bound threads
- -        running at a real-time priority level, you should set
- -        RCU_KTHREAD_PRIO to a priority higher than the highest-priority
- -        real-time CPU-bound application thread.  The default RCU_KTHREAD_PRIO
- -        value of 1 is appropriate in the common case, which is real-time
- -        applications that do not have any CPU-bound threads.
- -
- -        Some real-time applications might not have a single real-time
- -        thread that saturates a given CPU, but instead might have
- -        multiple real-time threads that, taken together, fully utilize
- -        that CPU.  In this case, you should set RCU_KTHREAD_PRIO to
- -        a priority higher than the lowest-priority thread that is
- -        conspiring to prevent the CPU from running any non-real-time
- -        tasks.  For example, if one thread at priority 10 and another
- -        thread at priority 5 are between themselves fully consuming
- -        the CPU time on a given CPU, then RCU_KTHREAD_PRIO should be
- -        set to priority 6 or higher.
- -
- -        Specify the real-time priority, or take the default if unsure.
- -
- -config RCU_BOOST_DELAY
- -      int "Milliseconds to delay boosting after RCU grace-period start"
- -      range 0 3000
- -      depends on RCU_BOOST
- -      default 500
- -      help
- -        This option specifies the time to wait after the beginning of
- -        a given grace period before priority-boosting preempted RCU
- -        readers blocking that grace period.  Note that any RCU reader
- -        blocking an expedited RCU grace period is boosted immediately.
- -
- -        Accept the default if unsure.
- -
- -config RCU_NOCB_CPU
- -      bool "Offload RCU callback processing from boot-selected CPUs"
- -      depends on TREE_RCU || PREEMPT_RCU
- -      depends on RCU_EXPERT || NO_HZ_FULL
- -      default n
- -      help
- -        Use this option to reduce OS jitter for aggressive HPC or
- -        real-time workloads.  It can also be used to offload RCU
- -        callback invocation to energy-efficient CPUs in battery-powered
- -        asymmetric multiprocessors.
- -
- -        This option offloads callback invocation from the set of
- -        CPUs specified at boot time by the rcu_nocbs parameter.
- -        For each such CPU, a kthread ("rcuox/N") will be created to
- -        invoke callbacks, where the "N" is the CPU being offloaded,
- -        and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
- -        "s" for RCU-sched.  Nothing prevents this kthread from running
- -        on the specified CPUs, but (1) the kthreads may be preempted
- -        between each callback, and (2) affinity or cgroups can be used
- -        to force the kthreads to run on whatever set of CPUs is desired.
- -
- -        Say Y here if you want to help to debug reduced OS jitter.
- -        Say N here if you are unsure.
- -
- -choice
- -      prompt "Build-forced no-CBs CPUs"
- -      default RCU_NOCB_CPU_NONE
- -      depends on RCU_NOCB_CPU
- -      help
- -        This option allows no-CBs CPUs (whose RCU callbacks are invoked
- -        from kthreads rather than from softirq context) to be specified
- -        at build time.  Additional no-CBs CPUs may be specified by
- -        the rcu_nocbs= boot parameter.
- -
- -config RCU_NOCB_CPU_NONE
- -      bool "No build_forced no-CBs CPUs"
- -      help
- -        This option does not force any of the CPUs to be no-CBs CPUs.
- -        Only CPUs designated by the rcu_nocbs= boot parameter will be
- -        no-CBs CPUs, whose RCU callbacks will be invoked by per-CPU
- -        kthreads whose names begin with "rcuo".  All other CPUs will
- -        invoke their own RCU callbacks in softirq context.
- -
- -        Select this option if you want to choose no-CBs CPUs at
- -        boot time, for example, to allow testing of different no-CBs
- -        configurations without having to rebuild the kernel each time.
- -
- -config RCU_NOCB_CPU_ZERO
- -      bool "CPU 0 is a build_forced no-CBs CPU"
- -      help
- -        This option forces CPU 0 to be a no-CBs CPU, so that its RCU
- -        callbacks are invoked by a per-CPU kthread whose name begins
- -        with "rcuo".  Additional CPUs may be designated as no-CBs
- -        CPUs using the rcu_nocbs= boot parameter will be no-CBs CPUs.
- -        All other CPUs will invoke their own RCU callbacks in softirq
- -        context.
- -
- -        Select this if CPU 0 needs to be a no-CBs CPU for real-time
- -        or energy-efficiency reasons, but the real reason it exists
- -        is to ensure that randconfig testing covers mixed systems.
- -
- -config RCU_NOCB_CPU_ALL
- -      bool "All CPUs are build_forced no-CBs CPUs"
- -      help
- -        This option forces all CPUs to be no-CBs CPUs.  The rcu_nocbs=
- -        boot parameter will be ignored.  All CPUs' RCU callbacks will
- -        be executed in the context of per-CPU rcuo kthreads created for
- -        this purpose.  Assuming that the kthreads whose names start with
- -        "rcuo" are bound to "housekeeping" CPUs, this reduces OS jitter
- -        on the remaining CPUs, but might decrease memory locality during
- -        RCU-callback invocation, thus potentially degrading throughput.
- -
- -        Select this if all CPUs need to be no-CBs CPUs for real-time
- -        or energy-efficiency reasons.
- -
- -endchoice
- -
- -endmenu # "RCU Subsystem"
+ +source "kernel/rcu/Kconfig"
   
   config BUILD_BIN2C
         bool
@@@ -809,6 -1156,7 +809,7 @@@ config CGROUP_HUGETL
   
   config CPUSETS
         bool "Cpuset controller"
+       depends on SMP
         help
           This option will let you create and manage CPUSETs which
           allow dynamically partitioning a system into sets of CPUs and
diff --combined kernel/sched/core.c

index 5b60f3a8343f38541c0157b681d26d4a24466f79,5186797908dc261eb4132271b2ced312708a12c9..17c667b427b4a570611272f2906a8bf50778a7ce
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -10,6 -10,7 +10,7 @@@
   #include <uapi/linux/sched/types.h>
   #include <linux/sched/loadavg.h>
   #include <linux/sched/hotplug.h>
+ #include <linux/wait_bit.h>
   #include <linux/cpuset.h>
   #include <linux/delayacct.h>
   #include <linux/init_task.h>
@@@ -788,36 -789,6 +789,6 @@@ void deactivate_task(struct rq *rq, str
         dequeue_task(rq, p, flags);
   }
   
- void sched_set_stop_task(int cpu, struct task_struct *stop)
- {
-       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
-       struct task_struct *old_stop = cpu_rq(cpu)->stop;
- 
-       if (stop) {
-               /*
-                * Make it appear like a SCHED_FIFO task, it's something
-                * userspace knows about and won't get confused about.
-                *
-                * Also, it will make PI more or less work without too
-                * much confusion -- but then, stop work should not
-                * rely on PI working anyway.
-                */
-               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
- 
-               stop->sched_class = &stop_sched_class;
-       }
- 
-       cpu_rq(cpu)->stop = stop;
- 
-       if (old_stop) {
-               /*
-                * Reset it back to a normal scheduling class so that
-                * it can die in pieces.
-                */
-               old_stop->sched_class = &rt_sched_class;
-       }
- }
- 
   /*
    * __normal_prio - return the priority that is based on the static prio
    */
@@@ -1588,6 -1559,36 +1559,36 @@@ static void update_avg(u64 *avg, u64 sa
         *avg += diff >> 3;
   }
   
+ void sched_set_stop_task(int cpu, struct task_struct *stop)
+ {
+       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+       struct task_struct *old_stop = cpu_rq(cpu)->stop;
+ 
+       if (stop) {
+               /*
+                * Make it appear like a SCHED_FIFO task, it's something
+                * userspace knows about and won't get confused about.
+                *
+                * Also, it will make PI more or less work without too
+                * much confusion -- but then, stop work should not
+                * rely on PI working anyway.
+                */
+               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+ 
+               stop->sched_class = &stop_sched_class;
+       }
+ 
+       cpu_rq(cpu)->stop = stop;
+ 
+       if (old_stop) {
+               /*
+                * Reset it back to a normal scheduling class so that
+                * it can die in pieces.
+                */
+               old_stop->sched_class = &rt_sched_class;
+       }
+ }
+ 
   #else
   
   static inline int __set_cpus_allowed_ptr(struct task_struct *p,
@@@ -1731,7 -1732,7 +1732,7 @@@ void sched_ttwu_pending(void
   {
         struct rq *rq = this_rq();
         struct llist_node *llist = llist_del_all(&rq->wake_list);
-       struct task_struct *p;
+       struct task_struct *p, *t;
         struct rq_flags rf;
   
         if (!llist)
@@@ -1740,17 -1741,8 +1741,8 @@@
         rq_lock_irqsave(rq, &rf);
         update_rq_clock(rq);
   
-       while (llist) {
-               int wake_flags = 0;
- 
-               p = llist_entry(llist, struct task_struct, wake_entry);
-               llist = llist_next(llist);
- 
-               if (p->sched_remote_wakeup)
-                       wake_flags = WF_MIGRATED;
- 
-               ttwu_do_activate(rq, p, wake_flags, &rf);
-       }
+       llist_for_each_entry_safe(p, t, llist, wake_entry)
+               ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
   
         rq_unlock_irqrestore(rq, &rf);
   }
@@@ -2147,23 -2139,6 +2139,6 @@@ int wake_up_state(struct task_struct *p
         return try_to_wake_up(p, state, 0);
   }
   
- /*
-  * This function clears the sched_dl_entity static params.
-  */
- void __dl_clear_params(struct task_struct *p)
- {
-       struct sched_dl_entity *dl_se = &p->dl;
- 
-       dl_se->dl_runtime = 0;
-       dl_se->dl_deadline = 0;
-       dl_se->dl_period = 0;
-       dl_se->flags = 0;
-       dl_se->dl_bw = 0;
- 
-       dl_se->dl_throttled = 0;
-       dl_se->dl_yielded = 0;
- }
- 
   /*
    * Perform scheduler related setup for a newly forked process p.
    * p is forked by current.
@@@ -2193,6 -2168,7 +2168,7 @@@ static void __sched_fork(unsigned long 
   
         RB_CLEAR_NODE(&p->dl.rb_node);
         init_dl_task_timer(&p->dl);
+       init_dl_inactive_task_timer(&p->dl);
         __dl_clear_params(p);
   
         INIT_LIST_HEAD(&p->rt.run_list);
@@@ -2430,7 -2406,7 +2406,7 @@@ int sched_fork(unsigned long clone_flag
   unsigned long to_ratio(u64 period, u64 runtime)
   {
         if (runtime == RUNTIME_INF)
-               return 1ULL << 20;
+               return BW_UNIT;
   
         /*
          * Doing this here saves a lot of checks in all
@@@ -2440,93 -2416,9 +2416,9 @@@
         if (period == 0)
                 return 0;
   
-       return div64_u64(runtime << 20, period);
- }
- 
- #ifdef CONFIG_SMP
- inline struct dl_bw *dl_bw_of(int i)
- {
-       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
-                        "sched RCU must be held");
-       return &cpu_rq(i)->rd->dl_bw;
+       return div64_u64(runtime << BW_SHIFT, period);
   }
   
- static inline int dl_bw_cpus(int i)
- {
-       struct root_domain *rd = cpu_rq(i)->rd;
-       int cpus = 0;
- 
-       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
-                        "sched RCU must be held");
-       for_each_cpu_and(i, rd->span, cpu_active_mask)
-               cpus++;
- 
-       return cpus;
- }
- #else
- inline struct dl_bw *dl_bw_of(int i)
- {
-       return &cpu_rq(i)->dl.dl_bw;
- }
- 
- static inline int dl_bw_cpus(int i)
- {
-       return 1;
- }
- #endif
- 
- /*
-  * We must be sure that accepting a new task (or allowing changing the
-  * parameters of an existing one) is consistent with the bandwidth
-  * constraints. If yes, this function also accordingly updates the currently
-  * allocated bandwidth to reflect the new situation.
-  *
-  * This function is called while holding p's rq->lock.
-  *
-  * XXX we should delay bw change until the task's 0-lag point, see
-  * __setparam_dl().
-  */
- static int dl_overflow(struct task_struct *p, int policy,
-                      const struct sched_attr *attr)
- {
- 
-       struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-       u64 period = attr->sched_period ?: attr->sched_deadline;
-       u64 runtime = attr->sched_runtime;
-       u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
-       int cpus, err = -1;
- 
-       /* !deadline task may carry old deadline bandwidth */
-       if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
-               return 0;
- 
-       /*
-        * Whether a task enters, leaves, or stays -deadline but changes
-        * its parameters, we may need to update the total allocated
-        * bandwidth of the container accordingly.
-        */
-       raw_spin_lock(&dl_b->lock);
-       cpus = dl_bw_cpus(task_cpu(p));
-       if (dl_policy(policy) && !task_has_dl_policy(p) &&
-           !__dl_overflow(dl_b, cpus, 0, new_bw)) {
-               __dl_add(dl_b, new_bw);
-               err = 0;
-       } else if (dl_policy(policy) && task_has_dl_policy(p) &&
-                  !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
-               __dl_clear(dl_b, p->dl.dl_bw);
-               __dl_add(dl_b, new_bw);
-               err = 0;
-       } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
-               __dl_clear(dl_b, p->dl.dl_bw);
-               err = 0;
-       }
-       raw_spin_unlock(&dl_b->lock);
- 
-       return err;
- }
- 
- extern void init_dl_bw(struct dl_bw *dl_b);
- 
   /*
    * wake_up_new_task - wake up a newly created task for the first time.
    *
@@@ -3687,7 -3579,7 +3579,7 @@@ asmlinkage __visible void __sched preem
         exception_exit(prev_state);
   }
   
- int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
                           void *key)
   {
         return try_to_wake_up(curr->private, mode, wake_flags);
@@@ -4008,46 -3900,6 +3900,6 @@@ static struct task_struct *find_process
         return pid ? find_task_by_vpid(pid) : current;
   }
   
- /*
-  * This function initializes the sched_dl_entity of a newly becoming
-  * SCHED_DEADLINE task.
-  *
-  * Only the static values are considered here, the actual runtime and the
-  * absolute deadline will be properly calculated when the task is enqueued
-  * for the first time with its new policy.
-  */
- static void
- __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
- {
-       struct sched_dl_entity *dl_se = &p->dl;
- 
-       dl_se->dl_runtime = attr->sched_runtime;
-       dl_se->dl_deadline = attr->sched_deadline;
-       dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
-       dl_se->flags = attr->sched_flags;
-       dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
- 
-       /*
-        * Changing the parameters of a task is 'tricky' and we're not doing
-        * the correct thing -- also see task_dead_dl() and switched_from_dl().
-        *
-        * What we SHOULD do is delay the bandwidth release until the 0-lag
-        * point. This would include retaining the task_struct until that time
-        * and change dl_overflow() to not immediately decrement the current
-        * amount.
-        *
-        * Instead we retain the current runtime/deadline and let the new
-        * parameters take effect after the current reservation period lapses.
-        * This is safe (albeit pessimistic) because the 0-lag point is always
-        * before the current scheduling deadline.
-        *
-        * We can still have temporary overloads because we do not delay the
-        * change in bandwidth until that time; so admission control is
-        * not on the safe side. It does however guarantee tasks will never
-        * consume more than promised.
-        */
- }
- 
   /*
    * sched_setparam() passes in -1 for its policy, to let the functions
    * it calls know not to change it.
@@@ -4101,59 -3953,6 +3953,6 @@@ static void __setscheduler(struct rq *r
                 p->sched_class = &fair_sched_class;
   }
   
- static void
- __getparam_dl(struct task_struct *p, struct sched_attr *attr)
- {
-       struct sched_dl_entity *dl_se = &p->dl;
- 
-       attr->sched_priority = p->rt_priority;
-       attr->sched_runtime = dl_se->dl_runtime;
-       attr->sched_deadline = dl_se->dl_deadline;
-       attr->sched_period = dl_se->dl_period;
-       attr->sched_flags = dl_se->flags;
- }
- 
- /*
-  * This function validates the new parameters of a -deadline task.
-  * We require the deadline to be non-zero and greater than or equal
-  * to the runtime, and the period to be either zero or greater than
-  * or equal to the deadline. Furthermore, we have to be sure that
-  * user parameters are above the internal resolution of 1us (we
-  * check sched_runtime only since it is always the smaller one) and
-  * below 2^63 ns (we have to check both sched_deadline and
-  * sched_period, as the latter can be zero).
-  */
- static bool
- __checkparam_dl(const struct sched_attr *attr)
- {
-       /* deadline != 0 */
-       if (attr->sched_deadline == 0)
-               return false;
- 
-       /*
-        * Since we truncate DL_SCALE bits, make sure we're at least
-        * that big.
-        */
-       if (attr->sched_runtime < (1ULL << DL_SCALE))
-               return false;
- 
-       /*
-        * Since we use the MSB for wrap-around and sign issues, make
-        * sure it's not set (mind that period can be equal to zero).
-        */
-       if (attr->sched_deadline & (1ULL << 63) ||
-           attr->sched_period & (1ULL << 63))
-               return false;
- 
-       /* runtime <= deadline <= period (if period != 0) */
-       if ((attr->sched_period != 0 &&
-            attr->sched_period < attr->sched_deadline) ||
-           attr->sched_deadline < attr->sched_runtime)
-               return false;
- 
-       return true;
- }
- 
   /*
    * Check the target process has a UID that matches the current process's:
    */
@@@ -4170,19 -3969,6 +3969,6 @@@ static bool check_same_owner(struct tas
         return match;
   }
   
- static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
- {
-       struct sched_dl_entity *dl_se = &p->dl;
- 
-       if (dl_se->dl_runtime != attr->sched_runtime ||
-               dl_se->dl_deadline != attr->sched_deadline ||
-               dl_se->dl_period != attr->sched_period ||
-               dl_se->flags != attr->sched_flags)
-               return true;
- 
-       return false;
- }
- 
   static int __sched_setscheduler(struct task_struct *p,
                                 const struct sched_attr *attr,
                                 bool user, bool pi)
@@@ -4197,8 -3983,8 +3983,8 @@@
         int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
         struct rq *rq;
   
-       /* May grab non-irq protected spin_locks: */
-       BUG_ON(in_interrupt());
+       /* The pi code expects interrupts enabled */
+       BUG_ON(pi && in_interrupt());
   recheck:
         /* Double check policy once rq lock held: */
         if (policy < 0) {
@@@ -4211,7 -3997,8 +3997,8 @@@
                         return -EINVAL;
         }
   
-       if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+       if (attr->sched_flags &
+               ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
                 return -EINVAL;
   
         /*
@@@ -4362,7 -4149,7 +4149,7 @@@ change
          * of a SCHED_DEADLINE task) we need to check if enough bandwidth
          * is available.
          */
-       if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+       if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
                 task_rq_unlock(rq, p, &rf);
                 return -EBUSY;
         }
@@@ -5463,26 -5250,17 +5250,17 @@@ void init_idle(struct task_struct *idle
   #endif
   }
   
+ #ifdef CONFIG_SMP
+ 
   int cpuset_cpumask_can_shrink(const struct cpumask *cur,
                               const struct cpumask *trial)
   {
-       int ret = 1, trial_cpus;
-       struct dl_bw *cur_dl_b;
-       unsigned long flags;
+       int ret = 1;
   
         if (!cpumask_weight(cur))
                 return ret;
   
-       rcu_read_lock_sched();
-       cur_dl_b = dl_bw_of(cpumask_any(cur));
-       trial_cpus = cpumask_weight(trial);
- 
-       raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
-       if (cur_dl_b->bw != -1 &&
-           cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
-               ret = 0;
-       raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
-       rcu_read_unlock_sched();
+       ret = dl_cpuset_cpumask_can_shrink(cur, trial);
   
         return ret;
   }
@@@ -5506,43 -5284,14 +5284,14 @@@ int task_can_attach(struct task_struct 
                 goto out;
         }
   
- #ifdef CONFIG_SMP
         if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
-                                             cs_cpus_allowed)) {
-               unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
-                                                       cs_cpus_allowed);
-               struct dl_bw *dl_b;
-               bool overflow;
-               int cpus;
-               unsigned long flags;
- 
-               rcu_read_lock_sched();
-               dl_b = dl_bw_of(dest_cpu);
-               raw_spin_lock_irqsave(&dl_b->lock, flags);
-               cpus = dl_bw_cpus(dest_cpu);
-               overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
-               if (overflow)
-                       ret = -EBUSY;
-               else {
-                       /*
-                        * We reserve space for this task in the destination
-                        * root_domain, as we can't fail after this point.
-                        * We will free resources in the source root_domain
-                        * later on (see set_cpus_allowed_dl()).
-                        */
-                       __dl_add(dl_b, p->dl.dl_bw);
-               }
-               raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-               rcu_read_unlock_sched();
+                                             cs_cpus_allowed))
+               ret = dl_task_can_attach(p, cs_cpus_allowed);
   
-       }
- #endif
   out:
         return ret;
   }
   
- #ifdef CONFIG_SMP
- 
   bool sched_smp_initialized __read_mostly;
   
   #ifdef CONFIG_NUMA_BALANCING
@@@ -5805,23 -5554,8 +5554,8 @@@ static void cpuset_cpu_active(void
   
   static int cpuset_cpu_inactive(unsigned int cpu)
   {
-       unsigned long flags;
-       struct dl_bw *dl_b;
-       bool overflow;
-       int cpus;
- 
         if (!cpuhp_tasks_frozen) {
-               rcu_read_lock_sched();
-               dl_b = dl_bw_of(cpu);
- 
-               raw_spin_lock_irqsave(&dl_b->lock, flags);
-               cpus = dl_bw_cpus(cpu);
-               overflow = __dl_overflow(dl_b, cpus, 0, 0);
-               raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- 
-               rcu_read_unlock_sched();
- 
-               if (overflow)
+               if (dl_cpu_busy(cpu))
                         return -EBUSY;
                 cpuset_update_active_cpus();
         } else {
@@@ -5874,9 -5608,15 +5608,9 @@@ int sched_cpu_deactivate(unsigned int c
          * users of this state to go away such that all new such users will
          * observe it.
          *
- -       * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- -       * not imply sync_sched(), so wait for both.
- -       *
          * Do sync before parking smpboot threads to take care of the rcu boost case.
          */
- -      if (IS_ENABLED(CONFIG_PREEMPT))
- -              synchronize_rcu_mult(call_rcu, call_rcu_sched);
- -      else
- -              synchronize_rcu();
+ +      synchronize_rcu_mult(call_rcu, call_rcu_sched);
   
         if (!sched_smp_initialized)
                 return 0;
@@@ -5952,7 -5692,6 +5686,6 @@@ void __init sched_init_smp(void
         cpumask_var_t non_isolated_cpus;
   
         alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
   
         sched_init_numa();
   
@@@ -5962,7 -5701,7 +5695,7 @@@
          * happen.
          */
         mutex_lock(&sched_domains_mutex);
-       init_sched_domains(cpu_active_mask);
+       sched_init_domains(cpu_active_mask);
         cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
         if (cpumask_empty(non_isolated_cpus))
                 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@@ -5978,7 -5717,6 +5711,6 @@@
         init_sched_dl_class();
   
         sched_init_smt();
-       sched_clock_init_late();
   
         sched_smp_initialized = true;
   }
@@@ -5994,7 -5732,6 +5726,6 @@@ early_initcall(migration_init)
   void __init sched_init_smp(void)
   {
         sched_init_granularity();
-       sched_clock_init_late();
   }
   #endif /* CONFIG_SMP */
   
@@@ -6020,28 -5757,13 +5751,13 @@@ static struct kmem_cache *task_group_ca
   DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
   DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
   
- #define WAIT_TABLE_BITS 8
- #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
- static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
- 
- wait_queue_head_t *bit_waitqueue(void *word, int bit)
- {
-       const int shift = BITS_PER_LONG == 32 ? 5 : 6;
-       unsigned long val = (unsigned long)word << shift | bit;
- 
-       return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
- }
- EXPORT_SYMBOL(bit_waitqueue);
- 
   void __init sched_init(void)
   {
         int i, j;
         unsigned long alloc_size = 0, ptr;
   
         sched_clock_init();
- 
-       for (i = 0; i < WAIT_TABLE_SIZE; i++)
-               init_waitqueue_head(bit_wait_table + i);
+       wait_bit_init();
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
         alloc_size += 2 * nr_cpu_ids * sizeof(void **);
@@@ -6193,7 -5915,6 +5909,6 @@@
         calc_load_update = jiffies + LOAD_FREQ;
   
   #ifdef CONFIG_SMP
-       zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
         /* May be allocated at isolcpus cmdline parse time */
         if (cpu_isolated_map == NULL)
                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
@@@ -6245,8 -5966,10 +5960,10 @@@ void ___might_sleep(const char *file, i
   
         if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
              !is_idle_task(current)) ||
-           system_state != SYSTEM_RUNNING || oops_in_progress)
+           system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+           oops_in_progress)
                 return;
+ 
         if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                 return;
         prev_jiffy = jiffies;
@@@ -6501,385 -6224,6 +6218,6 @@@ void sched_move_task(struct task_struc
   
         task_rq_unlock(rq, tsk, &rf);
   }
- #endif /* CONFIG_CGROUP_SCHED */
- 
- #ifdef CONFIG_RT_GROUP_SCHED
- /*
-  * Ensure that the real time constraints are schedulable.
-  */
- static DEFINE_MUTEX(rt_constraints_mutex);
- 
- /* Must be called with tasklist_lock held */
- static inline int tg_has_rt_tasks(struct task_group *tg)
- {
-       struct task_struct *g, *p;
- 
-       /*
-        * Autogroups do not have RT tasks; see autogroup_create().
-        */
-       if (task_group_is_autogroup(tg))
-               return 0;
- 
-       for_each_process_thread(g, p) {
-               if (rt_task(p) && task_group(p) == tg)
-                       return 1;
-       }
- 
-       return 0;
- }
- 
- struct rt_schedulable_data {
-       struct task_group *tg;
-       u64 rt_period;
-       u64 rt_runtime;
- };
- 
- static int tg_rt_schedulable(struct task_group *tg, void *data)
- {
-       struct rt_schedulable_data *d = data;
-       struct task_group *child;
-       unsigned long total, sum = 0;
-       u64 period, runtime;
- 
-       period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-       runtime = tg->rt_bandwidth.rt_runtime;
- 
-       if (tg == d->tg) {
-               period = d->rt_period;
-               runtime = d->rt_runtime;
-       }
- 
-       /*
-        * Cannot have more runtime than the period.
-        */
-       if (runtime > period && runtime != RUNTIME_INF)
-               return -EINVAL;
- 
-       /*
-        * Ensure we don't starve existing RT tasks.
-        */
-       if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
-               return -EBUSY;
- 
-       total = to_ratio(period, runtime);
- 
-       /*
-        * Nobody can have more than the global setting allows.
-        */
-       if (total > to_ratio(global_rt_period(), global_rt_runtime()))
-               return -EINVAL;
- 
-       /*
-        * The sum of our children's runtime should not exceed our own.
-        */
-       list_for_each_entry_rcu(child, &tg->children, siblings) {
-               period = ktime_to_ns(child->rt_bandwidth.rt_period);
-               runtime = child->rt_bandwidth.rt_runtime;
- 
-               if (child == d->tg) {
-                       period = d->rt_period;
-                       runtime = d->rt_runtime;
-               }
- 
-               sum += to_ratio(period, runtime);
-       }
- 
-       if (sum > total)
-               return -EINVAL;
- 
-       return 0;
- }
- 
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
- {
-       int ret;
- 
-       struct rt_schedulable_data data = {
-               .tg = tg,
-               .rt_period = period,
-               .rt_runtime = runtime,
-       };
- 
-       rcu_read_lock();
-       ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
-       rcu_read_unlock();
- 
-       return ret;
- }
- 
- static int tg_set_rt_bandwidth(struct task_group *tg,
-               u64 rt_period, u64 rt_runtime)
- {
-       int i, err = 0;
- 
-       /*
-        * Disallowing the root group RT runtime is BAD, it would disallow the
-        * kernel creating (and or operating) RT threads.
-        */
-       if (tg == &root_task_group && rt_runtime == 0)
-               return -EINVAL;
- 
-       /* No period doesn't make any sense. */
-       if (rt_period == 0)
-               return -EINVAL;
- 
-       mutex_lock(&rt_constraints_mutex);
-       read_lock(&tasklist_lock);
-       err = __rt_schedulable(tg, rt_period, rt_runtime);
-       if (err)
-               goto unlock;
- 
-       raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-       tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
-       tg->rt_bandwidth.rt_runtime = rt_runtime;
- 
-       for_each_possible_cpu(i) {
-               struct rt_rq *rt_rq = tg->rt_rq[i];
- 
-               raw_spin_lock(&rt_rq->rt_runtime_lock);
-               rt_rq->rt_runtime = rt_runtime;
-               raw_spin_unlock(&rt_rq->rt_runtime_lock);
-       }
-       raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
-       read_unlock(&tasklist_lock);
-       mutex_unlock(&rt_constraints_mutex);
- 
-       return err;
- }
- 
- static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
- {
-       u64 rt_runtime, rt_period;
- 
-       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-       rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
-       if (rt_runtime_us < 0)
-               rt_runtime = RUNTIME_INF;
- 
-       return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
- }
- 
- static long sched_group_rt_runtime(struct task_group *tg)
- {
-       u64 rt_runtime_us;
- 
-       if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
-               return -1;
- 
-       rt_runtime_us = tg->rt_bandwidth.rt_runtime;
-       do_div(rt_runtime_us, NSEC_PER_USEC);
-       return rt_runtime_us;
- }
- 
- static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
- {
-       u64 rt_runtime, rt_period;
- 
-       rt_period = rt_period_us * NSEC_PER_USEC;
-       rt_runtime = tg->rt_bandwidth.rt_runtime;
- 
-       return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
- }
- 
- static long sched_group_rt_period(struct task_group *tg)
- {
-       u64 rt_period_us;
- 
-       rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
-       do_div(rt_period_us, NSEC_PER_USEC);
-       return rt_period_us;
- }
- #endif /* CONFIG_RT_GROUP_SCHED */
- 
- #ifdef CONFIG_RT_GROUP_SCHED
- static int sched_rt_global_constraints(void)
- {
-       int ret = 0;
- 
-       mutex_lock(&rt_constraints_mutex);
-       read_lock(&tasklist_lock);
-       ret = __rt_schedulable(NULL, 0, 0);
-       read_unlock(&tasklist_lock);
-       mutex_unlock(&rt_constraints_mutex);
- 
-       return ret;
- }
- 
- static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
- {
-       /* Don't accept realtime tasks when there is no way for them to run */
-       if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
-               return 0;
- 
-       return 1;
- }
- 
- #else /* !CONFIG_RT_GROUP_SCHED */
- static int sched_rt_global_constraints(void)
- {
-       unsigned long flags;
-       int i;
- 
-       raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
-       for_each_possible_cpu(i) {
-               struct rt_rq *rt_rq = &cpu_rq(i)->rt;
- 
-               raw_spin_lock(&rt_rq->rt_runtime_lock);
-               rt_rq->rt_runtime = global_rt_runtime();
-               raw_spin_unlock(&rt_rq->rt_runtime_lock);
-       }
-       raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
- 
-       return 0;
- }
- #endif /* CONFIG_RT_GROUP_SCHED */
- 
- static int sched_dl_global_validate(void)
- {
-       u64 runtime = global_rt_runtime();
-       u64 period = global_rt_period();
-       u64 new_bw = to_ratio(period, runtime);
-       struct dl_bw *dl_b;
-       int cpu, ret = 0;
-       unsigned long flags;
- 
-       /*
-        * Here we want to check the bandwidth not being set to some
-        * value smaller than the currently allocated bandwidth in
-        * any of the root_domains.
-        *
-        * FIXME: Cycling on all the CPUs is overdoing, but simpler than
-        * cycling on root_domains... Discussion on different/better
-        * solutions is welcome!
-        */
-       for_each_possible_cpu(cpu) {
-               rcu_read_lock_sched();
-               dl_b = dl_bw_of(cpu);
- 
-               raw_spin_lock_irqsave(&dl_b->lock, flags);
-               if (new_bw < dl_b->total_bw)
-                       ret = -EBUSY;
-               raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- 
-               rcu_read_unlock_sched();
- 
-               if (ret)
-                       break;
-       }
- 
-       return ret;
- }
- 
- static void sched_dl_do_global(void)
- {
-       u64 new_bw = -1;
-       struct dl_bw *dl_b;
-       int cpu;
-       unsigned long flags;
- 
-       def_dl_bandwidth.dl_period = global_rt_period();
-       def_dl_bandwidth.dl_runtime = global_rt_runtime();
- 
-       if (global_rt_runtime() != RUNTIME_INF)
-               new_bw = to_ratio(global_rt_period(), global_rt_runtime());
- 
-       /*
-        * FIXME: As above...
-        */
-       for_each_possible_cpu(cpu) {
-               rcu_read_lock_sched();
-               dl_b = dl_bw_of(cpu);
- 
-               raw_spin_lock_irqsave(&dl_b->lock, flags);
-               dl_b->bw = new_bw;
-               raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- 
-               rcu_read_unlock_sched();
-       }
- }
- 
- static int sched_rt_global_validate(void)
- {
-       if (sysctl_sched_rt_period <= 0)
-               return -EINVAL;
- 
-       if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
-               (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
-               return -EINVAL;
- 
-       return 0;
- }
- 
- static void sched_rt_do_global(void)
- {
-       def_rt_bandwidth.rt_runtime = global_rt_runtime();
-       def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
- }
- 
- int sched_rt_handler(struct ctl_table *table, int write,
-               void __user *buffer, size_t *lenp,
-               loff_t *ppos)
- {
-       int old_period, old_runtime;
-       static DEFINE_MUTEX(mutex);
-       int ret;
- 
-       mutex_lock(&mutex);
-       old_period = sysctl_sched_rt_period;
-       old_runtime = sysctl_sched_rt_runtime;
- 
-       ret = proc_dointvec(table, write, buffer, lenp, ppos);
- 
-       if (!ret && write) {
-               ret = sched_rt_global_validate();
-               if (ret)
-                       goto undo;
- 
-               ret = sched_dl_global_validate();
-               if (ret)
-                       goto undo;
- 
-               ret = sched_rt_global_constraints();
-               if (ret)
-                       goto undo;
- 
-               sched_rt_do_global();
-               sched_dl_do_global();
-       }
-       if (0) {
- undo:
-               sysctl_sched_rt_period = old_period;
-               sysctl_sched_rt_runtime = old_runtime;
-       }
-       mutex_unlock(&mutex);
- 
-       return ret;
- }
- 
- int sched_rr_handler(struct ctl_table *table, int write,
-               void __user *buffer, size_t *lenp,
-               loff_t *ppos)
- {
-       int ret;
-       static DEFINE_MUTEX(mutex);
- 
-       mutex_lock(&mutex);
-       ret = proc_dointvec(table, write, buffer, lenp, ppos);
-       /*
-        * Make sure that internally we keep jiffies.
-        * Also, writing zero resets the timeslice to default:
-        */
-       if (!ret && write) {
-               sched_rr_timeslice =
-                       sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
-                       msecs_to_jiffies(sysctl_sched_rr_timeslice);
-       }
-       mutex_unlock(&mutex);
-       return ret;
- }
- 
- #ifdef CONFIG_CGROUP_SCHED
   
   static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
   {
diff --combined mm/filemap.c

index 742034e56100fde2772566b14e30a8afdaaebe78,926484561624eb1a1167eae6a61dd1dc14ae4e24..aea58e983a737b66257c897640b9aa62fa777bc0
--- 1/mm/filemap.c
--- 2/mm/filemap.c
+++ b/mm/filemap.c
@@@ -376,38 -376,6 +376,38 @@@ int filemap_flush(struct address_space 
   }
   EXPORT_SYMBOL(filemap_flush);
   
+ +/**
+ + * filemap_range_has_page - check if a page exists in range.
+ + * @mapping:           address space within which to check
+ + * @start_byte:        offset in bytes where the range starts
+ + * @end_byte:          offset in bytes where the range ends (inclusive)
+ + *
+ + * Return true if at least one page is present in the range supplied; usually
+ + * used to check if direct writing in this range will trigger a writeback.
+ + */
+ +bool filemap_range_has_page(struct address_space *mapping,
+ +                         loff_t start_byte, loff_t end_byte)
+ +{
+ +      pgoff_t index = start_byte >> PAGE_SHIFT;
+ +      pgoff_t end = end_byte >> PAGE_SHIFT;
+ +      struct pagevec pvec;
+ +      bool ret;
+ +
+ +      if (end_byte < start_byte)
+ +              return false;
+ +
+ +      if (mapping->nrpages == 0)
+ +              return false;
+ +
+ +      pagevec_init(&pvec, 0);
+ +      if (!pagevec_lookup(&pvec, mapping, index, 1))
+ +              return false;
+ +      ret = (pvec.pages[0]->index <= end);
+ +      pagevec_release(&pvec);
+ +      return ret;
+ +}
+ +EXPORT_SYMBOL(filemap_range_has_page);
+ +
   static int __filemap_fdatawait_range(struct address_space *mapping,
                                      loff_t start_byte, loff_t end_byte)
   {
@@@ -800,10 -768,10 +800,10 @@@ struct wait_page_key 
   struct wait_page_queue {
         struct page *page;
         int bit_nr;
-       wait_queue_t wait;
+       wait_queue_entry_t wait;
   };
   
- static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
   {
         struct wait_page_key *key = arg;
         struct wait_page_queue *wait_page
@@@ -866,7 -834,7 +866,7 @@@ static inline int wait_on_page_bit_comm
                 struct page *page, int bit_nr, int state, bool lock)
   {
         struct wait_page_queue wait_page;
-       wait_queue_t *wait = &wait_page.wait;
+       wait_queue_entry_t *wait = &wait_page.wait;
         int ret = 0;
   
         init_wait(wait);
@@@ -877,9 -845,9 +877,9 @@@
         for (;;) {
                 spin_lock_irq(&q->lock);
   
-               if (likely(list_empty(&wait->task_list))) {
+               if (likely(list_empty(&wait->entry))) {
                         if (lock)
-                               __add_wait_queue_tail_exclusive(q, wait);
+                               __add_wait_queue_entry_tail_exclusive(q, wait);
                         else
                                 __add_wait_queue(q, wait);
                         SetPageWaiters(page);
@@@ -939,7 -907,7 +939,7 @@@ int wait_on_page_bit_killable(struct pa
    *
    * Add an arbitrary @waiter to the wait queue for the nominated @page.
    */
- void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
   {
         wait_queue_head_t *q = page_waitqueue(page);
         unsigned long flags;
@@@ -2070,17 -2038,10 +2070,17 @@@ generic_file_read_iter(struct kiocb *io
                 loff_t size;
   
                 size = i_size_read(inode);
- -              retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- -                                      iocb->ki_pos + count - 1);
- -              if (retval < 0)
- -                      goto out;
+ +              if (iocb->ki_flags & IOCB_NOWAIT) {
+ +                      if (filemap_range_has_page(mapping, iocb->ki_pos,
+ +                                                 iocb->ki_pos + count - 1))
+ +                              return -EAGAIN;
+ +              } else {
+ +                      retval = filemap_write_and_wait_range(mapping,
+ +                                              iocb->ki_pos,
+ +                                              iocb->ki_pos + count - 1);
+ +                      if (retval < 0)
+ +                              goto out;
+ +              }
   
                 file_accessed(file);
   
@@@ -2681,9 -2642,6 +2681,9 @@@ inline ssize_t generic_write_checks(str
   
         pos = iocb->ki_pos;
   
+ +      if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ +              return -EINVAL;
+ +
         if (limit != RLIM_INFINITY) {
                 if (iocb->ki_pos >= limit) {
                         send_sig(SIGXFSZ, current, 0);
@@@ -2752,17 -2710,9 +2752,17 @@@ generic_file_direct_write(struct kiocb 
         write_len = iov_iter_count(from);
         end = (pos + write_len - 1) >> PAGE_SHIFT;
   
- -      written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
- -      if (written)
- -              goto out;
+ +      if (iocb->ki_flags & IOCB_NOWAIT) {
+ +              /* end_byte is inclusive: bail if any page is in the range */
+ +              if (filemap_range_has_page(inode->i_mapping, pos,
+ +                                         pos + write_len - 1))
+ +                      return -EAGAIN;
+ +      } else {
+ +              written = filemap_write_and_wait_range(mapping, pos,
+ +                                                      pos + write_len - 1);
+ +              if (written)
+ +                      goto out;
+ +      }
   
         /*
          * After a write we want buffered reads to be sure to go to disk to get
diff --combined mm/shmem.c

index 391f2dcca72782051cf2dfc5b71b7af11c73f2c9,fdc413f82a9957f95f359ca3159f9bf938db4898..9100c4952698ff1ba3becabf77a6309f1dfcbb4a
--- 1/mm/shmem.c
--- 2/mm/shmem.c
+++ b/mm/shmem.c
@@@ -75,7 -75,6 +75,7 @@@ static struct vfsmount *shm_mnt
   #include <uapi/linux/memfd.h>
   #include <linux/userfaultfd_k.h>
   #include <linux/rmap.h>
+ +#include <linux/uuid.h>
   
   #include <linux/uaccess.h>
   #include <asm/pgtable.h>
@@@ -1903,10 -1902,10 +1903,10 @@@ unlock
    * entry unconditionally - even if something else had already woken the
    * target.
    */
- static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
   {
         int ret = default_wake_function(wait, mode, sync, key);
-       list_del_init(&wait->task_list);
+       list_del_init(&wait->entry);
         return ret;
   }
   
@@@ -2841,7 -2840,7 +2841,7 @@@ static long shmem_fallocate(struct fil
                 spin_lock(&inode->i_lock);
                 inode->i_private = NULL;
                 wake_up_all(&shmem_falloc_waitq);
-               WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
+               WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
                 spin_unlock(&inode->i_lock);
                 error = 0;
                 goto out;
@@@ -3762,7 -3761,6 +3762,7 @@@ int shmem_fill_super(struct super_bloc
   #ifdef CONFIG_TMPFS_POSIX_ACL
         sb->s_flags |= MS_POSIXACL;
   #endif
+ +      uuid_gen(&sb->s_uuid);
   
         inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
         if (!inode)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 3 Jul 2017 20:08:04 +0000 (13:08 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 3 Jul 2017 20:08:04 +0000 (13:08 -0700)
		1	2
arch/x86/events/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-mq.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/kyber-iosched.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/nfs/nfs4proc.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blk-mq.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/filemap.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/shmem.c	patch \|	diff1 \|	diff2 \|	blob \| history