swap: make swap discard async

author Shaohua Li <shli@kernel.org>

Wed, 11 Sep 2013 21:20:30 +0000 (14:20 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 11 Sep 2013 22:57:15 +0000 (15:57 -0700)
author Shaohua Li <shli@kernel.org>
Wed, 11 Sep 2013 21:20:30 +0000 (14:20 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 11 Sep 2013 22:57:15 +0000 (15:57 -0700)
diff --git a/include/linux/swap.h b/include/linux/swap.h

index cb5baebf31d65cf0b8d7c2c3282de0b2403a326e..8a3c4a1caa14f501bac7d812f69ca4cc359ed0f7 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -217,8 +217,6 @@ struct swap_info_struct {
         unsigned int inuse_pages;       /* number of those currently in use */
         unsigned int cluster_next;      /* likely index for next allocation */
         unsigned int cluster_nr;        /* countdown to next cluster search */
-       unsigned int lowest_alloc;      /* while preparing discard cluster */
-       unsigned int highest_alloc;     /* while preparing discard cluster */
         struct swap_extent *curr_swap_extent;
         struct swap_extent first_swap_extent;
         struct block_device *bdev;      /* swap device or bdev of swap file */
@@ -232,14 +230,18 @@ struct swap_info_struct {
                                          * protect map scan related fields like
                                          * swap_map, lowest_bit, highest_bit,
                                          * inuse_pages, cluster_next,
-                                        * cluster_nr, lowest_alloc and
-                                        * highest_alloc. other fields are only
-                                        * changed at swapon/swapoff, so are
-                                        * protected by swap_lock. changing
-                                        * flags need hold this lock and
-                                        * swap_lock. If both locks need hold,
-                                        * hold swap_lock first.
+                                        * cluster_nr and the free/discard
+                                        * cluster lists. Other fields are
+                                        * only changed at swapon/swapoff, so
+                                        * they are protected by swap_lock.
+                                        * Changing flags requires holding
+                                        * both this lock and swap_lock. If
+                                        * both locks are needed, take
+                                        * swap_lock first.
                                          */
+       struct work_struct discard_work; /* discard worker */
+       struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */
+       struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
  };
  
  struct swap_list_t {
diff --git a/mm/swapfile.c b/mm/swapfile.c

index d1fbeb486de52ed31a02bdd957c985fcde552c73..dac47c66055c47f5c359bf545af69233b452b089 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -175,12 +175,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
         }
  }
  
-static int wait_for_discard(void *word)
-{
-       schedule();
-       return 0;
-}
-
  #define SWAPFILE_CLUSTER       256
  #define LATENCY_LIMIT          256
  
@@ -242,6 +236,90 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
         info->data = 0;
  }
  
+/* Add a cluster to the discard list and schedule the discard work */
+static void swap_cluster_schedule_discard(struct swap_info_struct *si,
+               unsigned int idx)
+{
+       /*
+        * If scan_swap_map() can't find a free cluster, it will check
+        * si->swap_map directly. To make sure the cluster being discarded
+        * isn't taken by scan_swap_map(), mark its swap entries bad
+        * (occupied). The entries are cleared again after the discard.
+        */
+       memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+                       SWAP_MAP_BAD, SWAPFILE_CLUSTER);
+
+       if (cluster_is_null(&si->discard_cluster_head)) {
+               cluster_set_next_flag(&si->discard_cluster_head,
+                                               idx, 0);
+               cluster_set_next_flag(&si->discard_cluster_tail,
+                                               idx, 0);
+       } else {
+               unsigned int tail = cluster_next(&si->discard_cluster_tail);
+               cluster_set_next(&si->cluster_info[tail], idx);
+               cluster_set_next_flag(&si->discard_cluster_tail,
+                                               idx, 0);
+       }
+
+       schedule_work(&si->discard_work);
+}
+
+/*
+ * Perform the scheduled discards; each discarded cluster is then added to
+ * the free cluster list. Caller holds si->lock (dropped around each discard).
+ */
+static void swap_do_scheduled_discard(struct swap_info_struct *si)
+{
+       struct swap_cluster_info *info;
+       unsigned int idx;
+
+       info = si->cluster_info;
+
+       while (!cluster_is_null(&si->discard_cluster_head)) {
+               idx = cluster_next(&si->discard_cluster_head);
+
+               cluster_set_next_flag(&si->discard_cluster_head,
+                                               cluster_next(&info[idx]), 0);
+               if (cluster_next(&si->discard_cluster_tail) == idx) {
+                       cluster_set_null(&si->discard_cluster_head);
+                       cluster_set_null(&si->discard_cluster_tail);
+               }
+               spin_unlock(&si->lock);
+
+               discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
+                               SWAPFILE_CLUSTER);
+
+               spin_lock(&si->lock);
+               cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
+               if (cluster_is_null(&si->free_cluster_head)) {
+                       cluster_set_next_flag(&si->free_cluster_head,
+                                               idx, 0);
+                       cluster_set_next_flag(&si->free_cluster_tail,
+                                               idx, 0);
+               } else {
+                       unsigned int tail;
+
+                       tail = cluster_next(&si->free_cluster_tail);
+                       cluster_set_next(&info[tail], idx);
+                       cluster_set_next_flag(&si->free_cluster_tail,
+                                               idx, 0);
+               }
+               memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+                               0, SWAPFILE_CLUSTER);
+       }
+}
+
+static void swap_discard_work(struct work_struct *work)
+{
+       struct swap_info_struct *si;
+
+       si = container_of(work, struct swap_info_struct, discard_work);
+
+       spin_lock(&si->lock);
+       swap_do_scheduled_discard(si);
+       spin_unlock(&si->lock);
+}
+
  /*
   * The cluster corresponding to page_nr will be used. The cluster will be
   * removed from free cluster list and its usage counter will be increased.
@@ -287,6 +365,16 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
                 cluster_count(&cluster_info[idx]) - 1);
  
         if (cluster_count(&cluster_info[idx]) == 0) {
+               /*
+                * If page discard is enabled for this swap area, queue the
+                * cluster for discard instead of freeing it immediately. The
+                * cluster is freed once the discard has been done.
+                */
+               if (p->flags & SWP_PAGE_DISCARD) {
+                       swap_cluster_schedule_discard(p, idx);
+                       return;
+               }
+
                 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
                 if (cluster_is_null(&p->free_cluster_head)) {
                         cluster_set_next_flag(&p->free_cluster_head, idx, 0);
@@ -319,7 +407,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
         unsigned long scan_base;
         unsigned long last_in_cluster = 0;
         int latency_ration = LATENCY_LIMIT;
-       int found_free_cluster = 0;
  
         /*
          * We try to cluster swap pages by allocating them sequentially
@@ -340,19 +427,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                         si->cluster_nr = SWAPFILE_CLUSTER - 1;
                         goto checks;
                 }
-               if (si->flags & SWP_PAGE_DISCARD) {
-                       /*
-                        * Start range check on racing allocations, in case
-                        * they overlap the cluster we eventually decide on
-                        * (we scan without swap_lock to allow preemption).
-                        * It's hardly conceivable that cluster_nr could be
-                        * wrapped during our scan, but don't depend on it.
-                        */
-                       if (si->lowest_alloc)
-                               goto checks;
-                       si->lowest_alloc = si->max;
-                       si->highest_alloc = 0;
-               }
  check_cluster:
                 if (!cluster_is_null(&si->free_cluster_head)) {
                         offset = cluster_next(&si->free_cluster_head) *
@@ -360,15 +434,27 @@ check_cluster:
                         last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
                         si->cluster_next = offset;
                         si->cluster_nr = SWAPFILE_CLUSTER - 1;
-                       found_free_cluster = 1;
                         goto checks;
                 } else if (si->cluster_info) {
+                       /*
+                        * There is no free cluster, but some clusters are
+                        * queued for discard: discard them now and reclaim them.
+                        */
+                       if (!cluster_is_null(&si->discard_cluster_head)) {
+                               si->cluster_nr = 0;
+                               swap_do_scheduled_discard(si);
+                               scan_base = offset = si->cluster_next;
+                               if (!si->cluster_nr)
+                                       goto check_cluster;
+                               si->cluster_nr--;
+                               goto checks;
+                       }
+
                         /*
                          * Checking free cluster is fast enough, we can do the
                          * check every time
                          */
                         si->cluster_nr = 0;
-                       si->lowest_alloc = 0;
                         goto checks;
                 }
  
@@ -395,7 +481,6 @@ check_cluster:
                                 offset -= SWAPFILE_CLUSTER - 1;
                                 si->cluster_next = offset;
                                 si->cluster_nr = SWAPFILE_CLUSTER - 1;
-                               found_free_cluster = 1;
                                 goto checks;
                         }
                         if (unlikely(--latency_ration < 0)) {
@@ -416,7 +501,6 @@ check_cluster:
                                 offset -= SWAPFILE_CLUSTER - 1;
                                 si->cluster_next = offset;
                                 si->cluster_nr = SWAPFILE_CLUSTER - 1;
-                               found_free_cluster = 1;
                                 goto checks;
                         }
                         if (unlikely(--latency_ration < 0)) {
@@ -428,7 +512,6 @@ check_cluster:
                 offset = scan_base;
                 spin_lock(&si->lock);
                 si->cluster_nr = SWAPFILE_CLUSTER - 1;
-               si->lowest_alloc = 0;
         }
  
  checks:
@@ -470,59 +553,6 @@ checks:
         si->cluster_next = offset + 1;
         si->flags -= SWP_SCANNING;
  
-       if (si->lowest_alloc) {
-               /*
-                * Only set when SWP_PAGE_DISCARD, and there's a scan
-                * for a free cluster in progress or just completed.
-                */
-               if (found_free_cluster) {
-                       /*
-                        * To optimize wear-levelling, discard the
-                        * old data of the cluster, taking care not to
-                        * discard any of its pages that have already
-                        * been allocated by racing tasks (offset has
-                        * already stepped over any at the beginning).
-                        */
-                       if (offset < si->highest_alloc &&
-                           si->lowest_alloc <= last_in_cluster)
-                               last_in_cluster = si->lowest_alloc - 1;
-                       si->flags |= SWP_DISCARDING;
-                       spin_unlock(&si->lock);
-
-                       if (offset < last_in_cluster)
-                               discard_swap_cluster(si, offset,
-                                       last_in_cluster - offset + 1);
-
-                       spin_lock(&si->lock);
-                       si->lowest_alloc = 0;
-                       si->flags &= ~SWP_DISCARDING;
-
-                       smp_mb();       /* wake_up_bit advises this */
-                       wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
-
-               } else if (si->flags & SWP_DISCARDING) {
-                       /*
-                        * Delay using pages allocated by racing tasks
-                        * until the whole discard has been issued. We
-                        * could defer that delay until swap_writepage,
-                        * but it's easier to keep this self-contained.
-                        */
-                       spin_unlock(&si->lock);
-                       wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
-                               wait_for_discard, TASK_UNINTERRUPTIBLE);
-                       spin_lock(&si->lock);
-               } else {
-                       /*
-                        * Note pages allocated by racing tasks while
-                        * scan for a free cluster is in progress, so
-                        * that its final discard can exclude them.
-                        */
-                       if (offset < si->lowest_alloc)
-                               si->lowest_alloc = offset;
-                       if (offset > si->highest_alloc)
-                               si->highest_alloc = offset;
-               }
-       }
         return offset;
  
  scan:
@@ -1806,6 +1836,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                 goto out_dput;
         }
  
+       flush_work(&p->discard_work);
+
         destroy_swap_extents(p);
         if (p->flags & SWP_CONTINUED)
                 free_swap_count_continuations(p);
@@ -2172,6 +2204,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
  
         cluster_set_null(&p->free_cluster_head);
         cluster_set_null(&p->free_cluster_tail);
+       cluster_set_null(&p->discard_cluster_head);
+       cluster_set_null(&p->discard_cluster_tail);
  
         for (i = 0; i < swap_header->info.nr_badpages; i++) {
                 unsigned int page_nr = swap_header->info.badpages[i];
@@ -2281,6 +2315,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
         if (IS_ERR(p))
                 return PTR_ERR(p);
  
+       INIT_WORK(&p->discard_work, swap_discard_work);
+
         name = getname(specialfile);
         if (IS_ERR(name)) {
                 error = PTR_ERR(name);
author	Shaohua Li <shli@kernel.org>
	Wed, 11 Sep 2013 21:20:30 +0000 (14:20 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 11 Sep 2013 22:57:15 +0000 (15:57 -0700)
include/linux/swap.h		patch \| blob \| history
mm/swapfile.c		patch \| blob \| history