diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4f6cba1b66322f3500714950ef3a12eeda9e4455..6ba4aab2db0b570a241abb8935f901833c65b862 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -37,6 +37,7 @@
 #include <linux/swapfile.h>
 #include <linux/export.h>
 #include <linux/swap_slots.h>
+#include <linux/sort.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -199,7 +200,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
        }
 }
 
+#ifdef CONFIG_THP_SWAP
+#define SWAPFILE_CLUSTER       HPAGE_PMD_NR
+#else
 #define SWAPFILE_CLUSTER       256
+#endif
 #define LATENCY_LIMIT          256
 
 static inline void cluster_set_flag(struct swap_cluster_info *info,
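
With CONFIG_THP_SWAP, SWAPFILE_CLUSTER becomes HPAGE_PMD_NR, so one swap cluster holds exactly the slots needed by one PMD-sized THP and handing out a whole free cluster gives a huge page its contiguous swap space. The following stand-alone sketch (user-space C, not part of the patch; 512 is an assumed HPAGE_PMD_NR for x86-64 with 4K base pages and 2M PMDs) only illustrates the offset/cluster arithmetic the rest of the patch relies on.

/*
 * Stand-alone illustration (user space, not kernel code) of how a swap
 * offset maps onto a cluster when SWAPFILE_CLUSTER == HPAGE_PMD_NR.
 * 512 is an assumed value of HPAGE_PMD_NR.
 */
#include <stdio.h>

#define SWAPFILE_CLUSTER 512UL

int main(void)
{
        unsigned long offset = 1536;                    /* arbitrary swap slot */
        unsigned long idx = offset / SWAPFILE_CLUSTER;  /* owning cluster */
        unsigned long first = idx * SWAPFILE_CLUSTER;   /* first slot in it */

        printf("slot %lu lives in cluster %lu (slots %lu-%lu)\n",
               offset, idx, first, first + SWAPFILE_CLUSTER - 1);
        return 0;
}
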
@@ -374,6 +379,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
        schedule_work(&si->discard_work);
 }
 
+static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+       struct swap_cluster_info *ci = si->cluster_info;
+
+       cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
+       cluster_list_add_tail(&si->free_clusters, ci, idx);
+}
+
 /*
  * Doing discard actually. After a cluster discard is finished, the cluster
  * will be added to free cluster list. caller should hold si->lock.
@@ -394,10 +407,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
 
                spin_lock(&si->lock);
                ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
-               cluster_set_flag(ci, CLUSTER_FLAG_FREE);
-               unlock_cluster(ci);
-               cluster_list_add_tail(&si->free_clusters, info, idx);
-               ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
+               __free_cluster(si, idx);
                memset(si->swap_map + idx * SWAPFILE_CLUSTER,
                                0, SWAPFILE_CLUSTER);
                unlock_cluster(ci);
@@ -415,6 +425,34 @@ static void swap_discard_work(struct work_struct *work)
        spin_unlock(&si->lock);
 }
 
+static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+       struct swap_cluster_info *ci = si->cluster_info;
+
+       VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
+       cluster_list_del_first(&si->free_clusters, ci);
+       cluster_set_count_flag(ci + idx, 0, 0);
+}
+
+static void free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+       struct swap_cluster_info *ci = si->cluster_info + idx;
+
+       VM_BUG_ON(cluster_count(ci) != 0);
+       /*
+        * If the swap is discardable, prepare to discard the cluster
+        * instead of freeing it immediately. The cluster will be freed
+        * after discard.
+        */
+       if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
+           (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
+               swap_cluster_schedule_discard(si, idx);
+               return;
+       }
+
+       __free_cluster(si, idx);
+}
+
 /*
  * The cluster corresponding to page_nr will be used. The cluster will be
  * removed from free cluster list and its usage counter will be increased.
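
Note on the flags test in free_cluster() above: a discard is only scheduled when both SWP_WRITEOK and SWP_PAGE_DISCARD are set, which is what masking with the OR of both flags and comparing against that same mask checks. A tiny stand-alone demonstration of the idiom (the flag values below are invented for the example, not the kernel's):

/* "All of these flags set" test as used in free_cluster().  Flag values
 * are invented for this example. */
#include <stdio.h>

#define EX_WRITEOK      0x1
#define EX_PAGE_DISCARD 0x2

static int discardable(unsigned int flags)
{
        unsigned int mask = EX_WRITEOK | EX_PAGE_DISCARD;

        return (flags & mask) == mask;  /* true only if both bits are set */
}

int main(void)
{
        printf("%d %d %d\n",
               discardable(EX_WRITEOK),                         /* 0 */
               discardable(EX_PAGE_DISCARD),                    /* 0 */
               discardable(EX_WRITEOK | EX_PAGE_DISCARD));      /* 1 */
        return 0;
}
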
@@ -426,11 +464,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
 
        if (!cluster_info)
                return;
-       if (cluster_is_free(&cluster_info[idx])) {
-               VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx);
-               cluster_list_del_first(&p->free_clusters, cluster_info);
-               cluster_set_count_flag(&cluster_info[idx], 0, 0);
-       }
+       if (cluster_is_free(&cluster_info[idx]))
+               alloc_cluster(p, idx);
 
        VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
        cluster_set_count(&cluster_info[idx],
@@ -454,21 +489,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
        cluster_set_count(&cluster_info[idx],
                cluster_count(&cluster_info[idx]) - 1);
 
-       if (cluster_count(&cluster_info[idx]) == 0) {
-               /*
-                * If the swap is discardable, prepare discard the cluster
-                * instead of free it immediately. The cluster will be freed
-                * after discard.
-                */
-               if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
-                                (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
-                       swap_cluster_schedule_discard(p, idx);
-                       return;
-               }
-
-               cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
-               cluster_list_add_tail(&p->free_clusters, cluster_info, idx);
-       }
+       if (cluster_count(&cluster_info[idx]) == 0)
+               free_cluster(p, idx);
 }
 
 /*
@@ -558,6 +580,60 @@ new_cluster:
        return found_free;
 }
 
+static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+                            unsigned int nr_entries)
+{
+       unsigned int end = offset + nr_entries - 1;
+
+       if (offset == si->lowest_bit)
+               si->lowest_bit += nr_entries;
+       if (end == si->highest_bit)
+               si->highest_bit -= nr_entries;
+       si->inuse_pages += nr_entries;
+       if (si->inuse_pages == si->pages) {
+               si->lowest_bit = si->max;
+               si->highest_bit = 0;
+               spin_lock(&swap_avail_lock);
+               plist_del(&si->avail_list, &swap_avail_head);
+               spin_unlock(&swap_avail_lock);
+       }
+}
+
+static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
+                           unsigned int nr_entries)
+{
+       unsigned long end = offset + nr_entries - 1;
+       void (*swap_slot_free_notify)(struct block_device *, unsigned long);
+
+       if (offset < si->lowest_bit)
+               si->lowest_bit = offset;
+       if (end > si->highest_bit) {
+               bool was_full = !si->highest_bit;
+
+               si->highest_bit = end;
+               if (was_full && (si->flags & SWP_WRITEOK)) {
+                       spin_lock(&swap_avail_lock);
+                       WARN_ON(!plist_node_empty(&si->avail_list));
+                       if (plist_node_empty(&si->avail_list))
+                               plist_add(&si->avail_list, &swap_avail_head);
+                       spin_unlock(&swap_avail_lock);
+               }
+       }
+       atomic_long_add(nr_entries, &nr_swap_pages);
+       si->inuse_pages -= nr_entries;
+       if (si->flags & SWP_BLKDEV)
+               swap_slot_free_notify =
+                       si->bdev->bd_disk->fops->swap_slot_free_notify;
+       else
+               swap_slot_free_notify = NULL;
+       while (offset <= end) {
+               frontswap_invalidate_page(si->type, offset);
+               if (swap_slot_free_notify)
+                       swap_slot_free_notify(si->bdev, offset);
+               offset++;
+       }
+}
+
 static int scan_swap_map_slots(struct swap_info_struct *si,
                               unsigned char usage, int nr,
                               swp_entry_t slots[])
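
swap_range_alloc() and swap_range_free() factor the lowest_bit/highest_bit scan-window and inuse_pages bookkeeping out of the per-slot paths so the same helpers also serve whole-cluster allocations (nr_entries == SWAPFILE_CLUSTER). A minimal user-space sketch of that window update, with an illustrative structure that only mimics the relevant swap_info_struct fields:

/* Minimal sketch of the scan-window bookkeeping centralised by
 * swap_range_alloc()/swap_range_free().  Illustrative only. */
#include <stdio.h>

struct window {
        unsigned long lowest_bit;       /* first offset worth scanning */
        unsigned long highest_bit;      /* last offset worth scanning */
        unsigned long inuse_pages;
};

static void range_alloc(struct window *w, unsigned long off, unsigned long nr)
{
        if (off == w->lowest_bit)
                w->lowest_bit += nr;            /* window shrinks from below */
        if (off + nr - 1 == w->highest_bit)
                w->highest_bit -= nr;           /* window shrinks from above */
        w->inuse_pages += nr;
}

static void range_free(struct window *w, unsigned long off, unsigned long nr)
{
        if (off < w->lowest_bit)
                w->lowest_bit = off;            /* window grows downwards */
        if (off + nr - 1 > w->highest_bit)
                w->highest_bit = off + nr - 1;  /* window grows upwards */
        w->inuse_pages -= nr;
}

int main(void)
{
        struct window w = { .lowest_bit = 1, .highest_bit = 1023 };

        range_alloc(&w, 1, 512);        /* take one 512-slot cluster */
        range_free(&w, 1, 512);         /* and give it back */
        printf("lowest %lu highest %lu inuse %lu\n",
               w.lowest_bit, w.highest_bit, w.inuse_pages);
        return 0;
}
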
@@ -676,18 +752,7 @@ checks:
        inc_cluster_info_page(si, si->cluster_info, offset);
        unlock_cluster(ci);
 
-       if (offset == si->lowest_bit)
-               si->lowest_bit++;
-       if (offset == si->highest_bit)
-               si->highest_bit--;
-       si->inuse_pages++;
-       if (si->inuse_pages == si->pages) {
-               si->lowest_bit = si->max;
-               si->highest_bit = 0;
-               spin_lock(&swap_avail_lock);
-               plist_del(&si->avail_list, &swap_avail_head);
-               spin_unlock(&swap_avail_lock);
-       }
+       swap_range_alloc(si, offset, 1);
        si->cluster_next = offset + 1;
        slots[n_ret++] = swp_entry(si->type, offset);
 
@@ -766,6 +831,52 @@ no_page:
        return n_ret;
 }
 
+#ifdef CONFIG_THP_SWAP
+static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+{
+       unsigned long idx;
+       struct swap_cluster_info *ci;
+       unsigned long offset, i;
+       unsigned char *map;
+
+       if (cluster_list_empty(&si->free_clusters))
+               return 0;
+
+       idx = cluster_list_first(&si->free_clusters);
+       offset = idx * SWAPFILE_CLUSTER;
+       ci = lock_cluster(si, offset);
+       alloc_cluster(si, idx);
+       cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0);
+
+       map = si->swap_map + offset;
+       for (i = 0; i < SWAPFILE_CLUSTER; i++)
+               map[i] = SWAP_HAS_CACHE;
+       unlock_cluster(ci);
+       swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
+       *slot = swp_entry(si->type, offset);
+
+       return 1;
+}
+
+static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+       unsigned long offset = idx * SWAPFILE_CLUSTER;
+       struct swap_cluster_info *ci;
+
+       ci = lock_cluster(si, offset);
+       cluster_set_count_flag(ci, 0, 0);
+       free_cluster(si, idx);
+       unlock_cluster(ci);
+       swap_range_free(si, offset, SWAPFILE_CLUSTER);
+}
+#else
+static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+{
+       VM_WARN_ON_ONCE(1);
+       return 0;
+}
+#endif /* CONFIG_THP_SWAP */
+
 static unsigned long scan_swap_map(struct swap_info_struct *si,
                                   unsigned char usage)
 {
@@ -781,13 +892,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
 }
 
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
+int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
 {
+       unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
        struct swap_info_struct *si, *next;
        long avail_pgs;
        int n_ret = 0;
 
-       avail_pgs = atomic_long_read(&nr_swap_pages);
+       /* Only single cluster request supported */
+       WARN_ON_ONCE(n_goal > 1 && cluster);
+
+       avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
        if (avail_pgs <= 0)
                goto noswap;
 
@@ -797,7 +912,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
        if (n_goal > avail_pgs)
                n_goal = avail_pgs;
 
-       atomic_long_sub(n_goal, &nr_swap_pages);
+       atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
 
        spin_lock(&swap_avail_lock);
 
@@ -823,10 +938,13 @@ start_over:
                        spin_unlock(&si->lock);
                        goto nextsi;
                }
-               n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
-                                           n_goal, swp_entries);
+               if (cluster)
+                       n_ret = swap_alloc_cluster(si, swp_entries);
+               else
+                       n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+                                                   n_goal, swp_entries);
                spin_unlock(&si->lock);
-               if (n_ret)
+               if (n_ret || cluster)
                        goto check_out;
                pr_debug("scan_swap_map of si %d failed to find offset\n",
                        si->type);
@@ -852,7 +970,8 @@ nextsi:
 
 check_out:
        if (n_ret < n_goal)
-               atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages);
+               atomic_long_add((long)(n_goal - n_ret) * nr_pages,
+                               &nr_swap_pages);
 noswap:
        return n_ret;
 }
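
When cluster is true, every successful get_swap_pages() call consumes SWAPFILE_CLUSTER slots, so nr_swap_pages is checked in cluster-sized units up front and debited by n_goal * nr_pages, with any shortfall credited back at check_out. A short worked example of that accounting (stand-alone C with assumed numbers, not the kernel function):

/* Worked example of the nr_swap_pages accounting for a cluster request.
 * All numbers are assumed for illustration. */
#include <stdio.h>

int main(void)
{
        long nr_swap_pages = 10000;     /* free slots, assumed */
        long nr_pages = 512;            /* SWAPFILE_CLUSTER for a THP request */
        long n_goal = 1;                /* only single-cluster requests allowed */
        long n_ret;

        long avail = nr_swap_pages / nr_pages;  /* 19 whole clusters available */
        if (n_goal > avail)
                n_goal = avail;
        nr_swap_pages -= n_goal * nr_pages;     /* debit up front: 10000 - 512 */

        n_ret = 1;                              /* suppose the allocation succeeded */
        if (n_ret < n_goal)                     /* otherwise credit the shortfall back */
                nr_swap_pages += (n_goal - n_ret) * nr_pages;

        printf("free slots now %ld\n", nr_swap_pages);  /* 9488 */
        return 0;
}
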
@@ -1008,32 +1127,8 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
        dec_cluster_info_page(p, p->cluster_info, offset);
        unlock_cluster(ci);
 
-       mem_cgroup_uncharge_swap(entry);
-       if (offset < p->lowest_bit)
-               p->lowest_bit = offset;
-       if (offset > p->highest_bit) {
-               bool was_full = !p->highest_bit;
-
-               p->highest_bit = offset;
-               if (was_full && (p->flags & SWP_WRITEOK)) {
-                       spin_lock(&swap_avail_lock);
-                       WARN_ON(!plist_node_empty(&p->avail_list));
-                       if (plist_node_empty(&p->avail_list))
-                               plist_add(&p->avail_list,
-                                         &swap_avail_head);
-                       spin_unlock(&swap_avail_lock);
-               }
-       }
-       atomic_long_inc(&nr_swap_pages);
-       p->inuse_pages--;
-       frontswap_invalidate_page(p->type, offset);
-       if (p->flags & SWP_BLKDEV) {
-               struct gendisk *disk = p->bdev->bd_disk;
-
-               if (disk->fops->swap_slot_free_notify)
-                       disk->fops->swap_slot_free_notify(p->bdev,
-                                                         offset);
-       }
+       mem_cgroup_uncharge_swap(entry, 1);
+       swap_range_free(p, offset, 1);
 }
 
 /*
@@ -1054,7 +1149,7 @@ void swap_free(swp_entry_t entry)
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */
-void swapcache_free(swp_entry_t entry)
+static void swapcache_free(swp_entry_t entry)
 {
        struct swap_info_struct *p;
 
@@ -1065,6 +1160,52 @@ void swapcache_free(swp_entry_t entry)
        }
 }
 
+#ifdef CONFIG_THP_SWAP
+static void swapcache_free_cluster(swp_entry_t entry)
+{
+       unsigned long offset = swp_offset(entry);
+       unsigned long idx = offset / SWAPFILE_CLUSTER;
+       struct swap_cluster_info *ci;
+       struct swap_info_struct *si;
+       unsigned char *map;
+       unsigned int i;
+
+       si = swap_info_get(entry);
+       if (!si)
+               return;
+
+       ci = lock_cluster(si, offset);
+       map = si->swap_map + offset;
+       for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+               VM_BUG_ON(map[i] != SWAP_HAS_CACHE);
+               map[i] = 0;
+       }
+       unlock_cluster(ci);
+       mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+       swap_free_cluster(si, idx);
+       spin_unlock(&si->lock);
+}
+#else
+static inline void swapcache_free_cluster(swp_entry_t entry)
+{
+}
+#endif /* CONFIG_THP_SWAP */
+
+void put_swap_page(struct page *page, swp_entry_t entry)
+{
+       if (!PageTransHuge(page))
+               swapcache_free(entry);
+       else
+               swapcache_free_cluster(entry);
+}
+
+static int swp_entry_cmp(const void *ent1, const void *ent2)
+{
+       const swp_entry_t *e1 = ent1, *e2 = ent2;
+
+       return (int)swp_type(*e1) - (int)swp_type(*e2);
+}
+
 void swapcache_free_entries(swp_entry_t *entries, int n)
 {
        struct swap_info_struct *p, *prev;
@@ -1075,6 +1216,14 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
 
        prev = NULL;
        p = NULL;
+
+       /*
+        * Sort swap entries by swap device, so each lock is only taken once.
+        * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
+        * so low that it isn't necessary to optimize further.
+        */
+       if (nr_swapfiles > 1)
+               sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
        for (i = 0; i < n; ++i) {
                p = swap_info_get_cont(entries[i], prev);
                if (p)
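
The sort() call above orders the batch by swap device before the loop walks it, so swap_info_get_cont() takes each device's lock only once per batch instead of bouncing between devices. The same idea in stand-alone user-space C, with qsort() standing in for the kernel's sort() and a simplified two-field entry instead of swp_entry_t (both are assumptions for the illustration):

/* User-space illustration of sorting a batch of swap entries by device
 * so each device is visited once.  The entry layout is simplified. */
#include <stdio.h>
#include <stdlib.h>

struct entry {
        unsigned int type;      /* which swap device */
        unsigned long offset;   /* slot on that device */
};

static int entry_cmp(const void *a, const void *b)
{
        const struct entry *e1 = a, *e2 = b;

        return (int)e1->type - (int)e2->type;
}

int main(void)
{
        struct entry batch[] = {
                { 1, 40 }, { 0, 7 }, { 1, 12 }, { 0, 3 },
        };
        size_t i, n = sizeof(batch) / sizeof(batch[0]);

        qsort(batch, n, sizeof(batch[0]), entry_cmp);
        /* All device-0 entries now precede device-1 entries, so a walk of
         * the batch switches device (and lock) only once. */
        for (i = 0; i < n; i++)
                printf("dev %u slot %lu\n", batch[i].type, batch[i].offset);
        return 0;
}
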
@@ -1719,7 +1868,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
                swap_map = &si->swap_map[i];
                entry = swp_entry(type, i);
                page = read_swap_cache_async(entry,
-                                       GFP_HIGHUSER_MOVABLE, NULL, 0);
+                                       GFP_HIGHUSER_MOVABLE, NULL, 0, false);
                if (!page) {
                        /*
                         * Either swap_duplicate() failed because entry