mm/page_io.c

   1 /*
   2  *  linux/mm/page_io.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  *
   6  *  Swap reorganised 29.12.95,
   7  *  Asynchronous swapping added 30.12.95. Stephen Tweedie
   8  *  Removed race in async swapping. 14.4.1996. Bruno Haible
   9  *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
  10  *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
  11  */
  12
  13 #include <linux/mm.h>
  14 #include <linux/kernel_stat.h>
  15 #include <linux/gfp.h>
  16 #include <linux/pagemap.h>
  17 #include <linux/swap.h>
  18 #include <linux/bio.h>
  19 #include <linux/swapops.h>
  20 #include <linux/buffer_head.h>
  21 #include <linux/writeback.h>
  22 #include <linux/frontswap.h>
  23 #include <linux/blkdev.h>
  24 #include <linux/uio.h>
  25 #include <linux/sched/task.h>
  26 #include <asm/pgtable.h>
  27
  28 static struct bio *get_swap_bio(gfp_t gfp_flags,
  29                                 struct page *page, bio_end_io_t end_io)
  30 {
  31         struct bio *bio;
  32
  33         bio = bio_alloc(gfp_flags, 1);
  34         if (bio) {
  35                 bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
  36                 bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
  37                 bio->bi_end_io = end_io;
  38
  39                 bio_add_page(bio, page, PAGE_SIZE, 0);
  40                 BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE);
  41         }
  42         return bio;
  43 }
  44
  45 void end_swap_bio_write(struct bio *bio)
  46 {
  47         struct page *page = bio->bi_io_vec[0].bv_page;
  48
  49         if (bio->bi_status) {
  50                 SetPageError(page);
  51                 /*
  52                  * We failed to write the page out to swap-space.
  53                  * Re-dirty the page in order to avoid it being reclaimed.
  54                  * Also print a dire warning that things will go BAD (tm)
  55                  * very quickly.
  56                  *
  57                  * Also clear PG_reclaim to avoid rotate_reclaimable_page()
  58                  */
  59                 set_page_dirty(page);
  60                 pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
  61                          imajor(bio->bi_bdev->bd_inode),
  62                          iminor(bio->bi_bdev->bd_inode),
  63                          (unsigned long long)bio->bi_iter.bi_sector);
  64                 ClearPageReclaim(page);
  65         }
  66         end_page_writeback(page);
  67         bio_put(bio);
  68 }
  69
  70 static void swap_slot_free_notify(struct page *page)
  71 {
  72         struct swap_info_struct *sis;
  73         struct gendisk *disk;
  74
  75         /*
  76          * There is no guarantee that the page is in swap cache - the software
  77          * suspend code (at least) uses end_swap_bio_read() against a non-
  78          * swapcache page.  So we must check PG_swapcache before proceeding with
  79          * this optimization.
  80          */
  81         if (unlikely(!PageSwapCache(page)))
  82                 return;
  83
  84         sis = page_swap_info(page);
  85         if (!(sis->flags & SWP_BLKDEV))
  86                 return;
  87
  88         /*
  89          * The swap subsystem performs lazy swap slot freeing,
  90          * expecting that the page will be swapped out again.
  91          * So we can avoid an unnecessary write if the page
  92          * isn't redirtied.
  93          * This is good for real swap storage because we can
  94          * reduce unnecessary I/O and enhance wear-leveling
  95          * if an SSD is used as the as swap device.
  96          * But if in-memory swap device (eg zram) is used,
  97          * this causes a duplicated copy between uncompressed
  98          * data in VM-owned memory and compressed data in
  99          * zram-owned memory.  So let's free zram-owned memory
 100          * and make the VM-owned decompressed page *dirty*,
 101          * so the page should be swapped out somewhere again if
 102          * we again wish to reclaim it.
 103          */
 104         disk = sis->bdev->bd_disk;
 105         if (disk->fops->swap_slot_free_notify) {
 106                 swp_entry_t entry;
 107                 unsigned long offset;
 108
 109                 entry.val = page_private(page);
 110                 offset = swp_offset(entry);
 111
 112                 SetPageDirty(page);
 113                 disk->fops->swap_slot_free_notify(sis->bdev,
 114                                 offset);
 115         }
 116 }
 117
 118 static void end_swap_bio_read(struct bio *bio)
 119 {
 120         struct page *page = bio->bi_io_vec[0].bv_page;
 121         struct task_struct *waiter = bio->bi_private;
 122
 123         if (bio->bi_status) {
 124                 SetPageError(page);
 125                 ClearPageUptodate(page);
 126                 pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
 127                          imajor(bio->bi_bdev->bd_inode),
 128                          iminor(bio->bi_bdev->bd_inode),
 129                          (unsigned long long)bio->bi_iter.bi_sector);
 130                 goto out;
 131         }
 132
 133         SetPageUptodate(page);
 134         swap_slot_free_notify(page);
 135 out:
 136         unlock_page(page);
 137         WRITE_ONCE(bio->bi_private, NULL);
 138         bio_put(bio);
 139         wake_up_process(waiter);
 140         put_task_struct(waiter);
 141 }
 142
 143 int generic_swapfile_activate(struct swap_info_struct *sis,
 144                                 struct file *swap_file,
 145                                 sector_t *span)
 146 {
 147         struct address_space *mapping = swap_file->f_mapping;
 148         struct inode *inode = mapping->host;
 149         unsigned blocks_per_page;
 150         unsigned long page_no;
 151         unsigned blkbits;
 152         sector_t probe_block;
 153         sector_t last_block;
 154         sector_t lowest_block = -1;
 155         sector_t highest_block = 0;
 156         int nr_extents = 0;
 157         int ret;
 158
 159         blkbits = inode->i_blkbits;
 160         blocks_per_page = PAGE_SIZE >> blkbits;
 161
 162         /*
 163          * Map all the blocks into the extent list.  This code doesn't try
 164          * to be very smart.
 165          */
 166         probe_block = 0;
 167         page_no = 0;
 168         last_block = i_size_read(inode) >> blkbits;
 169         while ((probe_block + blocks_per_page) <= last_block &&
 170                         page_no < sis->max) {
 171                 unsigned block_in_page;
 172                 sector_t first_block;
 173
 174                 cond_resched();
 175
 176                 first_block = bmap(inode, probe_block);
 177                 if (first_block == 0)
 178                         goto bad_bmap;
 179
 180                 /*
 181                  * It must be PAGE_SIZE aligned on-disk
 182                  */
 183                 if (first_block & (blocks_per_page - 1)) {
 184                         probe_block++;
 185                         goto reprobe;
 186                 }
 187
 188                 for (block_in_page = 1; block_in_page < blocks_per_page;
 189                                         block_in_page++) {
 190                         sector_t block;
 191
 192                         block = bmap(inode, probe_block + block_in_page);
 193                         if (block == 0)
 194                                 goto bad_bmap;
 195                         if (block != first_block + block_in_page) {
 196                                 /* Discontiguity */
 197                                 probe_block++;
 198                                 goto reprobe;
 199                         }
 200                 }
 201
 202                 first_block >>= (PAGE_SHIFT - blkbits);
 203                 if (page_no) {  /* exclude the header page */
 204                         if (first_block < lowest_block)
 205                                 lowest_block = first_block;
 206                         if (first_block > highest_block)
 207                                 highest_block = first_block;
 208                 }
 209
 210                 /*
 211                  * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
 212                  */
 213                 ret = add_swap_extent(sis, page_no, 1, first_block);
 214                 if (ret < 0)
 215                         goto out;
 216                 nr_extents += ret;
 217                 page_no++;
 218                 probe_block += blocks_per_page;
 219 reprobe:
 220                 continue;
 221         }
 222         ret = nr_extents;
 223         *span = 1 + highest_block - lowest_block;
 224         if (page_no == 0)
 225                 page_no = 1;    /* force Empty message */
 226         sis->max = page_no;
 227         sis->pages = page_no - 1;
 228         sis->highest_bit = page_no - 1;
 229 out:
 230         return ret;
 231 bad_bmap:
 232         pr_err("swapon: swapfile has holes\n");
 233         ret = -EINVAL;
 234         goto out;
 235 }
 236
 237 /*
 238  * We may have stale swap cache pages in memory: notice
 239  * them here and get rid of the unnecessary final write.
 240  */
 241 int swap_writepage(struct page *page, struct writeback_control *wbc)
 242 {
 243         int ret = 0;
 244
 245         if (try_to_free_swap(page)) {
 246                 unlock_page(page);
 247                 goto out;
 248         }
 249         if (frontswap_store(page) == 0) {
 250                 set_page_writeback(page);
 251                 unlock_page(page);
 252                 end_page_writeback(page);
 253                 goto out;
 254         }
 255         ret = __swap_writepage(page, wbc, end_swap_bio_write);
 256 out:
 257         return ret;
 258 }
 259
 260 static sector_t swap_page_sector(struct page *page)
 261 {
 262         return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
 263 }
 264
 265 int __swap_writepage(struct page *page, struct writeback_control *wbc,
 266                 bio_end_io_t end_write_func)
 267 {
 268         struct bio *bio;
 269         int ret;
 270         struct swap_info_struct *sis = page_swap_info(page);
 271
 272         VM_BUG_ON_PAGE(!PageSwapCache(page), page);
 273         if (sis->flags & SWP_FILE) {
 274                 struct kiocb kiocb;
 275                 struct file *swap_file = sis->swap_file;
 276                 struct address_space *mapping = swap_file->f_mapping;
 277                 struct bio_vec bv = {
 278                         .bv_page = page,
 279                         .bv_len  = PAGE_SIZE,
 280                         .bv_offset = 0
 281                 };
 282                 struct iov_iter from;
 283
 284                 iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
 285                 init_sync_kiocb(&kiocb, swap_file);
 286                 kiocb.ki_pos = page_file_offset(page);
 287
 288                 set_page_writeback(page);
 289                 unlock_page(page);
 290                 ret = mapping->a_ops->direct_IO(&kiocb, &from);
 291                 if (ret == PAGE_SIZE) {
 292                         count_vm_event(PSWPOUT);
 293                         ret = 0;
 294                 } else {
 295                         /*
 296                          * In the case of swap-over-nfs, this can be a
 297                          * temporary failure if the system has limited
 298                          * memory for allocating transmit buffers.
 299                          * Mark the page dirty and avoid
 300                          * rotate_reclaimable_page but rate-limit the
 301                          * messages but do not flag PageError like
 302                          * the normal direct-to-bio case as it could
 303                          * be temporary.
 304                          */
 305                         set_page_dirty(page);
 306                         ClearPageReclaim(page);
 307                         pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
 308                                            page_file_offset(page));
 309                 }
 310                 end_page_writeback(page);
 311                 return ret;
 312         }
 313
 314         ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
 315         if (!ret) {
 316                 count_vm_event(PSWPOUT);
 317                 return 0;
 318         }
 319
 320         ret = 0;
 321         bio = get_swap_bio(GFP_NOIO, page, end_write_func);
 322         if (bio == NULL) {
 323                 set_page_dirty(page);
 324                 unlock_page(page);
 325                 ret = -ENOMEM;
 326                 goto out;
 327         }
 328         bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 329         count_vm_event(PSWPOUT);
 330         set_page_writeback(page);
 331         unlock_page(page);
 332         submit_bio(bio);
 333 out:
 334         return ret;
 335 }
 336
 337 int swap_readpage(struct page *page, bool do_poll)
 338 {
 339         struct bio *bio;
 340         int ret = 0;
 341         struct swap_info_struct *sis = page_swap_info(page);
 342         blk_qc_t qc;
 343         struct block_device *bdev;
 344
 345         VM_BUG_ON_PAGE(!PageSwapCache(page), page);
 346         VM_BUG_ON_PAGE(!PageLocked(page), page);
 347         VM_BUG_ON_PAGE(PageUptodate(page), page);
 348         if (frontswap_load(page) == 0) {
 349                 SetPageUptodate(page);
 350                 unlock_page(page);
 351                 goto out;
 352         }
 353
 354         if (sis->flags & SWP_FILE) {
 355                 struct file *swap_file = sis->swap_file;
 356                 struct address_space *mapping = swap_file->f_mapping;
 357
 358                 ret = mapping->a_ops->readpage(swap_file, page);
 359                 if (!ret)
 360                         count_vm_event(PSWPIN);
 361                 return ret;
 362         }
 363
 364         ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
 365         if (!ret) {
 366                 if (trylock_page(page)) {
 367                         swap_slot_free_notify(page);
 368                         unlock_page(page);
 369                 }
 370
 371                 count_vm_event(PSWPIN);
 372                 return 0;
 373         }
 374
 375         ret = 0;
 376         bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
 377         if (bio == NULL) {
 378                 unlock_page(page);
 379                 ret = -ENOMEM;
 380                 goto out;
 381         }
 382         bdev = bio->bi_bdev;
 383         /*
 384          * Keep this task valid during swap readpage because the oom killer may
 385          * attempt to access it in the page fault retry time check.
 386          */
 387         get_task_struct(current);
 388         bio->bi_private = current;
 389         bio_set_op_attrs(bio, REQ_OP_READ, 0);
 390         count_vm_event(PSWPIN);
 391         bio_get(bio);
 392         qc = submit_bio(bio);
 393         while (do_poll) {
 394                 set_current_state(TASK_UNINTERRUPTIBLE);
 395                 if (!READ_ONCE(bio->bi_private))
 396                         break;
 397
 398                 if (!blk_mq_poll(bdev_get_queue(bdev), qc))
 399                         break;
 400         }
 401         __set_current_state(TASK_RUNNING);
 402         bio_put(bio);
 403
 404 out:
 405         return ret;
 406 }
 407
 408 int swap_set_page_dirty(struct page *page)
 409 {
 410         struct swap_info_struct *sis = page_swap_info(page);
 411
 412         if (sis->flags & SWP_FILE) {
 413                 struct address_space *mapping = sis->swap_file->f_mapping;
 414
 415                 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
 416                 return mapping->a_ops->set_page_dirty(page);
 417         } else {
 418                 return __set_page_dirty_no_writeback(page);
 419         }
 420 }