/*
 * Compressed RAM based swap device
 *
 * Copyright (C) 2008, 2009, 2010  Nitin Gupta
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the licence that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 *
 * Project home: http://compcache.googlecode.com
 */
15 #define KMSG_COMPONENT "ramzswap"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18 #include <linux/module.h>
19 #include <linux/kernel.h>
20 #include <linux/bitops.h>
21 #include <linux/blkdev.h>
22 #include <linux/buffer_head.h>
23 #include <linux/device.h>
24 #include <linux/genhd.h>
25 #include <linux/highmem.h>
26 #include <linux/slab.h>
27 #include <linux/lzo.h>
28 #include <linux/string.h>
29 #include <linux/swap.h>
30 #include <linux/swapops.h>
31 #include <linux/vmalloc.h>
33 #include "ramzswap_drv.h"
/* Globals */

/* Dynamically allocated block-device major number for all ramzswap disks */
static int ramzswap_major;
/* Array of per-device state, sized by the num_devices module param */
static struct ramzswap *devices;

/*
 * Pages that compress to larger than this size are
 * forwarded to backing swap, if present or stored
 * uncompressed in memory otherwise.
 */
static unsigned int max_zpage_size;

/* Module params (documentation at end) */
static unsigned int num_devices;
49 static int rzs_test_flag(struct ramzswap *rzs, u32 index,
50 enum rzs_pageflags flag)
52 return rzs->table[index].flags & BIT(flag);
55 static void rzs_set_flag(struct ramzswap *rzs, u32 index,
56 enum rzs_pageflags flag)
58 rzs->table[index].flags |= BIT(flag);
61 static void rzs_clear_flag(struct ramzswap *rzs, u32 index,
62 enum rzs_pageflags flag)
64 rzs->table[index].flags &= ~BIT(flag);
67 static int page_zero_filled(void *ptr)
72 page = (unsigned long *)ptr;
74 for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
83 * memlimit cannot be greater than backing disk size.
85 static void ramzswap_set_memlimit(struct ramzswap *rzs, size_t totalram_bytes)
87 int memlimit_valid = 1;
90 pr_info("Memory limit not set.\n");
94 if (rzs->memlimit > rzs->disksize) {
95 pr_info("Memory limit cannot be greater than "
96 "disksize: limit=%zu, disksize=%zu\n",
97 rzs->memlimit, rzs->disksize);
101 if (!memlimit_valid) {
102 size_t mempart, disksize;
103 pr_info("Using default: smaller of (%u%% of RAM) and "
104 "(backing disk size).\n",
105 default_memlimit_perc_ram);
106 mempart = default_memlimit_perc_ram * (totalram_bytes / 100);
107 disksize = rzs->disksize;
108 rzs->memlimit = mempart > disksize ? disksize : mempart;
111 if (rzs->memlimit > totalram_bytes / 2) {
113 "Its not advisable setting limit more than half of "
114 "size of memory since we expect a 2:1 compression ratio. "
115 "Limit represents amount of *compressed* data we can keep "
117 "\tMemory Size: %zu kB\n"
118 "\tLimit you selected: %zu kB\n"
119 "Continuing anyway ...\n",
120 totalram_bytes >> 10, rzs->memlimit >> 10
124 rzs->memlimit &= PAGE_MASK;
125 BUG_ON(!rzs->memlimit);
128 static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes)
130 if (!rzs->disksize) {
132 "disk size not provided. You can use disksize_kb module "
133 "param to specify size.\nUsing default: (%u%% of RAM).\n",
134 default_disksize_perc_ram
136 rzs->disksize = default_disksize_perc_ram *
137 (totalram_bytes / 100);
140 if (rzs->disksize > 2 * (totalram_bytes)) {
142 "There is little point creating a ramzswap of greater than "
143 "twice the size of memory since we expect a 2:1 compression "
144 "ratio. Note that ramzswap uses about 0.1%% of the size of "
145 "the swap device when not in use so a huge ramzswap is "
147 "\tMemory Size: %zu kB\n"
148 "\tSize you selected: %zu kB\n"
149 "Continuing anyway ...\n",
150 totalram_bytes >> 10, rzs->disksize
154 rzs->disksize &= PAGE_MASK;
158 * Swap header (1st page of swap device) contains information
159 * to indentify it as a swap partition. Prepare such a header
160 * for ramzswap device (ramzswap0) so that swapon can identify
161 * it as swap partition. In case backing swap device is provided,
162 * copy its swap header.
164 static int setup_swap_header(struct ramzswap *rzs, union swap_header *s)
168 struct address_space *mapping;
169 union swap_header *backing_swap_header;
172 * There is no backing swap device. Create a swap header
173 * that is acceptable by swapon.
175 if (!rzs->backing_swap) {
177 s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
178 s->info.nr_badpages = 0;
179 memcpy(s->magic.magic, "SWAPSPACE2", 10);
184 * We have a backing swap device. Copy its swap header
185 * to ramzswap device header. If this header contains
186 * invalid information (backing device not a swap
187 * partition, etc.), swapon will fail for ramzswap
188 * which is correct behavior - we don't want to swap
189 * over filesystem partition!
192 /* Read the backing swap header (code from sys_swapon) */
193 mapping = rzs->swap_file->f_mapping;
194 if (!mapping->a_ops->readpage) {
199 page = read_mapping_page(mapping, 0, rzs->swap_file);
205 backing_swap_header = kmap(page);
206 memcpy(s, backing_swap_header, sizeof(*s));
207 if (s->info.nr_badpages) {
208 pr_info("Cannot use backing swap with bad pages (%u)\n",
209 s->info.nr_badpages);
213 * ramzswap disksize equals number of usable pages in backing
214 * swap. Set last_page in swap header to match this disksize
215 * ('last_page' means 0-based index of last usable swap page).
217 s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
224 static void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
225 struct ramzswap_ioctl_stats *s)
227 strncpy(s->backing_swap_name, rzs->backing_swap_name,
228 MAX_SWAP_NAME_LEN - 1);
229 s->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
231 s->disksize = rzs->disksize;
232 s->memlimit = rzs->memlimit;
234 #if defined(CONFIG_RAMZSWAP_STATS)
236 struct ramzswap_stats *rs = &rzs->stats;
237 size_t succ_writes, mem_used;
238 unsigned int good_compress_perc = 0, no_compress_perc = 0;
240 mem_used = xv_get_total_size_bytes(rzs->mem_pool)
241 + (rs->pages_expand << PAGE_SHIFT);
242 succ_writes = rzs_stat64_read(rzs, &rs->num_writes) -
243 rzs_stat64_read(rzs, &rs->failed_writes);
245 if (succ_writes && rs->pages_stored) {
246 good_compress_perc = rs->good_compress * 100
248 no_compress_perc = rs->pages_expand * 100
252 s->num_reads = rzs_stat64_read(rzs, &rs->num_reads);
253 s->num_writes = rzs_stat64_read(rzs, &rs->num_writes);
254 s->failed_reads = rzs_stat64_read(rzs, &rs->failed_reads);
255 s->failed_writes = rzs_stat64_read(rzs, &rs->failed_writes);
256 s->invalid_io = rzs_stat64_read(rzs, &rs->invalid_io);
257 s->notify_free = rzs_stat64_read(rzs, &rs->notify_free);
258 s->pages_zero = rs->pages_zero;
260 s->good_compress_pct = good_compress_perc;
261 s->pages_expand_pct = no_compress_perc;
263 s->pages_stored = rs->pages_stored;
264 s->pages_used = mem_used >> PAGE_SHIFT;
265 s->orig_data_size = rs->pages_stored << PAGE_SHIFT;
266 s->compr_data_size = rs->compr_size;
267 s->mem_used_total = mem_used;
269 s->bdev_num_reads = rzs_stat64_read(rzs, &rs->bdev_num_reads);
270 s->bdev_num_writes = rzs_stat64_read(rzs, &rs->bdev_num_writes);
272 #endif /* CONFIG_RAMZSWAP_STATS */
275 static int add_backing_swap_extent(struct ramzswap *rzs,
280 struct list_head *head;
281 struct page *curr_page, *new_page;
282 unsigned int extents_per_page = PAGE_SIZE /
283 sizeof(struct ramzswap_backing_extent);
285 idx = rzs->num_extents % extents_per_page;
287 new_page = alloc_page(__GFP_ZERO);
291 if (rzs->num_extents) {
292 curr_page = virt_to_page(rzs->curr_extent);
293 head = &curr_page->lru;
295 head = &rzs->backing_swap_extent_list;
298 list_add(&new_page->lru, head);
299 rzs->curr_extent = page_address(new_page);
302 rzs->curr_extent->phy_pagenum = phy_pagenum;
303 rzs->curr_extent->num_pages = num_pages;
305 pr_debug("add_extent: idx=%u, phy_pgnum=%lu, num_pgs=%lu, "
306 "pg_last=%lu, curr_ext=%p\n", idx, phy_pagenum, num_pages,
307 phy_pagenum + num_pages - 1, rzs->curr_extent);
309 if (idx != extents_per_page - 1)
315 static int setup_backing_swap_extents(struct ramzswap *rzs,
316 struct inode *inode, unsigned long *num_pages)
320 unsigned blocks_per_page;
321 pgoff_t contig_pages = 0, total_pages = 0;
322 pgoff_t pagenum = 0, prev_pagenum = 0;
323 sector_t probe_block = 0;
326 blkbits = inode->i_blkbits;
327 blocks_per_page = PAGE_SIZE >> blkbits;
329 last_block = i_size_read(inode) >> blkbits;
330 while (probe_block + blocks_per_page <= last_block) {
331 unsigned block_in_page;
332 sector_t first_block;
334 first_block = bmap(inode, probe_block);
335 if (first_block == 0)
338 /* It must be PAGE_SIZE aligned on-disk */
339 if (first_block & (blocks_per_page - 1)) {
344 /* All blocks within this page must be contiguous on disk */
345 for (block_in_page = 1; block_in_page < blocks_per_page;
349 block = bmap(inode, probe_block + block_in_page);
352 if (block != first_block + block_in_page) {
360 * We found a PAGE_SIZE length, PAGE_SIZE aligned
363 pagenum = first_block >> (PAGE_SHIFT - blkbits);
365 if (total_pages && (pagenum != prev_pagenum + 1)) {
366 ret = add_backing_swap_extent(rzs, prev_pagenum -
367 (contig_pages - 1), contig_pages);
375 prev_pagenum = pagenum;
376 probe_block += blocks_per_page;
383 pr_debug("adding last extent: pagenum=%lu, "
384 "contig_pages=%lu\n", pagenum, contig_pages);
385 ret = add_backing_swap_extent(rzs,
386 prev_pagenum - (contig_pages - 1), contig_pages);
391 if (!rzs->num_extents) {
392 pr_err("No swap extents found!\n");
397 *num_pages = total_pages;
398 pr_info("Found %lu extents containing %luk\n",
399 rzs->num_extents, *num_pages << (PAGE_SHIFT - 10));
404 pr_err("Backing swapfile has holes\n");
407 while (ret && !list_empty(&rzs->backing_swap_extent_list)) {
409 struct list_head *entry = rzs->backing_swap_extent_list.next;
410 page = list_entry(entry, struct page, lru);
417 static void map_backing_swap_extents(struct ramzswap *rzs)
419 struct ramzswap_backing_extent *se;
420 struct page *table_page, *se_page;
421 unsigned long num_pages, num_table_pages, entry;
422 unsigned long se_idx, span;
423 unsigned entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
424 unsigned extents_per_page = PAGE_SIZE / sizeof(*se);
426 /* True for block device */
427 if (!rzs->num_extents)
430 se_page = list_entry(rzs->backing_swap_extent_list.next,
432 se = page_address(se_page);
433 span = se->num_pages;
434 num_pages = rzs->disksize >> PAGE_SHIFT;
435 num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
440 while (num_table_pages--) {
441 table_page = vmalloc_to_page(&rzs->table[entry]);
442 while (span <= entry) {
444 if (se_idx == rzs->num_extents)
447 if (!(se_idx % extents_per_page)) {
448 se_page = list_entry(se_page->lru.next,
450 se = page_address(se_page);
454 span += se->num_pages;
456 table_page->mapping = (struct address_space *)se;
457 table_page->private = se->num_pages - (span - entry);
458 pr_debug("map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
459 entry, span, table_page->mapping, table_page->private);
460 entry += entries_per_page;
465 * Check if value of backing_swap module param is sane.
466 * Claim this device and set ramzswap size equal to
467 * size of this block device.
469 static int setup_backing_swap(struct ramzswap *rzs)
473 unsigned long num_pages = 0;
475 struct file *swap_file;
476 struct address_space *mapping;
477 struct block_device *bdev = NULL;
479 if (!rzs->backing_swap_name[0]) {
480 pr_debug("backing_swap param not given\n");
484 pr_info("Using backing swap device: %s\n", rzs->backing_swap_name);
486 swap_file = filp_open(rzs->backing_swap_name,
487 O_RDWR | O_LARGEFILE, 0);
488 if (IS_ERR(swap_file)) {
489 pr_err("Error opening backing device: %s\n",
490 rzs->backing_swap_name);
495 mapping = swap_file->f_mapping;
496 inode = mapping->host;
498 if (S_ISBLK(inode->i_mode)) {
499 bdev = I_BDEV(inode);
500 ret = bd_claim(bdev, setup_backing_swap);
505 disksize = i_size_read(inode);
507 * Can happen if user gives an extended partition as
508 * backing swap or simply a bad disk.
511 pr_err("Error reading backing swap size.\n");
514 } else if (S_ISREG(inode->i_mode)) {
515 bdev = inode->i_sb->s_bdev;
516 if (IS_SWAPFILE(inode)) {
520 ret = setup_backing_swap_extents(rzs, inode, &num_pages);
523 disksize = num_pages << PAGE_SHIFT;
528 rzs->swap_file = swap_file;
529 rzs->backing_swap = bdev;
530 rzs->disksize = disksize;
537 filp_close(swap_file, NULL);
540 rzs->backing_swap = NULL;
545 * Map logical page number 'pagenum' to physical page number
546 * on backing swap device. For block device, this is a nop.
548 static u32 map_backing_swap_page(struct ramzswap *rzs, u32 pagenum)
550 u32 skip_pages, entries_per_page;
551 size_t delta, se_offset, skipped;
552 struct page *table_page, *se_page;
553 struct ramzswap_backing_extent *se;
555 if (!rzs->num_extents)
558 entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
560 table_page = vmalloc_to_page(&rzs->table[pagenum]);
561 se = (struct ramzswap_backing_extent *)table_page->mapping;
562 se_page = virt_to_page(se);
564 skip_pages = pagenum - (pagenum / entries_per_page * entries_per_page);
565 se_offset = table_page->private + skip_pages;
567 if (se_offset < se->num_pages)
568 return se->phy_pagenum + se_offset;
570 skipped = se->num_pages - table_page->private;
572 struct ramzswap_backing_extent *se_base;
573 u32 se_entries_per_page = PAGE_SIZE / sizeof(*se);
575 /* Get next swap extent */
576 se_base = (struct ramzswap_backing_extent *)
577 page_address(se_page);
578 if (se - se_base == se_entries_per_page - 1) {
579 se_page = list_entry(se_page->lru.next,
581 se = page_address(se_page);
586 skipped += se->num_pages;
587 } while (skipped < skip_pages);
589 delta = skipped - skip_pages;
590 se_offset = se->num_pages - delta;
592 return se->phy_pagenum + se_offset;
595 static void ramzswap_free_page(struct ramzswap *rzs, size_t index)
600 struct page *page = rzs->table[index].page;
601 u32 offset = rzs->table[index].offset;
603 if (unlikely(!page)) {
605 * No memory is allocated for zero filled pages.
606 * Simply clear zero page flag.
608 if (rzs_test_flag(rzs, index, RZS_ZERO)) {
609 rzs_clear_flag(rzs, index, RZS_ZERO);
610 rzs_stat_dec(&rzs->stats.pages_zero);
615 if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED))) {
618 rzs_clear_flag(rzs, index, RZS_UNCOMPRESSED);
619 rzs_stat_dec(&rzs->stats.pages_expand);
623 obj = kmap_atomic(page, KM_USER0) + offset;
624 clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
625 kunmap_atomic(obj, KM_USER0);
627 xv_free(rzs->mem_pool, page, offset);
628 if (clen <= PAGE_SIZE / 2)
629 rzs_stat_dec(&rzs->stats.good_compress);
632 rzs->stats.compr_size -= clen;
633 rzs_stat_dec(&rzs->stats.pages_stored);
635 rzs->table[index].page = NULL;
636 rzs->table[index].offset = 0;
639 static int handle_zero_page(struct bio *bio)
642 struct page *page = bio->bi_io_vec[0].bv_page;
644 user_mem = kmap_atomic(page, KM_USER0);
645 memset(user_mem, 0, PAGE_SIZE);
646 kunmap_atomic(user_mem, KM_USER0);
648 flush_dcache_page(page);
650 set_bit(BIO_UPTODATE, &bio->bi_flags);
655 static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
659 unsigned char *user_mem, *cmem;
661 page = bio->bi_io_vec[0].bv_page;
662 index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
664 user_mem = kmap_atomic(page, KM_USER0);
665 cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
666 rzs->table[index].offset;
668 memcpy(user_mem, cmem, PAGE_SIZE);
669 kunmap_atomic(user_mem, KM_USER0);
670 kunmap_atomic(cmem, KM_USER1);
672 flush_dcache_page(page);
674 set_bit(BIO_UPTODATE, &bio->bi_flags);
680 * Called when request page is not present in ramzswap.
681 * Its either in backing swap device (if present) or
682 * this is an attempt to read before any previous write
683 * to this location - this happens due to readahead when
684 * swap device is read from user-space (e.g. during swapon)
686 static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
689 * Always forward such requests to backing swap
690 * device (if present)
692 if (rzs->backing_swap) {
694 rzs_stat64_dec(rzs, &rzs->stats.num_reads);
695 rzs_stat64_inc(rzs, &rzs->stats.bdev_num_reads);
696 bio->bi_bdev = rzs->backing_swap;
699 * In case backing swap is a file, find the right offset within
700 * the file corresponding to logical position 'index'. For block
701 * device, this is a nop.
703 pagenum = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
704 bio->bi_sector = map_backing_swap_page(rzs, pagenum)
705 << SECTORS_PER_PAGE_SHIFT;
710 * Its unlikely event in case backing dev is
713 pr_debug("Read before write on swap device: "
714 "sector=%lu, size=%u, offset=%u\n",
715 (ulong)(bio->bi_sector), bio->bi_size,
716 bio->bi_io_vec[0].bv_offset);
718 /* Do nothing. Just return success */
719 set_bit(BIO_UPTODATE, &bio->bi_flags);
724 static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
730 struct zobj_header *zheader;
731 unsigned char *user_mem, *cmem;
733 rzs_stat64_inc(rzs, &rzs->stats.num_reads);
735 page = bio->bi_io_vec[0].bv_page;
736 index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
738 if (rzs_test_flag(rzs, index, RZS_ZERO))
739 return handle_zero_page(bio);
741 /* Requested page is not present in compressed area */
742 if (!rzs->table[index].page)
743 return handle_ramzswap_fault(rzs, bio);
745 /* Page is stored uncompressed since it's incompressible */
746 if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
747 return handle_uncompressed_page(rzs, bio);
749 user_mem = kmap_atomic(page, KM_USER0);
752 cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
753 rzs->table[index].offset;
755 ret = lzo1x_decompress_safe(
756 cmem + sizeof(*zheader),
757 xv_get_object_size(cmem) - sizeof(*zheader),
760 kunmap_atomic(user_mem, KM_USER0);
761 kunmap_atomic(cmem, KM_USER1);
763 /* should NEVER happen */
764 if (unlikely(ret != LZO_E_OK)) {
765 pr_err("Decompression failed! err=%d, page=%u\n",
767 rzs_stat64_inc(rzs, &rzs->stats.failed_reads);
771 flush_dcache_page(page);
773 set_bit(BIO_UPTODATE, &bio->bi_flags);
782 static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
784 int ret, fwd_write_request = 0;
787 struct zobj_header *zheader;
788 struct page *page, *page_store;
789 unsigned char *user_mem, *cmem, *src;
791 rzs_stat64_inc(rzs, &rzs->stats.num_writes);
793 page = bio->bi_io_vec[0].bv_page;
794 index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
796 src = rzs->compress_buffer;
799 * System swaps to same sector again when the stored page
800 * is no longer referenced by any process. So, its now safe
801 * to free the memory that was allocated for this page.
803 if (rzs->table[index].page || rzs_test_flag(rzs, index, RZS_ZERO))
804 ramzswap_free_page(rzs, index);
806 mutex_lock(&rzs->lock);
808 user_mem = kmap_atomic(page, KM_USER0);
809 if (page_zero_filled(user_mem)) {
810 kunmap_atomic(user_mem, KM_USER0);
811 mutex_unlock(&rzs->lock);
812 rzs_stat_inc(&rzs->stats.pages_zero);
813 rzs_set_flag(rzs, index, RZS_ZERO);
815 set_bit(BIO_UPTODATE, &bio->bi_flags);
820 if (rzs->backing_swap &&
821 (rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
822 kunmap_atomic(user_mem, KM_USER0);
823 mutex_unlock(&rzs->lock);
824 fwd_write_request = 1;
828 ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
829 rzs->compress_workmem);
831 kunmap_atomic(user_mem, KM_USER0);
833 if (unlikely(ret != LZO_E_OK)) {
834 mutex_unlock(&rzs->lock);
835 pr_err("Compression failed! err=%d\n", ret);
836 rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
841 * Page is incompressible. Forward it to backing swap
842 * if present. Otherwise, store it as-is (uncompressed)
843 * since we do not want to return too many swap write
844 * errors which has side effect of hanging the system.
846 if (unlikely(clen > max_zpage_size)) {
847 if (rzs->backing_swap) {
848 mutex_unlock(&rzs->lock);
849 fwd_write_request = 1;
854 page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
855 if (unlikely(!page_store)) {
856 mutex_unlock(&rzs->lock);
857 pr_info("Error allocating memory for incompressible "
858 "page: %u\n", index);
859 rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
864 rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
865 rzs_stat_inc(&rzs->stats.pages_expand);
866 rzs->table[index].page = page_store;
867 src = kmap_atomic(page, KM_USER0);
871 if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
872 &rzs->table[index].page, &offset,
873 GFP_NOIO | __GFP_HIGHMEM)) {
874 mutex_unlock(&rzs->lock);
875 pr_info("Error allocating memory for compressed "
876 "page: %u, size=%zu\n", index, clen);
877 rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
878 if (rzs->backing_swap)
879 fwd_write_request = 1;
884 rzs->table[index].offset = offset;
886 cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
887 rzs->table[index].offset;
890 /* Back-reference needed for memory defragmentation */
891 if (!rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)) {
892 zheader = (struct zobj_header *)cmem;
893 zheader->table_idx = index;
894 cmem += sizeof(*zheader);
898 memcpy(cmem, src, clen);
900 kunmap_atomic(cmem, KM_USER1);
901 if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
902 kunmap_atomic(src, KM_USER0);
905 rzs->stats.compr_size += clen;
906 rzs_stat_inc(&rzs->stats.pages_stored);
907 if (clen <= PAGE_SIZE / 2)
908 rzs_stat_inc(&rzs->stats.good_compress);
910 mutex_unlock(&rzs->lock);
912 set_bit(BIO_UPTODATE, &bio->bi_flags);
917 if (fwd_write_request) {
918 rzs_stat64_inc(rzs, &rzs->stats.bdev_num_writes);
919 bio->bi_bdev = rzs->backing_swap;
922 * TODO: We currently have linear mapping of ramzswap and
923 * backing swap sectors. This is not desired since we want
924 * to optimize writes to backing swap to minimize disk seeks
925 * or have effective wear leveling (for SSDs). Also, a
926 * non-linear mapping is required to implement compressed
929 bio->bi_sector = get_backing_swap_page()
930 << SECTORS_PER_PAGE_SHIFT;
933 * In case backing swap is a file, find the right offset within
934 * the file corresponding to logical position 'index'. For block
935 * device, this is a nop.
937 bio->bi_sector = map_backing_swap_page(rzs, index)
938 << SECTORS_PER_PAGE_SHIFT;
947 * Check if request is within bounds and page aligned.
949 static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio)
952 (bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) ||
953 (bio->bi_sector & (SECTORS_PER_PAGE - 1)) ||
954 (bio->bi_vcnt != 1) ||
955 (bio->bi_size != PAGE_SIZE) ||
956 (bio->bi_io_vec[0].bv_offset != 0))) {
961 /* swap request is valid */
966 * Handler function for all ramzswap I/O requests.
968 static int ramzswap_make_request(struct request_queue *queue, struct bio *bio)
971 struct ramzswap *rzs = queue->queuedata;
973 if (unlikely(!rzs->init_done)) {
978 if (!valid_swap_request(rzs, bio)) {
979 rzs_stat64_inc(rzs, &rzs->stats.invalid_io);
984 switch (bio_data_dir(bio)) {
986 ret = ramzswap_read(rzs, bio);
990 ret = ramzswap_write(rzs, bio);
997 static void reset_device(struct ramzswap *rzs)
999 int is_backing_blkdev = 0;
1000 size_t index, num_pages;
1001 unsigned entries_per_page;
1002 unsigned long num_table_pages, entry = 0;
1004 /* Do not accept any new I/O request */
1007 if (rzs->backing_swap && !rzs->num_extents)
1008 is_backing_blkdev = 1;
1010 num_pages = rzs->disksize >> PAGE_SHIFT;
1012 /* Free various per-device buffers */
1013 kfree(rzs->compress_workmem);
1014 free_pages((unsigned long)rzs->compress_buffer, 1);
1016 rzs->compress_workmem = NULL;
1017 rzs->compress_buffer = NULL;
1019 /* Free all pages that are still in this ramzswap device */
1020 for (index = 0; index < num_pages; index++) {
1024 page = rzs->table[index].page;
1025 offset = rzs->table[index].offset;
1030 if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
1033 xv_free(rzs->mem_pool, page, offset);
1036 entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
1037 num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
1040 * Set page->mapping to NULL for every table page.
1041 * Otherwise, we will hit bad_page() during free.
1043 while (rzs->num_extents && num_table_pages--) {
1045 page = vmalloc_to_page(&rzs->table[entry]);
1046 page->mapping = NULL;
1047 entry += entries_per_page;
1052 xv_destroy_pool(rzs->mem_pool);
1053 rzs->mem_pool = NULL;
1055 /* Free all swap extent pages */
1056 while (!list_empty(&rzs->backing_swap_extent_list)) {
1058 struct list_head *entry;
1059 entry = rzs->backing_swap_extent_list.next;
1060 page = list_entry(entry, struct page, lru);
1064 INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
1065 rzs->num_extents = 0;
1067 /* Close backing swap device, if present */
1068 if (rzs->backing_swap) {
1069 if (is_backing_blkdev)
1070 bd_release(rzs->backing_swap);
1071 filp_close(rzs->swap_file, NULL);
1072 rzs->backing_swap = NULL;
1073 memset(rzs->backing_swap_name, 0, MAX_SWAP_NAME_LEN);
1077 memset(&rzs->stats, 0, sizeof(rzs->stats));
1083 static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
1088 union swap_header *swap_header;
1090 if (rzs->init_done) {
1091 pr_info("Device already initialized!\n");
1095 ret = setup_backing_swap(rzs);
1099 if (rzs->backing_swap)
1100 ramzswap_set_memlimit(rzs, totalram_pages << PAGE_SHIFT);
1102 ramzswap_set_disksize(rzs, totalram_pages << PAGE_SHIFT);
1104 rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
1105 if (!rzs->compress_workmem) {
1106 pr_err("Error allocating compressor working memory!\n");
1111 rzs->compress_buffer = (void *)__get_free_pages(__GFP_ZERO, 1);
1112 if (!rzs->compress_buffer) {
1113 pr_err("Error allocating compressor buffer space\n");
1118 num_pages = rzs->disksize >> PAGE_SHIFT;
1119 rzs->table = vmalloc(num_pages * sizeof(*rzs->table));
1121 pr_err("Error allocating ramzswap address table\n");
1122 /* To prevent accessing table entries during cleanup */
1127 memset(rzs->table, 0, num_pages * sizeof(*rzs->table));
1129 map_backing_swap_extents(rzs);
1131 page = alloc_page(__GFP_ZERO);
1133 pr_err("Error allocating swap header page\n");
1137 rzs->table[0].page = page;
1138 rzs_set_flag(rzs, 0, RZS_UNCOMPRESSED);
1140 swap_header = kmap(page);
1141 ret = setup_swap_header(rzs, swap_header);
1144 pr_err("Error setting swap header\n");
1148 set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);
1151 * We have ident mapping of sectors for ramzswap and
1152 * and the backing swap device. So, this queue flag
1153 * should be according to backing dev.
1155 if (!rzs->backing_swap ||
1156 blk_queue_nonrot(rzs->backing_swap->bd_disk->queue))
1157 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rzs->disk->queue);
1159 rzs->mem_pool = xv_create_pool();
1160 if (!rzs->mem_pool) {
1161 pr_err("Error creating memory pool\n");
1167 * Pages that compress to size greater than this are forwarded
1168 * to physical swap disk (if backing dev is provided)
1169 * TODO: make this configurable
1171 if (rzs->backing_swap)
1172 max_zpage_size = max_zpage_size_bdev;
1174 max_zpage_size = max_zpage_size_nobdev;
1175 pr_debug("Max compressed page size: %u bytes\n", max_zpage_size);
1179 pr_debug("Initialization done!\n");
1185 pr_err("Initialization failed: err=%d\n", ret);
1189 static int ramzswap_ioctl_reset_device(struct ramzswap *rzs)
1197 static int ramzswap_ioctl(struct block_device *bdev, fmode_t mode,
1198 unsigned int cmd, unsigned long arg)
1201 size_t disksize_kb, memlimit_kb;
1203 struct ramzswap *rzs = bdev->bd_disk->private_data;
1206 case RZSIO_SET_DISKSIZE_KB:
1207 if (rzs->init_done) {
1211 if (copy_from_user(&disksize_kb, (void *)arg,
1216 rzs->disksize = disksize_kb << 10;
1217 pr_info("Disk size set to %zu kB\n", disksize_kb);
1220 case RZSIO_SET_MEMLIMIT_KB:
1221 if (rzs->init_done) {
1222 /* TODO: allow changing memlimit */
1226 if (copy_from_user(&memlimit_kb, (void *)arg,
1231 rzs->memlimit = memlimit_kb << 10;
1232 pr_info("Memory limit set to %zu kB\n", memlimit_kb);
1235 case RZSIO_SET_BACKING_SWAP:
1236 if (rzs->init_done) {
1241 if (copy_from_user(&rzs->backing_swap_name, (void *)arg,
1246 rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
1247 pr_info("Backing swap set to %s\n", rzs->backing_swap_name);
1250 case RZSIO_GET_STATS:
1252 struct ramzswap_ioctl_stats *stats;
1253 if (!rzs->init_done) {
1257 stats = kzalloc(sizeof(*stats), GFP_KERNEL);
1262 ramzswap_ioctl_get_stats(rzs, stats);
1263 if (copy_to_user((void *)arg, stats, sizeof(*stats))) {
1272 ret = ramzswap_ioctl_init_device(rzs);
1276 /* Do not reset an active device! */
1277 if (bdev->bd_holders) {
1282 /* Make sure all pending I/O is finished */
1286 ret = ramzswap_ioctl_reset_device(rzs);
1290 pr_info("Invalid ioctl %u\n", cmd);
1298 static struct block_device_operations ramzswap_devops = {
1299 .ioctl = ramzswap_ioctl,
1300 .owner = THIS_MODULE,
1303 static int create_device(struct ramzswap *rzs, int device_id)
1307 mutex_init(&rzs->lock);
1308 spin_lock_init(&rzs->stat64_lock);
1309 INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
1311 rzs->queue = blk_alloc_queue(GFP_KERNEL);
1313 pr_err("Error allocating disk queue for device %d\n",
1319 blk_queue_make_request(rzs->queue, ramzswap_make_request);
1320 rzs->queue->queuedata = rzs;
1322 /* gendisk structure */
1323 rzs->disk = alloc_disk(1);
1325 blk_cleanup_queue(rzs->queue);
1326 pr_warning("Error allocating disk structure for device %d\n",
1332 rzs->disk->major = ramzswap_major;
1333 rzs->disk->first_minor = device_id;
1334 rzs->disk->fops = &ramzswap_devops;
1335 rzs->disk->queue = rzs->queue;
1336 rzs->disk->private_data = rzs;
1337 snprintf(rzs->disk->disk_name, 16, "ramzswap%d", device_id);
1340 * Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl
1341 * or set equal to backing swap device (if provided)
1343 set_capacity(rzs->disk, 0);
1345 blk_queue_physical_block_size(rzs->disk->queue, PAGE_SIZE);
1346 blk_queue_logical_block_size(rzs->disk->queue, PAGE_SIZE);
1348 add_disk(rzs->disk);
1356 static void destroy_device(struct ramzswap *rzs)
1359 del_gendisk(rzs->disk);
1360 put_disk(rzs->disk);
1364 blk_cleanup_queue(rzs->queue);
1367 static int __init ramzswap_init(void)
1371 if (num_devices > max_num_devices) {
1372 pr_warning("Invalid value for num_devices: %u\n",
1378 ramzswap_major = register_blkdev(0, "ramzswap");
1379 if (ramzswap_major <= 0) {
1380 pr_warning("Unable to get major number\n");
1386 pr_info("num_devices not specified. Using default: 1\n");
1390 /* Allocate the device array and initialize each one */
1391 pr_info("Creating %u devices ...\n", num_devices);
1392 devices = kzalloc(num_devices * sizeof(struct ramzswap), GFP_KERNEL);
1398 for (dev_id = 0; dev_id < num_devices; dev_id++) {
1399 ret = create_device(&devices[dev_id], dev_id);
1408 destroy_device(&devices[--dev_id]);
1410 unregister_blkdev(ramzswap_major, "ramzswap");
1415 static void __exit ramzswap_exit(void)
1418 struct ramzswap *rzs;
1420 for (i = 0; i < num_devices; i++) {
1423 destroy_device(rzs);
1428 unregister_blkdev(ramzswap_major, "ramzswap");
1431 pr_debug("Cleanup done!\n");
1434 module_param(num_devices, uint, 0);
1435 MODULE_PARM_DESC(num_devices, "Number of ramzswap devices");
1437 module_init(ramzswap_init);
1438 module_exit(ramzswap_exit);
1440 MODULE_LICENSE("Dual BSD/GPL");
1441 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
1442 MODULE_DESCRIPTION("Compressed RAM Based Swap Device");