karo-tx-linux.git / mm / swap.c @ 7434e3619c14108181dab183b8c532d6bddecec0
1 /*
2  *  linux/mm/swap.c
3  *
4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
5  */
6
7 /*
8  * This file contains the default values for the operation of the
9  * Linux VM subsystem. Fine-tuning documentation can be found in
10  * Documentation/sysctl/vm.txt.
11  * Started 18.12.91
12  * Swap aging added 23.2.95, Stephen Tweedie.
13  * Buffermem limits added 12.3.98, Rik van Riel.
14  */
15
16 #include <linux/mm.h>
17 #include <linux/sched.h>
18 #include <linux/kernel_stat.h>
19 #include <linux/swap.h>
20 #include <linux/mman.h>
21 #include <linux/pagemap.h>
22 #include <linux/pagevec.h>
23 #include <linux/init.h>
24 #include <linux/export.h>
25 #include <linux/mm_inline.h>
26 #include <linux/percpu_counter.h>
27 #include <linux/percpu.h>
28 #include <linux/cpu.h>
29 #include <linux/notifier.h>
30 #include <linux/backing-dev.h>
31 #include <linux/memcontrol.h>
32 #include <linux/gfp.h>
33 #include <linux/uio.h>
34
35 #include "internal.h"
36
37 #define CREATE_TRACE_POINTS
38 #include <trace/events/pagemap.h>
39
40 /* How many pages do we try to swap or page in/out together? */
41 int page_cluster;
42
43 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
44 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
45 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
46
47 /*
48  * This path almost never happens for VM activity - pages are normally
49  * freed via pagevecs.  But it gets used by networking.
50  */
51 static void __page_cache_release(struct page *page)
52 {
53         if (PageLRU(page)) {
54                 struct zone *zone = page_zone(page);
55                 struct lruvec *lruvec;
56                 unsigned long flags;
57
58                 spin_lock_irqsave(&zone->lru_lock, flags);
59                 lruvec = mem_cgroup_page_lruvec(page, zone);
60                 VM_BUG_ON(!PageLRU(page));
61                 __ClearPageLRU(page);
62                 del_page_from_lru_list(page, lruvec, page_off_lru(page));
63                 spin_unlock_irqrestore(&zone->lru_lock, flags);
64         }
65 }
66
67 static void __put_single_page(struct page *page)
68 {
69         __page_cache_release(page);
70         free_hot_cold_page(page, 0);
71 }
72
73 static void __put_compound_page(struct page *page)
74 {
75         compound_page_dtor *dtor;
76
77         __page_cache_release(page);
78         dtor = get_compound_page_dtor(page);
79         (*dtor)(page);
80 }
81
82 static void put_compound_page(struct page *page)
83 {
84         if (unlikely(PageTail(page))) {
85                 /* __split_huge_page_refcount can run under us */
86                 struct page *page_head = compound_trans_head(page);
87
88                 /*
89                  * THP can not break up slab pages so avoid taking
90                  * compound_lock() and skip the tail page refcounting
91                  * (in _mapcount) too. Slab performs non-atomic bit
92                  * ops on page->flags for better performance. In
93                  * particular slab_unlock() in slub used to be a hot
94                  * path. It is still hot on arches that do not support
95                  * this_cpu_cmpxchg_double().
96                  *
97                  * If "page" is part of a slab or hugetlbfs page it
98                  * cannot be split and the head page cannot change
99                  * from under us. And if "page" is part of a THP page
100                  * under splitting, if the head page pointed by the
101                  * THP tail isn't a THP head anymore, we'll find
102                  * PageTail clear after smp_rmb() and we'll treat it
103                  * as a single page.
104                  */
105                 if (!__compound_tail_refcounted(page_head)) {
106                         /*
107                          * If "page" is a THP tail, we must read the tail page
108                          * flags after the head page flags. The
109                          * split_huge_page side enforces write memory
110                          * barriers between clearing PageTail and freeing
111                          * or reallocating the head page.
112                          */
113                         smp_rmb();
114                         if (likely(PageTail(page))) {
115                                 /*
116                                  * __split_huge_page_refcount
117                                  * cannot race here.
118                                  */
119                                 VM_BUG_ON(!PageHead(page_head));
120                                 VM_BUG_ON(page_mapcount(page) != 0);
121                                 if (put_page_testzero(page_head)) {
122                                         /*
123                                          * If this is the tail of a
124                                          * slab compound page, the
125                                          * tail pin must not be the
126                                          * last reference held on the
127                                          * page, because the PG_slab
128                                          * cannot be cleared before
129                                          * all tail pins (which skip
130                                          * the _mapcount tail
131                                          * refcounting) have been
132                                          * released. For hugetlbfs the
133                                          * tail pin may be the last
134                                          * reference on the page
135                                          * instead, because
136                                          * PageHeadHuge will not go
137                                          * away until the compound
138                                          * page enters the buddy
139                                          * allocator.
140                                          */
141                                         VM_BUG_ON(PageSlab(page_head));
142                                         __put_compound_page(page_head);
143                                 }
144                                 return;
145                         } else
146                                 /*
147                                  * __split_huge_page_refcount
148                                  * ran before us; "page" was a
149                                  * THP tail. The split
150                                  * page_head has been freed
151                                  * and reallocated as slab or
152                                  * hugetlbfs page of smaller
153                                  * order (only possible if
154                                  * reallocated as slab on
155                                  * x86).
156                                  */
157                                 goto out_put_single;
158                 }
159
160                 if (likely(page != page_head &&
161                            get_page_unless_zero(page_head))) {
162                         unsigned long flags;
163
164                         /*
165                          * page_head wasn't a dangling pointer but it
166                          * may not be a head page anymore by the time
167                          * we obtain the lock. That is ok as long as it
168                          * can't be freed from under us.
169                          */
170                         flags = compound_lock_irqsave(page_head);
171                         if (unlikely(!PageTail(page))) {
172                                 /* __split_huge_page_refcount ran before us */
173                                 compound_unlock_irqrestore(page_head, flags);
174                                 if (put_page_testzero(page_head)) {
175                                         /*
176                                          * The head page may have been
177                                          * freed and reallocated as a
178                                          * compound page of smaller
179                                          * order and then freed again.
180                                          * All we know is that it
181                                          * cannot have become: a THP
182                                          * page, a compound page of
183                                          * higher order, a tail page.
184                                          * That is because we still
185                                          * hold the refcount of the
186                                          * split THP tail and
187                                          * page_head was the THP head
188                                          * before the split.
189                                          */
190                                         if (PageHead(page_head))
191                                                 __put_compound_page(page_head);
192                                         else
193                                                 __put_single_page(page_head);
194                                 }
195 out_put_single:
196                                 if (put_page_testzero(page))
197                                         __put_single_page(page);
198                                 return;
199                         }
200                         VM_BUG_ON(page_head != page->first_page);
201                         /*
202                          * We can release the refcount taken by
203                          * get_page_unless_zero() now that
204                          * __split_huge_page_refcount() is blocked on
205                          * the compound_lock.
206                          */
207                         if (put_page_testzero(page_head))
208                                 VM_BUG_ON(1);
209                         /* __split_huge_page_refcount will wait now */
210                         VM_BUG_ON(page_mapcount(page) <= 0);
211                         atomic_dec(&page->_mapcount);
212                         VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
213                         VM_BUG_ON(atomic_read(&page->_count) != 0);
214                         compound_unlock_irqrestore(page_head, flags);
215
216                         if (put_page_testzero(page_head)) {
217                                 if (PageHead(page_head))
218                                         __put_compound_page(page_head);
219                                 else
220                                         __put_single_page(page_head);
221                         }
222                 } else {
223                         /* page_head is a dangling pointer */
224                         VM_BUG_ON(PageTail(page));
225                         goto out_put_single;
226                 }
227         } else if (put_page_testzero(page)) {
228                 if (PageHead(page))
229                         __put_compound_page(page);
230                 else
231                         __put_single_page(page);
232         }
233 }
234
235 void put_page(struct page *page)
236 {
237         if (unlikely(PageCompound(page)))
238                 put_compound_page(page);
239         else if (put_page_testzero(page))
240                 __put_single_page(page);
241 }
242 EXPORT_SYMBOL(put_page);
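
/*
 * A minimal usage sketch of the refcounting pair that ends up here.  The
 * helper below is hypothetical; only get_page() and put_page() are real
 * kernel APIs, and the caller is assumed to already hold a reference that
 * keeps "page" valid across the get_page() call.
 */
static void sketch_hold_page_briefly(struct page *page)
{
        get_page(page);         /* take an extra reference so the page cannot be freed */
        /* ... safely look at the page contents or flags here ... */
        put_page(page);         /* drop it; if this was the last reference, the page is freed */
}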
243
244 /*
245  * This function is exported but must not be called by anything other
246  * than get_page(). It implements the slow path of get_page().
247  */
248 bool __get_page_tail(struct page *page)
249 {
250         /*
251          * This takes care of get_page() if run on a tail page
252          * returned by one of the get_user_pages/follow_page variants.
253          * get_user_pages/follow_page itself doesn't need the compound
254          * lock because it runs __get_page_tail_foll() under the
255          * proper PT lock that already serializes against
256          * split_huge_page().
257          */
258         unsigned long flags;
259         bool got;
260         struct page *page_head = compound_trans_head(page);
261
262         /* See the comment in put_compound_page(). */
263         if (!__compound_tail_refcounted(page_head)) {
264                 smp_rmb();
265                 if (likely(PageTail(page))) {
266                         /*
267                          * This is a hugetlbfs page or a slab
268                          * page. __split_huge_page_refcount
269                          * cannot race here.
270                          */
271                         VM_BUG_ON(!PageHead(page_head));
272                         __get_page_tail_foll(page, true);
273                         return true;
274                 } else {
275                         /*
276                          * __split_huge_page_refcount ran
277                          * before us, "page" was a THP
278                          * tail. The split page_head has been
279                          * freed and reallocated as slab or
280                          * hugetlbfs page of smaller order
281                          * (only possible if reallocated as
282                          * slab on x86).
283                          */
284                         return false;
285                 }
286         }
287
288         got = false;
289         if (likely(page != page_head && get_page_unless_zero(page_head))) {
290                 /*
291                  * page_head wasn't a dangling pointer but it
292                  * may not be a head page anymore by the time
293                  * we obtain the lock. That is ok as long as it
294                  * can't be freed from under us.
295                  */
296                 flags = compound_lock_irqsave(page_head);
297                 /* here __split_huge_page_refcount won't run anymore */
298                 if (likely(PageTail(page))) {
299                         __get_page_tail_foll(page, false);
300                         got = true;
301                 }
302                 compound_unlock_irqrestore(page_head, flags);
303                 if (unlikely(!got))
304                         put_page(page_head);
305         }
306         return got;
307 }
308 EXPORT_SYMBOL(__get_page_tail);
309
310 /**
311  * put_pages_list() - release a list of pages
312  * @pages: list of pages threaded on page->lru
313  *
314  * Release a list of pages which are strung together on page.lru.  Currently
315  * used by read_cache_pages() and related error recovery code.
316  */
317 void put_pages_list(struct list_head *pages)
318 {
319         while (!list_empty(pages)) {
320                 struct page *victim;
321
322                 victim = list_entry(pages->prev, struct page, lru);
323                 list_del(&victim->lru);
324                 page_cache_release(victim);
325         }
326 }
327 EXPORT_SYMBOL(put_pages_list);
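
/*
 * A minimal sketch of the pattern put_pages_list() serves: pages strung
 * together on page->lru and released in one call.  The helper name is
 * hypothetical; LIST_HEAD(), alloc_page(), list_add() and put_pages_list()
 * are real APIs.
 */
static void sketch_build_and_release_list(int nr)
{
        LIST_HEAD(pages);
        int i;

        for (i = 0; i < nr; i++) {
                struct page *page = alloc_page(GFP_KERNEL);

                if (!page)
                        break;
                list_add(&page->lru, &pages);   /* thread the page on its lru list head */
        }
        put_pages_list(&pages);                 /* drops the reference on every page */
}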
328
329 /*
330  * get_kernel_pages() - pin kernel pages in memory
331  * @kiov:       An array of struct kvec structures
332  * @nr_segs:    number of segments to pin
333  * @write:      pinning for read/write, currently ignored
334  * @pages:      array that receives pointers to the pages pinned.
335  *              Should be at least nr_segs long.
336  *
337  * Returns number of pages pinned. This may be fewer than the number
338  * requested; the walk stops at the first kvec whose iov_len is not
339  * PAGE_SIZE. If nr_segs is 0 or negative, returns 0. Each page returned
340  * must be released with a put_page() call when it is finished with.
341  */
342 int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
343                 struct page **pages)
344 {
345         int seg;
346
347         for (seg = 0; seg < nr_segs; seg++) {
348                 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
349                         return seg;
350
351                 pages[seg] = kmap_to_page(kiov[seg].iov_base);
352                 page_cache_get(pages[seg]);
353         }
354
355         return seg;
356 }
357 EXPORT_SYMBOL_GPL(get_kernel_pages);
358
359 /*
360  * get_kernel_page() - pin a kernel page in memory
361  * @start:      starting kernel address
362  * @write:      pinning for read/write, currently ignored
363  * @pages:      array that receives pointer to the page pinned.
364  *              Must have room for one struct page pointer.
365  *
366  * Returns 1 if the page is pinned and 0 if it was not. The page
367  * returned must be released with a put_page() call when it is
368  * finished with.
369  */
370 int get_kernel_page(unsigned long start, int write, struct page **pages)
371 {
372         const struct kvec kiov = {
373                 .iov_base = (void *)start,
374                 .iov_len = PAGE_SIZE
375         };
376
377         return get_kernel_pages(&kiov, 1, write, pages);
378 }
379 EXPORT_SYMBOL_GPL(get_kernel_page);
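
/*
 * A minimal sketch, assuming a page-sized, page-aligned kernel buffer: pin
 * the single page backing it with get_kernel_page() and drop the pin with
 * put_page().  The wrapper is hypothetical; __get_free_page(),
 * get_kernel_page(), put_page() and free_page() are real APIs.  Note that
 * get_kernel_pages() warns and stops at any kvec whose iov_len is not
 * exactly PAGE_SIZE.
 */
static int sketch_pin_kernel_buffer(void)
{
        unsigned long addr = __get_free_page(GFP_KERNEL);
        struct page *pages[1];

        if (!addr)
                return -ENOMEM;

        if (get_kernel_page(addr, 0, pages) == 1)       /* takes a page reference */
                put_page(pages[0]);                     /* release the pin when done */

        free_page(addr);
        return 0;
}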
380
381 static void pagevec_lru_move_fn(struct pagevec *pvec,
382         void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
383         void *arg)
384 {
385         int i;
386         struct zone *zone = NULL;
387         struct lruvec *lruvec;
388         unsigned long flags = 0;
389
390         for (i = 0; i < pagevec_count(pvec); i++) {
391                 struct page *page = pvec->pages[i];
392                 struct zone *pagezone = page_zone(page);
393
394                 if (pagezone != zone) {
395                         if (zone)
396                                 spin_unlock_irqrestore(&zone->lru_lock, flags);
397                         zone = pagezone;
398                         spin_lock_irqsave(&zone->lru_lock, flags);
399                 }
400
401                 lruvec = mem_cgroup_page_lruvec(page, zone);
402                 (*move_fn)(page, lruvec, arg);
403         }
404         if (zone)
405                 spin_unlock_irqrestore(&zone->lru_lock, flags);
406         release_pages(pvec->pages, pvec->nr, pvec->cold);
407         pagevec_reinit(pvec);
408 }
409
410 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
411                                  void *arg)
412 {
413         int *pgmoved = arg;
414
415         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
416                 enum lru_list lru = page_lru_base_type(page);
417                 list_move_tail(&page->lru, &lruvec->lists[lru]);
418                 (*pgmoved)++;
419         }
420 }
421
422 /*
423  * pagevec_move_tail() must be called with IRQ disabled.
424  * Otherwise this may cause nasty races.
425  */
426 static void pagevec_move_tail(struct pagevec *pvec)
427 {
428         int pgmoved = 0;
429
430         pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
431         __count_vm_events(PGROTATED, pgmoved);
432 }
433
434 /*
435  * Writeback is about to end against a page which has been marked for immediate
436  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
437  * inactive list.
438  */
439 void rotate_reclaimable_page(struct page *page)
440 {
441         if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
442             !PageUnevictable(page) && PageLRU(page)) {
443                 struct pagevec *pvec;
444                 unsigned long flags;
445
446                 page_cache_get(page);
447                 local_irq_save(flags);
448                 pvec = &__get_cpu_var(lru_rotate_pvecs);
449                 if (!pagevec_add(pvec, page))
450                         pagevec_move_tail(pvec);
451                 local_irq_restore(flags);
452         }
453 }
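
/*
 * For context, this is roughly how the writeback-completion path reaches
 * rotate_reclaimable_page().  It is a reduced sketch of end_page_writeback()
 * in mm/filemap.c, not a drop-in replacement for it.
 */
static void sketch_end_writeback(struct page *page)
{
        if (TestClearPageReclaim(page))         /* page was marked for immediate reclaim */
                rotate_reclaimable_page(page);  /* queue it for the tail of the inactive list */

        /* the real function also clears PG_writeback and wakes any waiters */
}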
454
455 static void update_page_reclaim_stat(struct lruvec *lruvec,
456                                      int file, int rotated)
457 {
458         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
459
460         reclaim_stat->recent_scanned[file]++;
461         if (rotated)
462                 reclaim_stat->recent_rotated[file]++;
463 }
464
465 static void __activate_page(struct page *page, struct lruvec *lruvec,
466                             void *arg)
467 {
468         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
469                 int file = page_is_file_cache(page);
470                 int lru = page_lru_base_type(page);
471
472                 del_page_from_lru_list(page, lruvec, lru);
473                 SetPageActive(page);
474                 lru += LRU_ACTIVE;
475                 add_page_to_lru_list(page, lruvec, lru);
476                 trace_mm_lru_activate(page, page_to_pfn(page));
477
478                 __count_vm_event(PGACTIVATE);
479                 update_page_reclaim_stat(lruvec, file, 1);
480         }
481 }
482
483 #ifdef CONFIG_SMP
484 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
485
486 static void activate_page_drain(int cpu)
487 {
488         struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
489
490         if (pagevec_count(pvec))
491                 pagevec_lru_move_fn(pvec, __activate_page, NULL);
492 }
493
494 static bool need_activate_page_drain(int cpu)
495 {
496         return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
497 }
498
499 void activate_page(struct page *page)
500 {
501         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
502                 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
503
504                 page_cache_get(page);
505                 if (!pagevec_add(pvec, page))
506                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
507                 put_cpu_var(activate_page_pvecs);
508         }
509 }
510
511 #else
512 static inline void activate_page_drain(int cpu)
513 {
514 }
515
516 static bool need_activate_page_drain(int cpu)
517 {
518         return false;
519 }
520
521 void activate_page(struct page *page)
522 {
523         struct zone *zone = page_zone(page);
524
525         spin_lock_irq(&zone->lru_lock);
526         __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
527         spin_unlock_irq(&zone->lru_lock);
528 }
529 #endif
530
531 static void __lru_cache_activate_page(struct page *page)
532 {
533         struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
534         int i;
535
536         /*
537          * Search backwards on the optimistic assumption that the page being
538          * activated has just been added to this pagevec. Note that only
539          * the local pagevec is examined as a !PageLRU page could be in the
540          * process of being released, reclaimed, migrated or on a remote
541          * pagevec that is currently being drained. Furthermore, marking
542          * a remote pagevec's page PageActive potentially hits a race where
543          * a page is marked PageActive just after it is added to the inactive
544          * list causing accounting errors and BUG_ON checks to trigger.
545          */
546         for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
547                 struct page *pagevec_page = pvec->pages[i];
548
549                 if (pagevec_page == page) {
550                         SetPageActive(page);
551                         break;
552                 }
553         }
554
555         put_cpu_var(lru_add_pvec);
556 }
557
558 /*
559  * Mark a page as having seen activity.
560  *
561  * inactive,unreferenced        ->      inactive,referenced
562  * inactive,referenced          ->      active,unreferenced
563  * active,unreferenced          ->      active,referenced
564  */
565 void mark_page_accessed(struct page *page)
566 {
567         if (!PageActive(page) && !PageUnevictable(page) &&
568                         PageReferenced(page)) {
569
570                 /*
571                  * If the page is on the LRU, queue it for activation via
572                  * activate_page_pvecs. Otherwise, assume the page is on a
573                  * pagevec, mark it active and it'll be moved to the active
574                  * LRU on the next drain.
575                  */
576                 if (PageLRU(page))
577                         activate_page(page);
578                 else
579                         __lru_cache_activate_page(page);
580                 ClearPageReferenced(page);
581         } else if (!PageReferenced(page)) {
582                 SetPageReferenced(page);
583         }
584 }
585 EXPORT_SYMBOL(mark_page_accessed);
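
/*
 * A minimal sketch of the two-touch promotion described in the state table
 * above.  The helper is hypothetical; only mark_page_accessed() is a real
 * API, and the second call may just set PageActive on a pagevec-resident
 * page until the pagevec is drained.
 */
static void sketch_touch_twice(struct page *page)
{
        mark_page_accessed(page);       /* inactive,unreferenced -> inactive,referenced */
        mark_page_accessed(page);       /* inactive,referenced   -> active,unreferenced */
}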
586
587 /*
588  * Queue the page for addition to the LRU via pagevec. The decision on whether
589  * to add the page to the [in]active [file|anon] list is deferred until the
590  * pagevec is drained. This gives the caller of __lru_cache_add() a
591  * chance to have the page added to the active list using mark_page_accessed().
592  */
593 void __lru_cache_add(struct page *page)
594 {
595         struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
596
597         page_cache_get(page);
598         if (!pagevec_space(pvec))
599                 __pagevec_lru_add(pvec);
600         pagevec_add(pvec, page);
601         put_cpu_var(lru_add_pvec);
602 }
603 EXPORT_SYMBOL(__lru_cache_add);
604
605 /**
606  * lru_cache_add - add a page to the LRU list
607  * @page: the page to be added to the LRU.
608  */
609 void lru_cache_add(struct page *page)
610 {
611         VM_BUG_ON(PageActive(page) && PageUnevictable(page));
612         VM_BUG_ON(PageLRU(page));
613         __lru_cache_add(page);
614 }
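
/*
 * A sketch of the common route onto the LRU for file pages, mirroring
 * callers of add_to_page_cache_lru() in mm/filemap.c: the page is inserted
 * into the page cache and ends up in __lru_cache_add() via the pagevec
 * above.  "mapping" and "index" belong to the hypothetical caller.
 */
static struct page *sketch_add_file_page(struct address_space *mapping,
                                         pgoff_t index)
{
        struct page *page = page_cache_alloc_cold(mapping);

        if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
                page_cache_release(page);       /* insertion failed, drop our reference */
                page = NULL;
        }
        return page;    /* on success the page is in the page cache and queued for the LRU */
}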
615
616 /**
617  * add_page_to_unevictable_list - add a page to the unevictable list
618  * @page:  the page to be added to the unevictable list
619  *
620  * Add page directly to its zone's unevictable list.  To avoid races with
621  * tasks that might be making the page evictable, through eg. munlock,
622  * munmap or exit, while it's not on the lru, we want to add the page
623  * while it's locked or otherwise "invisible" to other tasks.  This is
624  * difficult to do when using the pagevec cache, so bypass that.
625  */
626 void add_page_to_unevictable_list(struct page *page)
627 {
628         struct zone *zone = page_zone(page);
629         struct lruvec *lruvec;
630
631         spin_lock_irq(&zone->lru_lock);
632         lruvec = mem_cgroup_page_lruvec(page, zone);
633         ClearPageActive(page);
634         SetPageUnevictable(page);
635         SetPageLRU(page);
636         add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
637         spin_unlock_irq(&zone->lru_lock);
638 }
639
640 /*
641  * If the page can not be invalidated, it is moved to the
642  * inactive list to speed up its reclaim.  It is moved to the
643  * head of the list, rather than the tail, to give the flusher
644  * threads some time to write it out, as this is much more
645  * effective than the single-page writeout from reclaim.
646  *
647  * If the page isn't mapped and is dirty or under writeback, the page
648  * can be reclaimed as soon as possible by setting PG_reclaim.
649  *
650  * 1. active, mapped page -> none
651  * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
652  * 3. inactive, mapped page -> none
653  * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
654  * 5. inactive, clean -> inactive, tail
655  * 6. Others -> none
656  *
657  * In case 4 the page is moved to the head of the inactive list because
658  * the VM expects the flusher threads to write it out, as this is much
659  * more effective than the single-page writeout from reclaim.
660  */
661 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
662                               void *arg)
663 {
664         int lru, file;
665         bool active;
666
667         if (!PageLRU(page))
668                 return;
669
670         if (PageUnevictable(page))
671                 return;
672
673         /* Some processes are using the page */
674         if (page_mapped(page))
675                 return;
676
677         active = PageActive(page);
678         file = page_is_file_cache(page);
679         lru = page_lru_base_type(page);
680
681         del_page_from_lru_list(page, lruvec, lru + active);
682         ClearPageActive(page);
683         ClearPageReferenced(page);
684         add_page_to_lru_list(page, lruvec, lru);
685
686         if (PageWriteback(page) || PageDirty(page)) {
687                 /*
688                  * PG_reclaim could race with end_page_writeback(),
689                  * which can confuse readahead.  But the race window
690                  * is _really_ small and it's a non-critical problem.
691                  */
692                 SetPageReclaim(page);
693         } else {
694                 /*
695                  * The page's writeback ended while it sat in the pagevec,
696                  * so move the page to the tail of the inactive list.
697                  */
698                 list_move_tail(&page->lru, &lruvec->lists[lru]);
699                 __count_vm_event(PGROTATED);
700         }
701
702         if (active)
703                 __count_vm_event(PGDEACTIVATE);
704         update_page_reclaim_stat(lruvec, file, 0);
705 }
706
707 /*
708  * Drain pages out of the cpu's pagevecs.
709  * Either "cpu" is the current CPU, and preemption has already been
710  * disabled; or "cpu" is being hot-unplugged, and is already dead.
711  */
712 void lru_add_drain_cpu(int cpu)
713 {
714         struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
715
716         if (pagevec_count(pvec))
717                 __pagevec_lru_add(pvec);
718
719         pvec = &per_cpu(lru_rotate_pvecs, cpu);
720         if (pagevec_count(pvec)) {
721                 unsigned long flags;
722
723                 /* No harm done if a racing interrupt already did this */
724                 local_irq_save(flags);
725                 pagevec_move_tail(pvec);
726                 local_irq_restore(flags);
727         }
728
729         pvec = &per_cpu(lru_deactivate_pvecs, cpu);
730         if (pagevec_count(pvec))
731                 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
732
733         activate_page_drain(cpu);
734 }
735
736 /**
737  * deactivate_page - forcefully deactivate a page
738  * @page: page to deactivate
739  *
740  * This function hints the VM that @page is a good reclaim candidate,
741  * for example if its invalidation fails due to the page being dirty
742  * or under writeback.
743  */
744 void deactivate_page(struct page *page)
745 {
746         /*
747          * In a workload with many unevictable pages (such as one using mprotect),
748          * deactivating unevictable pages to accelerate reclaim is pointless.
749          */
750         if (PageUnevictable(page))
751                 return;
752
753         if (likely(get_page_unless_zero(page))) {
754                 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
755
756                 if (!pagevec_add(pvec, page))
757                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
758                 put_cpu_var(lru_deactivate_pvecs);
759         }
760 }
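
/*
 * A sketch of the intended caller pattern, mirroring invalidate_mapping_pages()
 * in mm/truncate.c: when a page cannot be invalidated (dirty, under writeback,
 * mapped), it is handed to deactivate_page() as a reclaim hint instead.  The
 * hypothetical caller is assumed to hold a reference on the page.
 */
static void sketch_try_invalidate(struct page *page)
{
        if (trylock_page(page)) {
                int ret = invalidate_inode_page(page);  /* 1 on success, 0 on failure */

                unlock_page(page);
                if (!ret)
                        deactivate_page(page);  /* could not drop it, so age it faster */
        }
}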
761
762 void lru_add_drain(void)
763 {
764         lru_add_drain_cpu(get_cpu());
765         put_cpu();
766 }
767
768 static void lru_add_drain_per_cpu(struct work_struct *dummy)
769 {
770         lru_add_drain();
771 }
772
773 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
774
775 void lru_add_drain_all(void)
776 {
777         static DEFINE_MUTEX(lock);
778         static struct cpumask has_work;
779         int cpu;
780
781         mutex_lock(&lock);
782         get_online_cpus();
783         cpumask_clear(&has_work);
784
785         for_each_online_cpu(cpu) {
786                 struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
787
788                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
789                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
790                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
791                     need_activate_page_drain(cpu)) {
792                         INIT_WORK(work, lru_add_drain_per_cpu);
793                         schedule_work_on(cpu, work);
794                         cpumask_set_cpu(cpu, &has_work);
795                 }
796         }
797
798         for_each_cpu(cpu, &has_work)
799                 flush_work(&per_cpu(lru_add_drain_work, cpu));
800
801         put_online_cpus();
802         mutex_unlock(&lock);
803 }
804
805 /*
806  * Batched page_cache_release().  Decrement the reference count on all the
807  * passed pages.  If it fell to zero then remove the page from the LRU and
808  * free it.
809  *
810  * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
811  * for the remainder of the operation.
812  *
813  * The locking in this function is against shrink_inactive_list(): we recheck
814  * the page count inside the lock to see whether shrink_inactive_list()
815  * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
816  * will free it.
817  */
818 void release_pages(struct page **pages, int nr, int cold)
819 {
820         int i;
821         LIST_HEAD(pages_to_free);
822         struct zone *zone = NULL;
823         struct lruvec *lruvec;
824         unsigned long uninitialized_var(flags);
825
826         for (i = 0; i < nr; i++) {
827                 struct page *page = pages[i];
828
829                 if (unlikely(PageCompound(page))) {
830                         if (zone) {
831                                 spin_unlock_irqrestore(&zone->lru_lock, flags);
832                                 zone = NULL;
833                         }
834                         put_compound_page(page);
835                         continue;
836                 }
837
838                 if (!put_page_testzero(page))
839                         continue;
840
841                 if (PageLRU(page)) {
842                         struct zone *pagezone = page_zone(page);
843
844                         if (pagezone != zone) {
845                                 if (zone)
846                                         spin_unlock_irqrestore(&zone->lru_lock,
847                                                                         flags);
848                                 zone = pagezone;
849                                 spin_lock_irqsave(&zone->lru_lock, flags);
850                         }
851
852                         lruvec = mem_cgroup_page_lruvec(page, zone);
853                         VM_BUG_ON(!PageLRU(page));
854                         __ClearPageLRU(page);
855                         del_page_from_lru_list(page, lruvec, page_off_lru(page));
856                 }
857
858                 /* Clear Active bit in case of parallel mark_page_accessed */
859                 ClearPageActive(page);
860
861                 list_add(&page->lru, &pages_to_free);
862         }
863         if (zone)
864                 spin_unlock_irqrestore(&zone->lru_lock, flags);
865
866         free_hot_cold_page_list(&pages_to_free, cold);
867 }
868 EXPORT_SYMBOL(release_pages);
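
/*
 * A minimal sketch: release_pages() replaces a put_page() loop with one
 * batched call that takes zone->lru_lock at most once per run of pages from
 * the same zone, e.g. for an array filled by get_user_pages().  The wrapper
 * is hypothetical.
 */
static void sketch_drop_page_array(struct page **pages, int nr)
{
        /* equivalent to: for (i = 0; i < nr; i++) put_page(pages[i]); */
        release_pages(pages, nr, 0);    /* 0 == treat the pages as cache-hot */
}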
869
870 /*
871  * The pages which we're about to release may be in the deferred lru-addition
872  * queues.  That would prevent them from really being freed right now.  That's
873  * OK from a correctness point of view but is inefficient - those pages may be
874  * cache-warm and we want to give them back to the page allocator ASAP.
875  *
876  * So __pagevec_release() will drain those queues here.
877  * __pagevec_lru_add() calls release_pages() directly to avoid mutual
878  * recursion.
879  */
880 void __pagevec_release(struct pagevec *pvec)
881 {
882         lru_add_drain();
883         release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
884         pagevec_reinit(pvec);
885 }
886 EXPORT_SYMBOL(__pagevec_release);
887
888 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
889 /* used by __split_huge_page_refcount() */
890 void lru_add_page_tail(struct page *page, struct page *page_tail,
891                        struct lruvec *lruvec, struct list_head *list)
892 {
893         const int file = 0;
894
895         VM_BUG_ON(!PageHead(page));
896         VM_BUG_ON(PageCompound(page_tail));
897         VM_BUG_ON(PageLRU(page_tail));
898         VM_BUG_ON(NR_CPUS != 1 &&
899                   !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
900
901         if (!list)
902                 SetPageLRU(page_tail);
903
904         if (likely(PageLRU(page)))
905                 list_add_tail(&page_tail->lru, &page->lru);
906         else if (list) {
907                 /* page reclaim is reclaiming a huge page */
908                 get_page(page_tail);
909                 list_add_tail(&page_tail->lru, list);
910         } else {
911                 struct list_head *list_head;
912                 /*
913                  * Head page has not yet been counted, as an hpage,
914                  * so we must account for each subpage individually.
915                  *
916                  * Use the standard add function to put page_tail on the list,
917                  * but then correct its position so they all end up in order.
918                  */
919                 add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
920                 list_head = page_tail->lru.prev;
921                 list_move_tail(&page_tail->lru, list_head);
922         }
923
924         if (!PageUnevictable(page))
925                 update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
926 }
927 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
928
929 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
930                                  void *arg)
931 {
932         int file = page_is_file_cache(page);
933         int active = PageActive(page);
934         enum lru_list lru = page_lru(page);
935
936         VM_BUG_ON(PageLRU(page));
937
938         SetPageLRU(page);
939         add_page_to_lru_list(page, lruvec, lru);
940         update_page_reclaim_stat(lruvec, file, active);
941         trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
942 }
943
944 /*
945  * Add the passed pages to the LRU, then drop the caller's refcount
946  * on them.  Reinitialises the caller's pagevec.
947  */
948 void __pagevec_lru_add(struct pagevec *pvec)
949 {
950         pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
951 }
952 EXPORT_SYMBOL(__pagevec_lru_add);
953
954 /**
955  * pagevec_lookup - gang pagecache lookup
956  * @pvec:       Where the resulting pages are placed
957  * @mapping:    The address_space to search
958  * @start:      The starting page index
959  * @nr_pages:   The maximum number of pages
960  *
961  * pagevec_lookup() will search for and return a group of up to @nr_pages pages
962  * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
963  * reference against the pages in @pvec.
964  *
965  * The search returns a group of mapping-contiguous pages with ascending
966  * indexes.  There may be holes in the indices due to not-present pages.
967  *
968  * pagevec_lookup() returns the number of pages which were found.
969  */
970 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
971                 pgoff_t start, unsigned nr_pages)
972 {
973         pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
974         return pagevec_count(pvec);
975 }
976 EXPORT_SYMBOL(pagevec_lookup);
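
/*
 * A sketch of the classic pagevec_lookup() loop used throughout fs/ and mm/
 * to walk a mapping in PAGEVEC_SIZE batches.  The helper is hypothetical and
 * "inspect page" stands in for whatever per-page work the caller does.
 */
static void sketch_walk_mapping(struct address_space *mapping)
{
        struct pagevec pvec;
        pgoff_t index = 0;

        pagevec_init(&pvec, 0);
        while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
                int i;

                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        index = page->index + 1;        /* resume after this page */
                        /* ... inspect page ... */
                }
                pagevec_release(&pvec);         /* drops the references pagevec_lookup() took */
                cond_resched();
        }
}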
977
978 unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
979                 pgoff_t *index, int tag, unsigned nr_pages)
980 {
981         pvec->nr = find_get_pages_tag(mapping, index, tag,
982                                         nr_pages, pvec->pages);
983         return pagevec_count(pvec);
984 }
985 EXPORT_SYMBOL(pagevec_lookup_tag);
986
987 /*
988  * Perform any setup for the swap system
989  */
990 void __init swap_setup(void)
991 {
992         unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
993 #ifdef CONFIG_SWAP
994         int i;
995
996         if (bdi_init(swapper_spaces[0].backing_dev_info))
997                 panic("Failed to init swap bdi");
998         for (i = 0; i < MAX_SWAPFILES; i++) {
999                 spin_lock_init(&swapper_spaces[i].tree_lock);
1000                 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
1001         }
1002 #endif
1003
1004         /* Use a smaller cluster for small-memory machines */
1005         if (megs < 16)
1006                 page_cluster = 2;
1007         else
1008                 page_cluster = 3;
1009         /*
1010          * Right now other parts of the system mean that we
1011          * _really_ don't want to cluster much more.
1012          */
1013 }
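
/*
 * Worked numbers for the defaults above, assuming the common PAGE_SHIFT of 12:
 * megs = totalram_pages >> (20 - 12) divides the page count by 256, the number
 * of 4 KiB pages per MiB.  page_cluster is a shift count, so the default of 3
 * means swap readahead operates on 1 << 3 = 8 pages at a time, while machines
 * under 16 MiB use 1 << 2 = 4 pages.
 */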