+++ /dev/null
-#
-# Feature name: pmdp_splitting_flush
-# Kconfig: __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-# description: arch supports the pmdp_splitting_flush() VM API
-#
- -----------------------
- | arch |status|
- -----------------------
- | alpha: | TODO |
- | arc: | TODO |
- | arm: | ok |
- | arm64: | ok |
- | avr32: | TODO |
- | blackfin: | TODO |
- | c6x: | TODO |
- | cris: | TODO |
- | frv: | TODO |
- | h8300: | TODO |
- | hexagon: | TODO |
- | ia64: | TODO |
- | m32r: | TODO |
- | m68k: | TODO |
- | metag: | TODO |
- | microblaze: | TODO |
- | mips: | ok |
- | mn10300: | TODO |
- | nios2: | TODO |
- | openrisc: | TODO |
- | parisc: | TODO |
- | powerpc: | ok |
- | s390: | ok |
- | score: | TODO |
- | sh: | TODO |
- | sparc: | TODO |
- | tile: | TODO |
- | um: | TODO |
- | unicore32: | TODO |
- | x86: | ok |
- | xtensa: | TODO |
- -----------------------
VmLib: 1412 kB
VmPTE: 20 kB
VmSwap: 0 kB
+ HugetlbPages: 0 kB
Threads: 1
SigQ: 0/28578
SigPnd: 0000000000000000
VmPTE size of page table entries
VmPMD size of second level page tables
VmSwap size of swap usage (the number of referred swapents)
+ HugetlbPages size of hugetlb memory portions
Threads number of threads
SigQ number of signals queued/max. number for queue
SigPnd bitmap of pending signals for the thread
Private_Dirty: 0 kB
Referenced: 892 kB
Anonymous: 0 kB
+AnonHugePages: 0 kB
+Shared_Hugetlb: 0 kB
+Private_Hugetlb: 0 kB
Swap: 0 kB
SwapPss: 0 kB
KernelPageSize: 4 kB
"Swap" shows how much would-be-anonymous memory is also used, but out on
swap.
"SwapPss" shows proportional swap share of this mapping.
+"AnonHugePages" shows the ammount of memory backed by transparent hugepage.
+"Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by
+hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
+reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
+
"VmFlags" field deserves a separate description. This member represents the kernel
flags associated with the particular virtual memory area in two letter encoded
manner. The codes are the following:
ac - area is accountable
nr - swap space is not reserved for the area
ht - area uses huge tlb pages
- nl - non-linear mapping
ar - architecture specific flag
dd - do not include area into core dump
sd - soft-dirty flag
<bool>: 0,1,yes,no,true,false
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of a file is discarded at umount/evict time
+  when using fallocate with FALLOC_FL_KEEP_SIZE.
+  So, users should assume that the fallocated region can be discarded
+  at the last close if there is memory pressure resulting in eviction
+  of the inode from memory. As a result, for any dependency on the
+  fallocated region, users should make sure to recheck it after
+  reopening the file, as sketched below.
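+
+  A minimal userspace sketch of that recheck on a tmpfs file (the path
+  and helper name here are illustrative, not part of the kernel API):
+
+	#define _GNU_SOURCE
+	#include <fcntl.h>
+	#include <unistd.h>
+
+	/* Reopen the file and redo the KEEP_SIZE preallocation, since the
+	 * old one may have been discarded when the inode was evicted. */
+	int reopen_prealloc(const char *path, off_t len)
+	{
+		int fd = open(path, O_RDWR);
+
+		if (fd < 0)
+			return -1;
+		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, len) < 0) {
+			close(fd);
+			return -1;
+		}
+		return fd;
+	}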
+
TODO
----------------------------------------------------------------------
* Need to get rid of the raw scanning stuff. Instead, always use
-Kernel address sanitizer
-================
+KernelAddressSanitizer (KASAN)
+==============================
0. Overview
===========
-Kernel Address sanitizer (KASan) is a dynamic memory error detector. It provides
+KernelAddressSANitizer (KASAN) is a dynamic memory error detector. It provides
a fast and comprehensive solution for finding use-after-free and out-of-bounds
bugs.
-KASan uses compile-time instrumentation for checking every memory access,
-therefore you will need a gcc version of 4.9.2 or later. KASan could detect out
-of bounds accesses to stack or global variables, but only if gcc 5.0 or later was
-used to built the kernel.
+KASAN uses compile-time instrumentation for checking every memory access,
+therefore you will need GCC version 4.9.2 or later. GCC 5.0 or later is
+required for detection of out-of-bounds accesses to stack or global variables.
-Currently KASan is supported only for x86_64 architecture and requires that the
-kernel be built with the SLUB allocator.
+Currently KASAN is supported only for x86_64 architecture and requires the
+kernel to be built with the SLUB allocator.
1. Usage
-=========
+========
To enable KASAN configure kernel with:
CONFIG_KASAN = y
-and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline/inline
-is compiler instrumentation types. The former produces smaller binary the
-latter is 1.1 - 2 times faster. Inline instrumentation requires a gcc version
-of 5.0 or later.
+and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and
+inline are compiler instrumentation types. The former produces a smaller
+binary while the latter is 1.1 - 2 times faster. Inline instrumentation
+requires GCC version 5.0 or later.
Currently KASAN works only with the SLUB memory allocator.
-For better bug detection and nicer report, enable CONFIG_STACKTRACE and put
-at least 'slub_debug=U' in the boot cmdline.
+For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
To disable instrumentation for specific files or directories, add a line
similar to the following to the respective kernel Makefile:
KASAN_SANITIZE := n
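
As an illustration, a minimal slab out-of-bounds bug of the kind KASAN
reports could look like the sketch below (lib/test_kasan.c contains real
test cases in this style; the function name here is made up):

	static noinline void kmalloc_oob_example(void)
	{
		char *ptr = kmalloc(123, GFP_KERNEL);

		if (!ptr)
			return;
		/* out-of-bounds write, one byte past the object */
		ptr[123] = 'x';
		kfree(ptr);
	}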
1.1 Error reports
-==========
+=================
A typical out of bounds access report looks like this:
ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================
-First sections describe slub object where bad access happened.
-See 'SLUB Debug output' section in Documentation/vm/slub.txt for details.
+The header of the report describes what kind of bug happened and what kind of
+access caused it. It's followed by the description of the accessed slub object
+(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and
+the description of the accessed memory page.
In the last section the report shows memory state around the accessed address.
-Reading this part requires some more understanding of how KASAN works.
+Reading this part requires some understanding of how KASAN works.
-Each 8 bytes of memory are encoded in one shadow byte as accessible,
-partially accessible, freed or they can be part of a redzone.
+The state of each aligned 8-byte region of memory is encoded in one shadow
+byte. Those 8 bytes can be accessible, partially accessible, freed or part
+of a redzone.
We use the following encoding for each shadow byte: 0 means that all 8 bytes
of the corresponding memory region are accessible; number N (1 <= N <= 7) means
that the first N bytes are accessible, and other (8 - N) bytes are not;
2. Implementation details
-========================
+=========================
From a high level, our approach to memory error detection is similar to that
of kmemcheck: use shadow memory to record whether each byte of memory is safe
Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
Default: 1024
+ hardlockup_all_cpu_backtrace=
+ [KNL] Should the hard-lockup detector generate
+ backtraces on all cpus.
+ Format: <integer>
+
hashdist= [KNL,NUMA] Large hashes allocated during boot
are distributed across NUMA nodes. Defaults on
for 64-bit NUMA, off otherwise.
details), without letting other interrupts have a chance to run.
Similarly to the softlockup case, the current stack trace is displayed
upon detection and the system will stay locked up unless the default
-behavior is changed, which can be done through a compile time knob,
-"BOOTPARAM_HARDLOCKUP_PANIC", and a kernel parameter, "nmi_watchdog"
+behavior is changed, which can be done through a sysctl,
+"hardlockup_panic", a compile-time knob, "BOOTPARAM_HARDLOCKUP_PANIC",
+and a kernel parameter, "nmi_watchdog"
(see "Documentation/kernel-parameters.txt" for details).
The panic option can be used in combination with panic_timeout (this
Reminder: sizeof() result is of type size_t.
+The kernel's printf does not support %n. For obvious reasons, floating
+point formats (%e, %f, %g, %a) are also not recognized. Use of any
+unsupported specifier or length qualifier results in a WARN and early
+return from vsnprintf.
Raw pointer value SHOULD be printed with %p. The kernel supports
the following extended format specifiers for pointer types:
If the field width is omitted, only one byte will be escaped.
Raw buffer as a hex string:
+
%*ph 00 01 02 ... 3f
%*phC 00:01:02: ... :3f
%*phD 00-01-02- ... -3f
Passed by reference.
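
For example, a sketch that dumps the first 16 bytes of a buffer (the
variable name is illustrative):

	printk(KERN_DEBUG "buf: %*ph\n", 16, buf);
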
dentry names:
+
%pd{,2,3,4}
%pD{,2,3,4}
va_list *va;
};
+ Implements a "recursive vsnprintf".
+
Do not use this feature without some mechanism to verify the
correctness of the format string and va_list arguments.
Passed by reference.
+Network device features:
+
+ %pNF 0x000000000000c000
+
+ For printing netdev_features_t.
+
+ Passed by reference.
+
+Command from struct task_struct:
+
+ %pT ls
+
+ For printing executable name excluding path from struct
+ task_struct.
+
+ Passed by reference.
+
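+A hypothetical call, assuming the %pT extension described above:
+
+	pr_info("comm: %pT\n", current);
+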
+If you add other %p extensions, please extend lib/test_printf.c with
+one or more test cases, if at all feasible.
+
+
Thank you for your cooperation and attention.
- domainname
+- hardlockup_all_cpu_backtrace
- hostname
- hotplug
- hung_task_panic
- hung_task_check_count
- hung_task_timeout_secs
domain names are in general different. For a detailed discussion
see the hostname(1) man page.
+==============================================================
+hardlockup_all_cpu_backtrace:
+
+This value controls whether the hard lockup detector gathers further
+debug information when a hard lockup condition is detected. If enabled,
+arch-specific all-CPU stack dumping will be initiated.
+
+0: do nothing. This is the default behavior.
+
+1: on detection capture more debug information.
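+
+For example, to enable it (a usage sketch):
+
+	echo 1 > /proc/sys/kernel/hardlockup_all_cpu_backtrace
+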
==============================================================
hotplug:
Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
-Memory balancing is needed for non __GFP_WAIT as well as for non
-__GFP_IO allocations.
+Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
+well as for non __GFP_IO allocations.
-There are two reasons to be requesting non __GFP_WAIT allocations:
-the caller can not sleep (typically intr context), or does not want
-to incur cost overheads of page stealing and possible swap io for
-whatever reasons.
+The first reason why a caller may avoid reclaim is that the caller cannot
+sleep because it holds a spinlock or is in interrupt context. The second may
+be that the caller is willing to fail the allocation without incurring the
+overhead of page reclaim. This may happen for opportunistic high-order
+allocation requests that have order-0 fallback options. In such cases,
+the caller may also wish to avoid waking kswapd.
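+
+As an illustrative sketch (not taken from the kernel source), such an
+opportunistic attempt with an order-0 fallback might look like:
+
+	/* try a high-order page without direct reclaim or kswapd wakeup */
+	page = alloc_pages((GFP_KERNEL & ~__GFP_RECLAIM) | __GFP_NOWARN, order);
+	if (!page)
+		page = alloc_pages(GFP_KERNEL, 0);	/* order-0 fallback */
+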
__GFP_IO allocation requests are made to prevent file system deadlocks.
2. Ensure that writeback is complete.
-3. Prep the new page that we want to move to. It is locked
- and set to not being uptodate so that all accesses to the new
- page immediately lock while the move is in progress.
+3. Lock the new page that we want to move to. It is locked so that accesses to
+   this (not yet uptodate) page immediately block while the move is in progress.
-4. The new page is prepped with some settings from the old page so that
- accesses to the new page will discover a page with the correct settings.
-
-5. All the page table references to the page are converted
- to migration entries or dropped (nonlinear vmas).
- This decrease the mapcount of a page. If the resulting
- mapcount is not zero then we do not migrate the page.
- All user space processes that attempt to access the page
- will now wait on the page lock.
+4. All the page table references to the page are converted to migration
+ entries. This decreases the mapcount of a page. If the resulting
+ mapcount is not zero then we do not migrate the page. All user space
+ processes that attempt to access the page will now wait on the page lock.
-6. The radix tree lock is taken. This will cause all processes trying
+5. The radix tree lock is taken. This will cause all processes trying
to access the page via the mapping to block on the radix tree spinlock.
-7. The refcount of the page is examined and we back out if references remain
+6. The refcount of the page is examined and we back out if references remain
otherwise we know that we are the only one referencing this page.
-8. The radix tree is checked and if it does not contain the pointer to this
+7. The radix tree is checked and if it does not contain the pointer to this
page then we back out because someone else modified the radix tree.
+8. The new page is prepped with some settings from the old page so that
+ accesses to the new page will discover a page with the correct settings.
+
9. The radix tree is changed to point to the new page.
10. The reference count of the old page is dropped because the radix tree
which must be called on PTE table allocation / freeing.
Make sure the architecture doesn't use slab allocator for page table
-allocation: slab uses page->slab_cache and page->first_page for its pages.
-These fields share storage with page->ptl.
+allocation: slab uses page->slab_cache for its pages.
+This field shares storage with page->ptl.
PMD split lock only makes sense if you have more than two page table
levels.
== Design ==
-- "graceful fallback": mm components which don't have transparent
- hugepage knowledge fall back to breaking a transparent hugepage and
- working on the regular pages and their respective regular pmd/pte
- mappings
+- "graceful fallback": mm components which don't have transparent hugepage
+ knowledge fall back to breaking huge pmd mapping into table of ptes and,
+ if necessary, split a transparent hugepage. Therefore these components
+ can continue working on the regular pages or regular pte mappings.
- if a hugepage allocation fails because of memory fragmentation,
regular pages should be gracefully allocated instead and mixed in
max_ptes_none can waste cpu time very little, you can
ignore it.
+max_ptes_swap specifies how many pages can be brought in from
+swap when collapsing a group of pages into a transparent huge page.
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
+
+A higher value can cause excessive swap IO and waste
+memory. A lower value can prevent THPs from being
+collapsed, resulting in fewer pages being collapsed into
+THPs, and lower memory access performance.
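+
+For example, to make khugepaged more conservative about swapping pages
+in (a usage sketch):
+
+	echo 8 > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap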
+
== Boot parameter ==
You can change the sysfs boot time defaults of Transparent Hugepage
of pages that should be collapsed into one huge page but failed
the allocation.
-thp_split is incremented every time a huge page is split into base
+thp_split_page is incremented every time a huge page is split into base
pages. This can happen for a variety of reasons but a common
reason is that a huge page is old and is being reclaimed.
+	This action implies splitting all PMDs the page is mapped with.
+
+thp_split_page_failed is incremented if the kernel fails to split a huge
+	page. This can happen if the page was pinned by somebody.
+
+thp_split_pmd is incremented every time a PMD is split into a table of PTEs.
+	This can happen, for instance, when an application calls mprotect() or
+	munmap() on part of a huge page. It doesn't split the huge page, only
+	the page table entry.
thp_zero_page_alloc is incremented every time a huge zero page is
successfully allocated. It includes allocations which were
if any driver is going to mangle over the page structure of the tail
page (like for checking page->mapping or other bits that are relevant
for the head page and not the tail page), it should be updated to jump
-to check head page instead (while serializing properly against
-split_huge_page() to avoid the head and tail pages to disappear from
-under it, see the futex code to see an example of that, hugetlbfs also
-needed special handling in futex code for similar reasons).
+to check the head page instead. Taking a reference on any head/tail page
+would prevent the page from being split by anyone.
NOTE: these aren't new constraints to the GUP API, and they match the
same constraints that apply to hugetlbfs too, so any driver capable
== Graceful fallback ==
Code walking pagetables but unaware of huge pmds can simply call
-split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_page_pmd where
+by just grepping for "pmd_offset" and adding split_huge_pmd where
missing after pmd_offset returns the pmd. Thanks to the graceful
fallback design, with a one-liner change, you can avoid writing
hundreds if not thousands of lines of complex code to make your code
If you're not walking pagetables but you run into a physical hugepage
but you can't handle it natively in your code, you can split it by
calling split_huge_page(page). This is what the Linux VM does before
-it tries to swapout the hugepage for example.
+it tries to swapout the hugepage for example. split_huge_page() can fail
+if the page is pinned and you must handle this correctly.
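+
+A sketch of handling that failure (the error code is illustrative):
+
+	if (split_huge_page(page)) {
+		/* the page is pinned: back out and let the caller retry */
+		unlock_page(page);
+		return -EBUSY;
+	}
+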
Example to make mremap.c transparent hugepage aware with a one liner
change:
return NULL;
pmd = pmd_offset(pud, addr);
-+ split_huge_page_pmd(vma, addr, pmd);
++ split_huge_pmd(vma, pmd, addr);
if (pmd_none_or_clear_bad(pmd))
return NULL;
== Locking in hugepage aware code ==
We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_page_pmd() has a cost.
+split_huge_page() or split_huge_pmd() has a cost.
To make pagetable walks huge pmd aware, all you need to do is to call
pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
takes the mmap_sem in write mode in addition to the anon_vma lock). If
pmd_trans_huge returns false, you just fall back to the old code
paths. If instead pmd_trans_huge returns true, you have to take the
-mm->page_table_lock and re-run pmd_trans_huge. Taking the
-page_table_lock will prevent the huge pmd to be converted into a
-regular pmd from under you (split_huge_page can run in parallel to the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd from being converted into a
+regular pmd under you (split_huge_pmd can run in parallel to the
pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page_table_lock and fallback to the old code as
-before. Otherwise you should run pmd_trans_splitting on the pmd. In
-case pmd_trans_splitting returns true, it means split_huge_page is
-already in the middle of splitting the page. So if pmd_trans_splitting
-returns true it's enough to drop the page_table_lock and call
-wait_split_huge_page and then fallback the old code paths. You are
-guaranteed by the time wait_split_huge_page returns, the pmd isn't
-huge anymore. If pmd_trans_splitting returns false, you can proceed to
-process the huge pmd and the hugepage natively. Once finished you can
-drop the page_table_lock.
-
-== compound_lock, get_user_pages and put_page ==
+should just drop the page table lock and fall back to the old code as
+before. Otherwise you can proceed to process the huge pmd and the
+hugepage natively. Once finished you can drop the page table lock.
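+
+A sketch of the pattern just described, assuming mm, addr and pud are
+already in scope:
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_trans_huge(*pmd)) {
+		spinlock_t *ptl = pmd_lock(mm, pmd);
+
+		if (pmd_trans_huge(*pmd)) {
+			/* operate on the huge pmd / hugepage natively */
+			spin_unlock(ptl);
+			return;
+		}
+		spin_unlock(ptl);
+	}
+	/* fall back to the regular pte paths */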
+
+== Refcounts and transparent huge pages ==
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+ - get_page()/put_page() and GUP operate on the head page's ->_count.
+
+ - ->_count in tail pages is always zero: get_page_unless_zero() never
+   succeeds on tail pages.
+
+ - map/unmap of a page with a PTE entry increments/decrements ->_mapcount
+   on the relevant sub-page of the compound page.
+
+ - map/unmap of the whole compound page is accounted in compound_mapcount
+   (stored in the first tail page).
+
+PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one.
+This additional reference is required to get race-free detection of unmap of
+subpages when we have them mapped with both PMDs and PTEs.
+
+This optimization is required to lower the overhead of per-subpage mapcount
+tracking. The alternative is to alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+We set PG_double_map when a PMD of the page gets split for the first time,
+but the page still has a PMD mapping. The additional references go away with
+the last compound_mapcount.
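+
+A sketch of how a subpage's effective mapcount follows from this scheme
+(compare page_mapcount(); the helper name here is illustrative):
+
+	static int subpage_mapcount(struct page *page)
+	{
+		int ret = atomic_read(&page->_mapcount) + 1;
+
+		page = compound_head(page);
+		ret += compound_mapcount(page);
+		if (PageDoubleMap(page))
+			ret--;	/* compensate for the doublemap offset */
+		return ret;
+	}
+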
split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the
-page structures. It can do that easily for refcounts taken by huge pmd
-mappings. But the GUI API as created by hugetlbfs (that returns head
-and tail pages if running get_user_pages on an address backed by any
-hugepage), requires the refcount to be accounted on the tail pages and
-not only in the head pages, if we want to be able to run
-split_huge_page while there are gup pins established on any tail
-page. Failure to be able to run split_huge_page if there's any gup pin
-on any tail page, would mean having to split all hugepages upfront in
-get_user_pages which is unacceptable as too many gup users are
-performance critical and they must work natively on hugepages like
-they work natively on hugetlbfs already (hugetlbfs is simpler because
-hugetlbfs pages cannot be split so there wouldn't be requirement of
-accounting the pins on the tail pages for hugetlbfs). If we wouldn't
-account the gup refcounts on the tail pages during gup, we won't know
-anymore which tail page is pinned by gup and which is not while we run
-split_huge_page. But we still have to add the gup pin to the head page
-too, to know when we can free the compound page in case it's never
-split during its lifetime. That requires changing not just
-get_page, but put_page as well so that when put_page runs on a tail
-page (and only on a tail page) it will find its respective head page,
-and then it will decrease the head page refcount in addition to the
-tail page refcount. To obtain a head page reliably and to decrease its
-refcount without race conditions, put_page has to serialize against
-__split_huge_page_refcount using a special per-page lock called
-compound_lock.
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries. But we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+request to split a pinned huge page: it expects the page count to be equal
+to the sum of the mapcounts of all sub-pages plus one (the split_huge_page
+caller must have a reference to the head page).
+
+split_huge_page uses migration entries to stabilize page->_count and
+page->_mapcount.
+
+We are safe against physical memory scanners too: the only legitimate way
+a scanner can get a reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_count until the atomic_add(). This prevents a
+scanner from getting a reference to a tail page up to that point. After the
+atomic_add() we don't care about the ->_count value. We already know how
+many references we should uncharge from the head page.
+
+For the head page get_page_unless_zero() will succeed and we don't mind. It's
+clear where the reference should go after the split: it will stay on the head
+page.
+
+Note that split_huge_pmd() doesn't have any limitations on refcounting:
+a pmd can be split at any point and the split never fails.
+
+== Partial unmap and deferred_split_huge_page() ==
+
+Unmapping part of a THP (with munmap() or another way) is not going to free
+memory immediately. Instead, we detect that a subpage of the THP is not in
+use in page_remove_rmap() and queue the THP for splitting if memory pressure
+comes. Splitting will free up the unused subpages.
+
+Splitting the page right away is not an option due to the locking context in
+the place where we can detect partial unmap. It also might be
+counterproductive, since in many cases the partial unmap happens during
+exit(2) if a THP crosses a VMA boundary.
+
+The function deferred_split_huge_page() is used to queue a page for
+splitting. The splitting itself will happen when we get memory pressure
+via the shrinker interface.
try_to_unmap() is always called, by either vmscan for reclaim or for page
migration, with the argument page locked and isolated from the LRU. Separate
-functions handle anonymous and mapped file pages, as these types of pages have
-different reverse map mechanisms.
-
- (*) try_to_unmap_anon()
-
- To unmap anonymous pages, each VMA in the list anchored in the anon_vma
- must be visited - at least until a VM_LOCKED VMA is encountered. If the
- page is being unmapped for migration, VM_LOCKED VMAs do not stop the
- process because mlocked pages are migratable. However, for reclaim, if
- the page is mapped into a VM_LOCKED VMA, the scan stops.
-
- try_to_unmap_anon() attempts to acquire in read mode the mmap semaphore of
- the mm_struct to which the VMA belongs. If this is successful, it will
- mlock the page via mlock_vma_page() - we wouldn't have gotten to
- try_to_unmap_anon() if the page were already mlocked - and will return
- SWAP_MLOCK, indicating that the page is unevictable.
-
- If the mmap semaphore cannot be acquired, we are not sure whether the page
- is really unevictable or not. In this case, try_to_unmap_anon() will
- return SWAP_AGAIN.
-
- (*) try_to_unmap_file() - linear mappings
-
- Unmapping of a mapped file page works the same as for anonymous mappings,
- except that the scan visits all VMAs that map the page's index/page offset
- in the page's mapping's reverse map priority search tree. It also visits
- each VMA in the page's mapping's non-linear list, if the list is
- non-empty.
-
- As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file
- page, try_to_unmap_file() will attempt to acquire the associated
- mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this
- is successful, and SWAP_AGAIN, if not.
-
- (*) try_to_unmap_file() - non-linear mappings
-
- If a page's mapping contains a non-empty non-linear mapping VMA list, then
- try_to_un{map|lock}() must also visit each VMA in that list to determine
- whether the page is mapped in a VM_LOCKED VMA. Again, the scan must visit
- all VMAs in the non-linear list to ensure that the pages is not/should not
- be mlocked.
-
- If a VM_LOCKED VMA is found in the list, the scan could terminate.
- However, there is no easy way to determine whether the page is actually
- mapped in a given VMA - either for unmapping or testing whether the
- VM_LOCKED VMA actually pins the page.
-
- try_to_unmap_file() handles non-linear mappings by scanning a certain
- number of pages - a "cluster" - in each non-linear VMA associated with the
- page's mapping, for each file mapped page that vmscan tries to unmap. If
- this happens to unmap the page we're trying to unmap, try_to_unmap() will
- notice this on return (page_mapcount(page) will be 0) and return
- SWAP_SUCCESS. Otherwise, it will return SWAP_AGAIN, causing vmscan to
- recirculate this page. We take advantage of the cluster scan in
- try_to_unmap_cluster() as follows:
-
- For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the
- mmap semaphore of the associated mm_struct for read without blocking.
-
- If this attempt is successful and the VMA is VM_LOCKED,
- try_to_unmap_cluster() will retain the mmap semaphore for the scan;
- otherwise it drops it here.
-
- Then, for each page in the cluster, if we're holding the mmap semaphore
- for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to
- mlock the page. This call is a no-op if the page is already locked,
- but will mlock any pages in the non-linear mapping that happen to be
- unlocked.
-
- If one of the pages so mlocked is the page passed in to try_to_unmap(),
- try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default
- SWAP_AGAIN. This will allow vmscan to cull the page, rather than
- recirculating it on the inactive list.
-
- Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it
- returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED
- VMA, but couldn't be mlocked.
+functions handle anonymous and mapped file and KSM pages, as these types of
+pages have different reverse map lookup mechanisms, with different locking.
+In each case, whether rmap_walk_anon() or rmap_walk_file() or rmap_walk_ksm(),
+it will call try_to_unmap_one() for every VMA which might contain the page.
+
+When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED
+VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it,
+and return SWAP_MLOCK to indicate that the page is unevictable: and the scan
+stops there.
+
+mlock_vma_page() is called while holding the page table's lock (in addition
+to the page lock, and the rmap lock): to serialize against concurrent mlock or
+munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
+holepunching, and truncation of file pages and their anonymous COWed pages.
try_to_munlock() REVERSE MAP SCAN
introduced a variant of try_to_unmap() called try_to_munlock().
try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
-mapped file pages with an additional argument specifying unlock versus unmap
+mapped file and KSM pages with a flag argument specifying unlock versus unmap
processing. Again, these functions walk the respective reverse maps looking
-for VM_LOCKED VMAs. When such a VMA is found for anonymous pages and file
-pages mapped in linear VMAs, as in the try_to_unmap() case, the functions
-attempt to acquire the associated mmap semaphore, mlock the page via
-mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the
-pre-clearing of the page's PG_mlocked done by munlock_vma_page.
-
-If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap
-semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to
-recycle the page on the inactive list and hope that it has better luck with the
-page next time.
-
-For file pages mapped into non-linear VMAs, the try_to_munlock() logic works
-slightly differently. On encountering a VM_LOCKED non-linear VMA that might
-map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the
-page. munlock_vma_page() will just leave the page unlocked and let vmscan deal
-with it - the usual fallback position.
+for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case,
+the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK. This
+undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page.
Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
-However, the scan can terminate when it encounters a VM_LOCKED VMA and can
-successfully acquire the VMA's mmap semaphore for read and mlock the page.
+However, the scan can terminate when it encounters a VM_LOCKED VMA.
Although try_to_munlock() might be called a great many times when munlocking a
large region or tearing down a large address space that has been mlocked via
mlockall(), overall this is a fairly rare event.
(3) mlocked pages that could not be isolated from the LRU and moved to the
unevictable list in mlock_vma_page().
- (4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't
- acquire the VMA's mmap semaphore to test the flags and set PageMlocked.
- munlock_vma_page() was forced to let the page back on to the normal LRU
- list for vmscan to handle.
-
shrink_inactive_list() also diverts any unevictable pages that it finds on the
inactive lists to the appropriate zone's unevictable list.
F: Documentation/filesystems/nilfs2.txt
F: fs/nilfs2/
F: include/linux/nilfs2_fs.h
+F: include/trace/events/nilfs2.h
NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER
M: YOKOTA Hiroshi <yokota@netlab.is.tsukuba.ac.jp>
#define MCL_CURRENT 8192 /* lock all currently mapped pages */
#define MCL_FUTURE 16384 /* lock all additions to address space */
+#define MCL_ONFAULT 32768 /* lock all pages that are faulted in */
+
+#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_SPACEAVAIL 5 /* ensure resources are available */
#define MADV_DONTNEED 6 /* don't need these pages */
+#define MADV_FREE 7 /* free pages only if memory pressure */
/* common/generic parameters */
#define MADV_REMOVE 9 /* remove these pages & resources */
*/
if (!mapping_mapped(mapping)) {
clear_bit(PG_dc_clean, &page->flags);
- } else if (page_mapped(page)) {
+ } else if (page_mapcount(page)) {
/* kernel reading from page with U-mapping */
phys_addr_t paddr = (unsigned long)page_address(page);
* For !VIPT cache, all of this gets compiled out as
* addr_not_cache_congruent() is 0
*/
- if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
+ if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
__flush_dcache_page((unsigned long)kfrom, u_vaddr);
clean_src_k_mappings = 1;
}
#define L_PMD_SECT_VALID (_AT(pmdval_t, 1) << 0)
#define L_PMD_SECT_DIRTY (_AT(pmdval_t, 1) << 55)
-#define L_PMD_SECT_SPLITTING (_AT(pmdval_t, 1) << 56)
#define L_PMD_SECT_NONE (_AT(pmdval_t, 1) << 57)
#define L_PMD_SECT_RDONLY (_AT(pteval_t, 1) << 58)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd))
-#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp);
-#endif
#endif
#define PMD_BIT_FUNC(fn,op) \
PMD_BIT_FUNC(wrprotect, |= L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF);
-PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY);
+PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY);
PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
*
* Lock the page table for the destination and check
* to see that it's still huge and whether or not we will
- * need to fault on write, or if we have a splitting THP.
+ * need to fault on write.
*/
if (unlikely(pmd_thp_or_huge(*pmd))) {
ptl = ¤t->mm->page_table_lock;
spin_lock(ptl);
if (unlikely(!pmd_thp_or_huge(*pmd)
- || pmd_hugewillfault(*pmd)
- || pmd_trans_splitting(*pmd))) {
+ || pmd_hugewillfault(*pmd))) {
spin_unlock(ptl);
return 0;
}
}
}
} else {
- fault = probe_kernel_address(instrptr, instr);
+ fault = probe_kernel_address((void *)instrptr, instr);
instr = __mem_to_opcode_arm(instr);
}
if (nommu())
addr = __alloc_simple_buffer(dev, size, gfp, &page);
- else if (dev_get_cma_area(dev) && (gfp & __GFP_WAIT))
+ else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM))
addr = __alloc_from_contiguous(dev, size, prot, &page,
caller, want_vaddr);
else if (is_coherent)
addr = __alloc_simple_buffer(dev, size, gfp, &page);
- else if (!(gfp & __GFP_WAIT))
+ else if (!gfpflags_allow_blocking(gfp))
addr = __alloc_from_pool(size, &page);
else
addr = __alloc_remap_buffer(dev, size, gfp, prot, &page,
*handle = DMA_ERROR_CODE;
size = PAGE_ALIGN(size);
- if (!(gfp & __GFP_WAIT))
+ if (!gfpflags_allow_blocking(gfp))
return __iommu_alloc_atomic(dev, size, handle);
/*
mapping = page_mapping(page);
if (!cache_ops_need_broadcast() &&
- mapping && !page_mapped(page))
+ mapping && !page_mapcount(page))
clear_bit(PG_dcache_clean, &page->flags);
else {
__flush_dcache_page(mapping, page);
*/
__cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
- VM_BUG_ON(address & ~PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
- /* dummy IPI to serialise against fast_gup */
- kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
unsigned long xen_get_swiotlb_free_pages(unsigned int order)
{
struct memblock_region *reg;
- gfp_t flags = __GFP_NOWARN;
+ gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM;
for_each_memblock(memory, reg) {
if (reg->base < (phys_addr_t)0xffffffff) {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
-#define pmd_trans_splitting(pmd) pte_special(pmd_pte(pmd))
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-struct vm_area_struct;
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp);
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
-#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
if (IS_ENABLED(CONFIG_ZONE_DMA) &&
dev->coherent_dma_mask <= DMA_BIT_MASK(32))
flags |= GFP_DMA;
- if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) {
+ if (dev_get_cma_area(dev) && gfpflags_allow_blocking(flags)) {
struct page *page;
void *addr;
size = PAGE_ALIGN(size);
- if (!coherent && !(flags & __GFP_WAIT)) {
+ if (!coherent && !gfpflags_allow_blocking(flags)) {
struct page *page = NULL;
void *addr = __alloc_from_pool(size, &page, flags);
*/
gfp |= __GFP_ZERO;
- if (gfp & __GFP_WAIT) {
+ if (gfpflags_allow_blocking(gfp)) {
struct page **pages;
pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
* Additional functions defined in assembly.
*/
EXPORT_SYMBOL(flush_icache_range);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
-
- VM_BUG_ON(address & ~PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
- /* dummy IPI to serialise against fast_gup */
- kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/* Huge TLB page */
#define _PAGE_HUGE_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
#define _PAGE_HUGE (1 << _PAGE_HUGE_SHIFT)
-#define _PAGE_SPLITTING_SHIFT (_PAGE_HUGE_SHIFT + 1)
-#define _PAGE_SPLITTING (1 << _PAGE_SPLITTING_SHIFT)
#endif /* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
/* XI - page cannot be executed */
-#ifdef _PAGE_SPLITTING_SHIFT
-#define _PAGE_NO_EXEC_SHIFT (_PAGE_SPLITTING_SHIFT + 1)
+#ifdef _PAGE_HUGE_SHIFT
+#define _PAGE_NO_EXEC_SHIFT (_PAGE_HUGE_SHIFT + 1)
#else
#define _PAGE_NO_EXEC_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
#endif
#if defined(_PAGE_NO_READ_SHIFT)
#define _PAGE_GLOBAL_SHIFT (_PAGE_NO_READ_SHIFT + 1)
-#elif defined(_PAGE_SPLITTING_SHIFT)
-#define _PAGE_GLOBAL_SHIFT (_PAGE_SPLITTING_SHIFT + 1)
+#elif defined(_PAGE_HUGE_SHIFT)
+#define _PAGE_GLOBAL_SHIFT (_PAGE_HUGE_SHIFT + 1)
#else
#define _PAGE_GLOBAL_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
#endif
return pmd;
}
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return !!(pmd_val(pmd) & _PAGE_SPLITTING);
-}
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- pmd_val(pmd) |= _PAGE_SPLITTING;
-
- return pmd;
-}
-
extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd);
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-/* Extern to avoid header file madness */
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmdp);
-
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
*/
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
* another ASID than the current one.
*/
map_coherent = (cpu_has_dc_aliases &&
- page_mapped(page) && !Page_dcache_dirty(page));
+ page_mapcount(page) &&
+ !Page_dcache_dirty(page));
if (map_coherent)
vaddr = kmap_coherent(page, addr);
else
unsigned long addr = (unsigned long) page_address(page);
if (pages_do_alias(addr, vmaddr)) {
- if (page_mapped(page) && !Page_dcache_dirty(page)) {
+ if (page_mapcount(page) && !Page_dcache_dirty(page)) {
void *kaddr;
kaddr = kmap_coherent(page, vmaddr);
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush has to flush the tlb, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state. gup-fast
- * can't because it has irq disabled and
- * wait_split_huge_page() would never return as the
- * tlb flush IPI wouldn't run.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_huge(pmd))) {
if (!gup_huge_pmd(pmd, addr, next, write, pages,nr))
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
vto = kmap_atomic(to);
if (cpu_has_dc_aliases &&
- page_mapped(from) && !Page_dcache_dirty(from)) {
+ page_mapcount(from) && !Page_dcache_dirty(from)) {
vfrom = kmap_coherent(from, vaddr);
copy_page(vto, vfrom);
kunmap_coherent();
unsigned long len)
{
if (cpu_has_dc_aliases &&
- page_mapped(page) && !Page_dcache_dirty(page)) {
+ page_mapcount(page) && !Page_dcache_dirty(page)) {
void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(vto, src, len);
kunmap_coherent();
unsigned long len)
{
if (cpu_has_dc_aliases &&
- page_mapped(page) && !Page_dcache_dirty(page)) {
+ page_mapcount(page) && !Page_dcache_dirty(page)) {
void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(dst, vfrom, len);
kunmap_coherent();
}
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmdp)
-{
- if (!pmd_trans_splitting(*pmdp)) {
- pmd_t pmd = pmd_mksplitting(*pmdp);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- }
-}
-
-#endif
-
pmd_t mk_pmd(struct page *page, pgprot_t prot)
{
pmd_t pmd;
pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT);
#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT);
- pr_define("_PAGE_SPLITTING_SHIFT %d\n", _PAGE_SPLITTING_SHIFT);
#endif
#ifdef CONFIG_CPU_MIPSR2
if (cpu_has_rixi) {
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */
#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */
#define MADV_VPS_INHERIT 7 /* Inherit parents page size */
+#define MADV_FREE 8 /* free pages only if memory pressure */
/* common/generic parameters */
#define MADV_REMOVE 9 /* remove these pages & resources */
void pgtable_cache_init(void);
#endif /* __ASSEMBLY__ */
-/*
- * THP pages can't be special. So use the _PAGE_SPECIAL
- */
-#define _PAGE_SPLITTING _PAGE_SPECIAL
-
/*
* We need to differentiate between explicit huge page and THP huge
* page, since THP huge page also need to track real subpage details
/*
* set of bits not changed in pmd_modify.
*/
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | \
- _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
- _PAGE_THP_HUGE)
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+ _PAGE_ACCESSED | _PAGE_THP_HUGE)
#ifndef __ASSEMBLY__
/*
return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
}
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- if (pmd_trans_huge(pmd))
- return pmd_val(pmd) & _PAGE_SPLITTING;
- return 0;
-}
-
extern int has_transparent_hugepage(void);
#else
static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
return pmd;
}
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- pmd_val(pmd) |= _PAGE_SPLITTING;
- return pmd;
-}
-
#define __HAVE_ARCH_PMD_SAME
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
}
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
-
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
#define pmdp_collapse_flush pmdp_collapse_flush
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */
+#define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
/* If PMD busy, retry the access */
if (unlikely(old_pmd & _PAGE_BUSY))
return 0;
- /* If PMD is trans splitting retry the access */
- if (unlikely(old_pmd & _PAGE_SPLITTING))
- return 0;
/* If PMD permissions don't match, take page fault */
if (unlikely(access & ~old_pmd))
return 1;
/*
* A hugepage collapse is captured by pmd_none, because
* it marks the pmd none and does a hpte invalidate.
- *
- * We don't worry about pmd_trans_splitting here, The
- * caller if it needs to handle the splitting case
- * should check for that.
*/
if (pmd_none(pmd))
return NULL;
{
unsigned long mask;
unsigned long pte_end;
- struct page *head, *page, *tail;
+ struct page *head, *page;
pte_t pte;
int refs;
head = pte_page(pte);
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
return 0;
}
- /*
- * Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
setup_nr_node_ids();
/* allocate the map */
- for (node = 0; node < nr_node_ids; node++)
+ for_each_node(node)
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}
-/*
- * We mark the pmd splitting and invalidate all the hpte
- * entries for this hugepage.
- */
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- unsigned long old, tmp;
-
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
-#endif
-
-#ifdef PTE_ATOMIC_UPDATES
-
- __asm__ __volatile__(
- "1: ldarx %0,0,%3\n\
- andi. %1,%0,%6\n\
- bne- 1b \n\
- ori %1,%0,%4 \n\
- stdcx. %1,0,%3 \n\
- bne- 1b"
- : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
- : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
- : "cc" );
-#else
- old = pmd_val(*pmdp);
- *pmdp = __pmd(old | _PAGE_SPLITTING);
-#endif
- /*
- * If we didn't had the splitting flag set, go and flush the
- * HPTE entries.
- */
- trace_hugepage_splitting(address, old);
- if (!(old & _PAGE_SPLITTING)) {
- /* We need to flush the hpte */
- if (old & _PAGE_HASHPTE)
- hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
- }
- /*
- * This ensures that generic code that rely on IRQ disabling
- * to prevent a parallel THP split work as expected.
- */
- kick_all_cpus_sync();
-}
-
/*
* We want to put the pgtable in pmd and use pgtable for tracking
* the base page size hptes
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
return 0;
}
ret = get_user(regs->nip, &inst);
pagefault_enable();
} else {
- ret = probe_kernel_address(regs->nip, inst);
+ ret = probe_kernel_address((void *)regs->nip, inst);
}
if (!ret && mcheck_handle_load(regs, inst)) {
#define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */
#define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */
-#define _SEGMENT_ENTRY_SPLIT 0x0800 /* THP splitting bit */
#define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */
#define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */
#define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */
* SW-bits: y young, d dirty, r read, w write
*/
-#define _SEGMENT_ENTRY_SPLIT_BIT 11 /* THP splitting bit number */
-
/* Page status table bits for virtualization */
#define PGSTE_ACC_BITS 0xf000000000000000UL
#define PGSTE_FP_BIT 0x0800000000000000UL
return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
}
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp);
-
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
if (pmd_large(pmd)) {
pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
_SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
- _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT |
- _SEGMENT_ENTRY_SOFT_DIRTY;
+ _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
pmd_val(pmd) |= massage_pgprot_pmd(newprot);
if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
#define __HAVE_ARCH_PGTABLE_WITHDRAW
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) &&
- (pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT);
-}
-
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t entry)
{
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask, result;
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
result = write ? 0 : _SEGMENT_ENTRY_PROTECT;
refs = 0;
head = pmd_page(pmd);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
return 0;
}
- /*
- * Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
pmd = *pmdp;
barrier();
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush() has to serialize with
- * smp_call_function() against our disabled IRQs, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
/*
return 1;
}
-static void pmdp_splitting_flush_sync(void *arg)
-{
- /* Simply deliver the interrupt */
-}
-
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
- (unsigned long *) pmdp)) {
- /* need to serialize against gup-fast (IRQ disabled) */
- smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
- }
-}
-
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
/* Sign extend */
regcache[dest] =
- ((((s64)(u64)op >> 10) & 0xffff) << 54) >> 54;
+ sign_extend64((((u64)op >> 10) & 0xffff), 9);
break;
case (0xd0 >> 2): /* addi */
case (0xd4 >> 2): /* addi.l */
if (displacement_not_indexed) {
__s64 displacement;
displacement = (opcode >> 10) & 0x3ff;
- displacement = ((displacement << 54) >> 54); /* sign extend */
+ displacement = sign_extend64(displacement, 9);
addr = (__u64)((__s64)base_address + (displacement << width_shift));
} else {
__u64 offset;
*/
map_coherent = (current_cpu_data.dcache.n_aliases &&
test_bit(PG_dcache_clean, &page->flags) &&
- page_mapped(page));
+ page_mapcount(page));
if (map_coherent)
vaddr = kmap_coherent(page, address);
else
unsigned long vaddr, void *dst, const void *src,
unsigned long len)
{
- if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
test_bit(PG_dcache_clean, &page->flags)) {
void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(vto, src, len);
unsigned long vaddr, void *dst, const void *src,
unsigned long len)
{
- if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
test_bit(PG_dcache_clean, &page->flags)) {
void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(dst, vfrom, len);
vto = kmap_atomic(to);
- if (boot_cpu_data.dcache.n_aliases && page_mapped(from) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(from) &&
test_bit(PG_dcache_clean, &from->flags)) {
vfrom = kmap_coherent(from, vaddr);
copy_page(vto, vfrom);
unsigned long addr = (unsigned long) page_address(page);
if (pages_do_alias(addr, vmaddr)) {
- if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+ if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
test_bit(PG_dcache_clean, &page->flags)) {
void *kaddr;
return pte_val(pte) & _PAGE_PMD_HUGE;
}
-static inline unsigned long pmd_trans_splitting(pmd_t pmd)
-{
- pte_t pte = __pte(pmd_val(pmd));
-
- return pmd_trans_huge(pmd) && pte_special(pte);
-}
-
#define has_transparent_hugepage() 1
static inline pmd_t pmd_mkold(pmd_t pmd)
return __pmd(pte_val(pte));
}
-static inline pmd_t pmd_mkyoung(pmd_t pmd)
+static inline pmd_t pmd_mkclean(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
- pte = pte_mkyoung(pte);
+ pte = pte_mkclean(pte);
return __pmd(pte_val(pte));
}
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
- pte = pte_mkwrite(pte);
+ pte = pte_mkyoung(pte);
return __pmd(pte_val(pte));
}
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
- pte = pte_mkspecial(pte);
+ pte = pte_mkwrite(pte);
return __pmd(pte_val(pte));
}
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */
+#define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmdp)) {
- if (pmd_trans_splitting(*pmdp))
- goto out_irq_enable;
-
pa = pmd_pfn(*pmdp) << PAGE_SHIFT;
pa += tpc & ~HPAGE_MASK;
put_page(head);
return 0;
}
- if (head != page)
- get_huge_page_tail(page);
pages[*nr] = page;
(*nr)++;
unsigned long end, int write, struct page **pages,
int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (!(pmd_val(pmd) & _PAGE_VALID))
refs = 0;
head = pmd_page(pmd);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
return 0;
}
- /* Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
if (!gup_huge_pmd(pmdp, pmd, addr, next,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define has_transparent_hugepage() 1
#define pmd_trans_huge pmd_huge_page
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
- return pte_pmd(hv_pte_set_client2(pmd_pte(pmd)));
-}
-
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return hv_pte_get_client2(pmd_pte(pmd));
-}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
*/
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
#endif /* _ASM_TILE_MMAN_H */
def_bool y
depends on NUMA
+config HAVE_MEMORYLESS_NODES
+ def_bool NUMA
+
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
def_bool y
depends on X86_64 || X86_PAE
# Changed by many, many contributors over the years.
#
+KASAN_SANITIZE := n
+
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
# The number is the same as you would ordinarily press at bootup.
-KASAN_SANITIZE := n
-
SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
targets := vmlinux.bin setup.bin setup.elf bzImage
373 i386 shutdown sys_shutdown
374 i386 userfaultfd sys_userfaultfd
375 i386 membarrier sys_membarrier
+376 i386 mlock2 sys_mlock2
322 64 execveat stub_execveat
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
+325 common mlock2 sys_mlock2
#
# x32-specific system call numbers start at 512 to avoid cache impact
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return pmd_val(pmd) & _PAGE_SPLITTING;
-}
-
static inline int pmd_trans_huge(pmd_t pmd)
{
return pmd_val(pmd) & _PAGE_PSE;
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_DIRTY);
+}
+
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_RW);
unsigned long address, pmd_t *pmdp);
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp);
-
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
-#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
-#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
#define __HAVE_ARCH_PTE_SPECIAL
#ifdef CONFIG_KMEMCHECK
nid = acpi_get_node(handle);
if (nid != -1) {
+ if (try_online_node(nid)) {
+ pr_warn("failed to online node%d for CPU%d, use node%d instead.\n",
+ nid, cpu, first_node(node_online_map));
+ nid = first_node(node_online_map);
+ }
set_apicid_to_node(physid, nid);
numa_set_node(cpu, nid);
+ set_cpu_numa_mem(cpu, local_memory_node(nid));
}
#endif
}
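
With CONFIG_HAVE_MEMORYLESS_NODES (added in the Kconfig hunk above), each
CPU caches its nearest memory-bearing node via set_cpu_numa_mem(), and
allocation sites can read it back with cpu_to_mem(). A hypothetical helper
illustrating the pattern (not part of the patch):

	static struct page *alloc_near_cpu(int cpu)
	{
		/* cpu_to_mem() names a node that has memory even when
		 * cpu_to_node(cpu) is memoryless: */
		return __alloc_pages_node(cpu_to_mem(cpu),
					  GFP_KERNEL | __GFP_ZERO, 0);
	}
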
{
#ifdef CONFIG_ACPI_NUMA
set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+ set_cpu_numa_mem(cpu, NUMA_NO_NODE);
#endif
- per_cpu(x86_cpu_to_apicid, cpu) = -1;
+ per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
set_cpu_present(cpu, false);
num_processors--;
goto again;
delta = now - prev;
- if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
- delta <<= 32;
- delta >>= 32; /* sign extend */
- }
+ if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
+ delta = sign_extend64(delta, 31);
+
- local64_add(now - prev, &event->count);
+ local64_add(delta, &event->count);
}
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
again:
page = NULL;
/* CMA can be used only in the context which permits sleeping */
- if (flag & __GFP_WAIT) {
+ if (gfpflags_allow_blocking(flag)) {
page = dma_alloc_from_contiguous(dev, count, get_order(size));
if (page && page_to_phys(page) + size > dma_mask) {
dma_release_from_contiguous(dev, page, count);
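
The __GFP_WAIT conversions throughout this series are built on the new
reclaim flags and on gfpflags_allow_blocking() from include/linux/gfp.h,
which look roughly like:

	#define __GFP_RECLAIM \
		((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))

	static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
	{
		return (bool __force)(gfp_flags & __GFP_DIRECT_RECLAIM);
	}

Callers that may sleep test gfpflags_allow_blocking(); sites that must not
enter direct reclaim clear __GFP_DIRECT_RECLAIM but can keep
__GFP_KSWAPD_RECLAIM so kswapd is still woken.
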
*/
phys_id = read_apic_id();
+ set_numa_mem(local_memory_node(cpu_to_node(cpuid)));
+
/*
* the boot CPU has finished the init stage and is spinning
* on callin_map until we finish. We are free to set up this
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
- split_huge_page_pmd_mm(mm, 0xA0000, pmd);
+
+ if (pmd_trans_huge(*pmd)) {
+ struct vm_area_struct *vma = find_vma(mm, 0xA0000);
+ split_huge_pmd(vma, pmd, 0xA0000);
+ }
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush has to flush the tlb, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state. gup-fast
- * can't because it has irq disabled and
- * wait_split_huge_page() would never return as the
- * tlb flush IPI wouldn't run.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
/*
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
__flush_tlb_all();
init_task.kasan_depth = 0;
- pr_info("Kernel address sanitizer initialized\n");
+ pr_info("KernelAddressSanitizer initialized\n");
}
int __initdata numa_off;
nodemask_t numa_nodes_parsed __initdata;
+static nodemask_t numa_nodes_empty __initdata;
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
end = max(mi->blk[i].end, end);
}
- if (start >= end)
- continue;
-
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
*/
- if (end && (end - start) < NODE_MIN_SIZE)
- continue;
-
- alloc_node_data(nid);
+ if (start < end && (end - start) >= NODE_MIN_SIZE) {
+ alloc_node_data(nid);
+ } else if (IS_ENABLED(CONFIG_HAVE_MEMORYLESS_NODES)) {
+ alloc_node_data(nid);
+ node_set(nid, numa_nodes_empty);
+ }
}
/* Dump memblock with node info and return. */
*/
static void __init numa_init_array(void)
{
- int rr, i;
+ int i, rr = MAX_NUMNODES;
- rr = first_node(node_online_map);
for (i = 0; i < nr_cpu_ids; i++) {
- if (early_cpu_to_node(i) != NUMA_NO_NODE)
- continue;
+ /* Search for an onlined node with memory */
+ do {
+ if (rr != MAX_NUMNODES)
+ rr = next_node(rr, node_online_map);
+ if (rr == MAX_NUMNODES)
+ rr = first_node(node_online_map);
+ } while (node_isset(rr, numa_nodes_empty));
+
numa_set_node(i, rr);
- rr = next_node(rr, node_online_map);
- if (rr == MAX_NUMNODES)
- rr = first_node(node_online_map);
}
}
if (ret < 0)
return ret;
- for (i = 0; i < nr_cpu_ids; i++) {
- int nid = early_cpu_to_node(i);
-
- if (nid == NUMA_NO_NODE)
- continue;
- if (!node_online(nid))
- numa_clear_node(i);
- }
numa_init_array();
return 0;
{
int n, val;
int min_val = INT_MAX;
- int best_node = -1;
+ int best_node = NUMA_NO_NODE;
for_each_online_node(n) {
+ if (node_isset(n, numa_nodes_empty))
+ continue;
+
val = node_distance(node, n);
if (val < min_val) {
if (!node_online(node))
node = find_near_online_node(node);
numa_set_node(cpu, node);
+ if (node_spanned_pages(node))
+ set_cpu_numa_mem(cpu, node);
+ if (IS_ENABLED(CONFIG_HAVE_MEMORYLESS_NODES))
+ node_clear(node, numa_nodes_empty);
+ }
+
+ /* Destroy empty nodes */
+ if (IS_ENABLED(CONFIG_HAVE_MEMORYLESS_NODES)) {
+ int nid;
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+
+ for_each_node_mask(nid, numa_nodes_empty) {
+ node_set_offline(nid);
+ memblock_free(__pa(node_data[nid]), nd_size);
+ node_data[nid] = NULL;
+ }
}
}
return young;
}
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- int set;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
- (unsigned long *)pmdp);
- if (set) {
- pmd_update(vma->vm_mm, address, pmdp);
- /* need tlb flush only to serialize against gup-fast */
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- }
-}
#endif
/**
# CONFIG_SPARSEMEM_MANUAL is not set
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
-CONFIG_PAGEFLAGS_EXTENDED=y
CONFIG_SPLIT_PTLOCK_CPUS=4
# CONFIG_PHYS_ADDR_T_64BIT is not set
CONFIG_ZONE_DMA_FLAG=1
*/
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
#define MADV_NORMAL 0 /* no further special treatment */
#define MADV_RANDOM 1 /* expect random page references */
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
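
A hedged userspace sketch of the interfaces added above (325 is the x86_64
syscall number from the table above; no glibc wrapper existed when this
series was merged, and the fallback #defines mirror the uapi values in
these hunks):

	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef MLOCK_ONFAULT
	#define MLOCK_ONFAULT	0x01
	#endif
	#ifndef MCL_ONFAULT
	#define MCL_ONFAULT	4	/* generic value; some arches differ */
	#endif
	#ifndef MADV_FREE
	#define MADV_FREE	5
	#endif

	int main(void)
	{
		size_t len = 1 << 20;
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		/* Lock pages as they are faulted in, without prefaulting
		 * the whole range up front: */
		syscall(325, buf, len, MLOCK_ONFAULT);

		/* The same policy for the whole address space: */
		mlockall(MCL_CURRENT | MCL_ONFAULT);

		/* MADV_FREE marks clean anonymous pages lazily freeable;
		 * they are only reclaimed under memory pressure (locked
		 * pages are exempt, hence the munlock first): */
		munlockall();
		madvise(buf, len, MADV_FREE);
		return 0;
	}
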
page_mapcount(p));
if (!page_count(p))
rc |= TLB_INSANE;
- else if (page_mapped(p))
+ else if (page_mapcount(p))
rc |= TLB_SUSPICIOUS;
} else {
rc |= TLB_INSANE;
bvl = mempool_alloc(pool, gfp_mask);
} else {
struct biovec_slab *bvs = bvec_slabs + *idx;
- gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
+ gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
/*
* Make this allocation restricted and don't dump info on
__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
/*
- * Try a slab allocation. If this fails and __GFP_WAIT
+ * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
* is set, retry with the 1-entry mempool
*/
bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
- if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
+ if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
*idx = BIOVEC_MAX_IDX;
goto fallback;
}
* If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
* backed by the @bs's mempool.
*
- * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
- * able to allocate a bio. This is due to the mempool guarantees. To make this
- * work, callers must never allocate more than 1 bio at a time from this pool.
- * Callers that need to allocate more than 1 bio must always submit the
- * previously allocated bio for IO before attempting to allocate a new one.
- * Failure to do so can cause deadlocks under memory pressure.
+ * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
+ * always be able to allocate a bio. This is due to the mempool guarantees.
+ * To make this work, callers must never allocate more than 1 bio at a time
+ * from this pool. Callers that need to allocate more than 1 bio must always
+ * submit the previously allocated bio for IO before attempting to allocate
+ * a new one. Failure to do so can cause deadlocks under memory pressure.
*
* Note that when running under generic_make_request() (i.e. any block
* driver), bios are not submitted until after you return - see the code in
* We solve this, and guarantee forward progress, with a rescuer
* workqueue per bio_set. If we go to allocate and there are
* bios on current->bio_list, we first try the allocation
- * without __GFP_WAIT; if that fails, we punt those bios we
- * would be blocking to the rescuer workqueue before we retry
- * with the original gfp_flags.
+ * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
+ * bios we would be blocking to the rescuer workqueue before
+ * we retry with the original gfp_flags.
*/
if (current->bio_list && !bio_list_empty(current->bio_list))
- gfp_mask &= ~__GFP_WAIT;
+ gfp_mask &= ~__GFP_DIRECT_RECLAIM;
p = mempool_alloc(bs->bio_pool, gfp_mask);
if (!p && gfp_mask != saved_gfp) {
if (percpu_ref_tryget_live(&q->q_usage_counter))
return 0;
- if (!(gfp & __GFP_WAIT))
+ if (!gfpflags_allow_blocking(gfp))
return -EBUSY;
ret = wait_event_interruptible(q->mq_freeze_wq,
* @bio: bio to allocate request for (can be %NULL)
* @gfp_mask: allocation mask
*
- * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this
- * function keeps retrying under memory pressure and fails iff @q is dead.
+ * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
+ * this function keeps retrying under memory pressure and fails iff @q is dead.
*
* Must be called with @q->queue_lock held and,
* Returns ERR_PTR on failure, with @q->queue_lock held.
if (!IS_ERR(rq))
return rq;
- if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
+ if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
blk_put_rl(rl);
return rq;
}
* BUG.
*
* WARNING: When allocating/cloning a bio-chain, careful consideration should be
- * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
- * anything but the first bio in the chain. Otherwise you risk waiting for IO
- * completion of a bio that hasn't been submitted yet, thus resulting in a
- * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
- * of bio_alloc(), as that avoids the mempool deadlock.
+ * given to how you allocate bios. In particular, you cannot use
+ * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise
+ * you risk waiting for IO completion of a bio that hasn't been submitted yet,
+ * thus resulting in a deadlock. Alternatively bios should be allocated using
+ * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock.
* If possible a big IO should be split into smaller parts when allocation
* fails. Partial allocation should not be an error, or you risk a live-lock.
*/
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) {
+ if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) {
q->make_request_fn(q, bio);
{
struct io_context *ioc;
- might_sleep_if(gfp_flags & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(gfp_flags));
do {
task_lock(task);
if (tag != -1)
return tag;
- if (!(data->gfp & __GFP_WAIT))
+ if (!gfpflags_allow_blocking(data->gfp))
return -1;
bs = bt_wait_ptr(bt, hctx);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
+ blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
reserved, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
- if (!rq && (gfp & __GFP_WAIT)) {
+ if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
__blk_mq_run_hw_queue(hctx);
blk_mq_put_ctx(ctx);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
blk_mq_set_alloc_data(&alloc_data, q,
- __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
+ __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
ctx = alloc_data.ctx;
hctx = alloc_data.hctx;
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (disk_max_parts(sgp) <= 1 &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
break;
do_each_thread(g, p) {
- if (!uid_eq(task_uid(p), uid))
+ if (!uid_eq(task_uid(p), uid) ||
+ !task_pid_vnr(p))
continue;
ret = set_task_ioprio(p, ioprio);
if (ret)
break;
do_each_thread(g, p) {
- if (!uid_eq(task_uid(p), user->uid))
+ if (!uid_eq(task_uid(p), user->uid) ||
+ !task_pid_vnr(p))
continue;
tmpio = get_task_ioprio(p);
if (tmpio < 0)
}
- rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
+ rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
goto error_free_buffer;
break;
}
- if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
+ if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) {
err = DRIVER_ERROR << 24;
goto error;
}
struct request *rq;
int err;
- rq = blk_get_request(q, WRITE, __GFP_WAIT);
+ rq = blk_get_request(q, WRITE, __GFP_RECLAIM);
if (IS_ERR(rq))
return PTR_ERR(rq);
blk_rq_set_block_pc(rq);
bm_set_page_unchanged(b->bm_pages[page_nr]);
if (ctx->flags & BM_AIO_COPY_PAGES) {
- page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+ page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM);
copy_highpage(page, b->bm_pages[page_nr]);
bm_store_page_idx(page, page_nr);
} else
}
if (has_payload && data_size) {
- page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
+ page = drbd_alloc_pages(peer_device, nr_pages,
+ gfpflags_allow_blocking(gfp_mask));
if (!page)
goto fail;
}
{
struct request *rq;
- rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true);
+ rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true);
return blk_mq_rq_to_pdu(rq);
}
spin_unlock_irqrestore(&nbd->tasks_lock, flags);
if (signal_pending(current)) {
- siginfo_t info;
-
- ret = dequeue_signal_lock(current, ¤t->blocked, &info);
+ ret = kernel_dequeue_signal(NULL);
dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
task_pid_nr(current), current->comm, ret);
mutex_lock(&nbd->tx_lock);
!list_empty(&nbd->waiting_queue));
if (signal_pending(current)) {
- siginfo_t info;
- int ret;
+ int ret = kernel_dequeue_signal(NULL);
- ret = dequeue_signal_lock(current, ¤t->blocked,
- &info);
dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
task_pid_nr(current), current->comm, ret);
mutex_lock(&nbd->tx_lock);
spin_unlock_irqrestore(&nbd->tasks_lock, flags);
/* Clear maybe pending signals */
- if (signal_pending(current)) {
- siginfo_t info;
- dequeue_signal_lock(current, ¤t->blocked, &info);
- }
+ if (signal_pending(current))
+ kernel_dequeue_signal(NULL);
return 0;
}
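
The kernel_dequeue_signal() calls above use a helper this series adds to
include/linux/signal.h; together with kernel_signal_stop() (used by the
jffs2 hunk further down) it is roughly:

	static inline int kernel_dequeue_signal(siginfo_t *info)
	{
		struct task_struct *tsk = current;
		siginfo_t __info;
		int ret;

		spin_lock_irq(&tsk->sighand->siglock);
		ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
		spin_unlock_irq(&tsk->sighand->siglock);

		return ret;
	}

	static inline void kernel_signal_stop(void)
	{
		spin_lock_irq(&current->sighand->siglock);
		if (current->jobctl & JOBCTL_STOP_DEQUEUED)
			__set_current_state(TASK_STOPPED);
		spin_unlock_irq(&current->sighand->siglock);

		schedule();
	}

Passing NULL lets kthreads that only care about the signal number drop
their on-stack siginfo_t.
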
goto err_out;
tmp->bi_bdev = NULL;
- gfpmask &= ~__GFP_WAIT;
+ gfpmask &= ~__GFP_DIRECT_RECLAIM;
tmp->bi_next = NULL;
if (!new_chain)
struct request *rq;
int err = 0;
- rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT);
+ rq = blk_get_request(disk->gd->queue, READ, __GFP_RECLAIM);
if (IS_ERR(rq))
return PTR_ERR(rq);
int ret = 0;
rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
- WRITE : READ, __GFP_WAIT);
+ WRITE : READ, __GFP_RECLAIM);
if (IS_ERR(rq))
return PTR_ERR(rq);
blk_rq_set_block_pc(rq);
if (cgc->buflen) {
ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
- __GFP_WAIT);
+ __GFP_RECLAIM);
if (ret)
goto out;
}
meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
}
-static inline int is_partial_io(struct bio_vec *bvec)
+static inline bool is_partial_io(struct bio_vec *bvec)
{
return bvec->bv_len != PAGE_SIZE;
}
/*
* Check if request is within bounds and aligned on zram logical blocks.
*/
-static inline int valid_io_request(struct zram *zram,
+static inline bool valid_io_request(struct zram *zram,
sector_t start, unsigned int size)
{
u64 end, bound;
/* unaligned request */
if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
- return 0;
+ return false;
if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
- return 0;
+ return false;
end = start + (size >> SECTOR_SHIFT);
bound = zram->disksize >> SECTOR_SHIFT;
/* out of range */
if (unlikely(start >= bound || end > bound || start > end))
- return 0;
+ return false;
/* I/O request is valid */
- return 1;
+ return true;
}
static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
} while (old_max != cur_max);
}
-static int page_zero_filled(void *ptr)
+static bool page_zero_filled(void *ptr)
{
unsigned int pos;
unsigned long *page;
for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
if (page[pos])
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static void handle_zero_page(struct bio_vec *bvec)
struct zram *zram = dev_to_zram(dev);
size_t sz;
+ if (!zcomp_available_algorithm(buf))
+ return -EINVAL;
+
down_write(&zram->init_lock);
if (init_done(zram)) {
up_write(&zram->init_lock);
if (sz > 0 && zram->compressor[sz - 1] == '\n')
zram->compressor[sz - 1] = 0x00;
- if (!zcomp_available_algorithm(zram->compressor))
- len = -EINVAL;
-
up_write(&zram->init_lock);
return len;
}
}
alloced_pages = zs_get_total_pages(meta->mem_pool);
+ update_used_max(zram, alloced_pages);
+
if (zram->limit_pages && alloced_pages > zram->limit_pages) {
zs_free(meta->mem_pool, handle);
ret = -ENOMEM;
goto out;
}
- update_used_max(zram, alloced_pages);
-
cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
if (group)
return netlink_broadcast(dev->nls, skb, portid, group,
gfp_mask);
- return netlink_unicast(dev->nls, skb, portid, !(gfp_mask&__GFP_WAIT));
+ return netlink_unicast(dev->nls, skb, portid,
+ !gfpflags_allow_blocking(gfp_mask));
}
EXPORT_SYMBOL_GPL(cn_netlink_send_mult);
static int add_client_resource(struct client *client,
struct client_resource *resource, gfp_t gfp_mask)
{
- bool preload = !!(gfp_mask & __GFP_WAIT);
+ bool preload = gfpflags_allow_blocking(gfp_mask);
unsigned long flags;
int ret;
* __GFP_DMA32 to be set in mapping_gfp_mask(inode->i_mapping)
* so shmem can relocate pages during swapin if required.
*/
- BUG_ON((mapping_gfp_mask(mapping) & __GFP_DMA32) &&
+ BUG_ON(mapping_gfp_constraint(mapping, __GFP_DMA32) &&
(page_to_pfn(p) >= 0x00100000UL));
}
#include "drm_legacy.h"
#include "drm_internal.h"
-static int drm_notifier(void *priv);
-
static int drm_lock_take(struct drm_lock_data *lock_data, unsigned int context);
/**
* really probably not the correct answer but lets us debug xkb
* xserver for now */
if (!file_priv->is_master) {
- sigemptyset(&dev->sigmask);
- sigaddset(&dev->sigmask, SIGSTOP);
- sigaddset(&dev->sigmask, SIGTSTP);
- sigaddset(&dev->sigmask, SIGTTIN);
- sigaddset(&dev->sigmask, SIGTTOU);
dev->sigdata.context = lock->context;
dev->sigdata.lock = master->lock.hw_lock;
- block_all_signals(drm_notifier, dev, &dev->sigmask);
}
if (dev->driver->dma_quiescent && (lock->flags & _DRM_LOCK_QUIESCENT))
/* FIXME: Should really bail out here. */
}
- unblock_all_signals();
return 0;
}
return 0;
}
-/**
- * If we get here, it means that the process has called DRM_IOCTL_LOCK
- * without calling DRM_IOCTL_UNLOCK.
- *
- * If the lock is not held, then let the signal proceed as usual. If the lock
- * is held, then set the contended flag and keep the signal blocked.
- *
- * \param priv pointer to a drm_device structure.
- * \return one if the signal should be delivered normally, or zero if the
- * signal should be blocked.
- */
-static int drm_notifier(void *priv)
-{
- struct drm_device *dev = priv;
- struct drm_hw_lock *lock = dev->sigdata.lock;
- unsigned int old, new, prev;
-
- /* Allow signal delivery if lock isn't held */
- if (!lock || !_DRM_LOCK_IS_HELD(lock->lock)
- || _DRM_LOCKING_CONTEXT(lock->lock) != dev->sigdata.context)
- return 1;
-
- /* Otherwise, set flag to force call to
- drmUnlock */
- do {
- old = lock->lock;
- new = old | _DRM_LOCK_CONT;
- prev = cmpxchg(&lock->lock, old, new);
- } while (prev != old);
- return 0;
-}
-
/**
* This function returns immediately and takes the hw lock
* with the kernel context if it is free, otherwise it gets the highest priority when and if
* Fail silently without starting the shrinker
*/
mapping = file_inode(obj->base.filp)->i_mapping;
- gfp = mapping_gfp_mask(mapping);
- gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD;
- gfp &= ~(__GFP_IO | __GFP_WAIT);
+ gfp = mapping_gfp_constraint(mapping, ~(__GFP_IO | __GFP_RECLAIM));
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
sg = st->sgl;
st->nents = 0;
for (i = 0; i < page_count; i++) {
struct request *rq;
int error;
- rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+ rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
rq->cmd_type = REQ_TYPE_DRV_PRIV;
rq->special = (char *)pc;
struct request *rq;
int error;
- rq = blk_get_request(drive->queue, write, __GFP_WAIT);
+ rq = blk_get_request(drive->queue, write, __GFP_RECLAIM);
memcpy(rq->cmd, cmd, BLK_MAX_CDB);
rq->cmd_type = REQ_TYPE_ATA_PC;
struct request *rq;
int ret;
- rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+ rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
rq->cmd_type = REQ_TYPE_DRV_PRIV;
rq->cmd_flags = REQ_QUIET;
ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
if (!(setting->flags & DS_SYNC))
return setting->set(drive, arg);
- rq = blk_get_request(q, READ, __GFP_WAIT);
+ rq = blk_get_request(q, READ, __GFP_RECLAIM);
rq->cmd_type = REQ_TYPE_DRV_PRIV;
rq->cmd_len = 5;
rq->cmd[0] = REQ_DEVSET_EXEC;
if (drive->special_flags & IDE_SFLAG_SET_MULTMODE)
return -EBUSY;
- rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+ rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
drive->mult_req = arg;
if (NULL == (void *) arg) {
struct request *rq;
- rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+ rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
err = blk_execute_rq(drive->queue, NULL, rq, 0);
blk_put_request(rq);
struct request *rq;
int ret = 0;
- rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+ rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
rq->cmd_type = REQ_TYPE_DRV_PRIV;
rq->cmd_len = 1;
rq->cmd[0] = REQ_DRIVE_RESET;
}
spin_unlock_irq(&hwif->lock);
- rq = blk_get_request(q, READ, __GFP_WAIT);
+ rq = blk_get_request(q, READ, __GFP_RECLAIM);
rq->cmd[0] = REQ_PARK_HEADS;
rq->cmd_len = 1;
rq->cmd_type = REQ_TYPE_DRV_PRIV;
}
memset(&rqpm, 0, sizeof(rqpm));
- rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+ rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND;
rq->special = &rqpm;
rqpm.pm_step = IDE_PM_START_SUSPEND;
}
memset(&rqpm, 0, sizeof(rqpm));
- rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+ rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
rq->cmd_type = REQ_TYPE_ATA_PM_RESUME;
rq->cmd_flags |= REQ_PREEMPT;
rq->special = &rqpm;
BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
BUG_ON(size < 0 || size % tape->blk_size);
- rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+ rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
rq->cmd_type = REQ_TYPE_DRV_PRIV;
rq->cmd[13] = cmd;
rq->rq_disk = tape->disk;
if (size) {
ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
- __GFP_WAIT);
+ __GFP_RECLAIM);
if (ret)
goto out_put;
}
int error;
int rw = !(cmd->tf_flags & IDE_TFLAG_WRITE) ? READ : WRITE;
- rq = blk_get_request(drive->queue, rw, __GFP_WAIT);
+ rq = blk_get_request(drive->queue, rw, __GFP_RECLAIM);
rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
/*
*/
if (nsect) {
error = blk_rq_map_kern(drive->queue, rq, buf,
- nsect * SECTOR_SIZE, __GFP_WAIT);
+ nsect * SECTOR_SIZE, __GFP_RECLAIM);
if (error)
goto put_req;
}
static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
{
- bool preload = !!(gfp_mask & __GFP_WAIT);
+ bool preload = gfpflags_allow_blocking(gfp_mask);
unsigned long flags;
int ret, id;
* heavy filesystem activity makes these fail, and we can
* use compound pages.
*/
- gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+ gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
egrcnt = rcd->rcvegrcnt;
egroff = rcd->rcvegr_tid_base;
page = alloc_pages(flag | __GFP_NOWARN, get_order(size));
if (!page) {
- if (!(flag & __GFP_WAIT))
+ if (!gfpflags_allow_blocking(flag))
return NULL;
page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
flags |= GFP_DMA32;
}
- if (flags & __GFP_WAIT) {
+ if (gfpflags_allow_blocking(flags)) {
unsigned int count = size >> PAGE_SHIFT;
page = dma_alloc_from_contiguous(dev, count, order);
struct bio_vec *bvec;
retry:
- if (unlikely(gfp_mask & __GFP_WAIT))
+ if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_lock(&cc->bio_alloc_lock);
clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
if (!page) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
- gfp_mask |= __GFP_WAIT;
+ gfp_mask |= __GFP_DIRECT_RECLAIM;
goto retry;
}
}
return_clone:
- if (unlikely(gfp_mask & __GFP_WAIT))
+ if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_unlock(&cc->bio_alloc_lock);
return clone;
*pages = NULL;
do {
- pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY);
+ pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
if (unlikely(!pl)) {
/* Use reserved pages */
pl = kc->pages;
solo_enc->vidq.ops = &solo_enc_video_qops;
solo_enc->vidq.mem_ops = &vb2_dma_sg_memops;
solo_enc->vidq.drv_priv = solo_enc;
- solo_enc->vidq.gfp_flags = __GFP_DMA32;
+ solo_enc->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
solo_enc->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
solo_enc->vidq.buf_struct_size = sizeof(struct solo_vb2_buf);
solo_enc->vidq.lock = &solo_enc->lock;
solo_dev->vidq.mem_ops = &vb2_dma_contig_memops;
solo_dev->vidq.drv_priv = solo_dev;
solo_dev->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
- solo_dev->vidq.gfp_flags = __GFP_DMA32;
+ solo_dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
solo_dev->vidq.buf_struct_size = sizeof(struct solo_vb2_buf);
solo_dev->vidq.lock = &solo_dev->lock;
ret = vb2_queue_init(&solo_dev->vidq);
dev->vidq.ops = &tw68_video_qops;
dev->vidq.mem_ops = &vb2_dma_sg_memops;
dev->vidq.drv_priv = dev;
- dev->vidq.gfp_flags = __GFP_DMA32;
+ dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
dev->vidq.buf_struct_size = sizeof(struct tw68_buf);
dev->vidq.lock = &dev->lock;
dev->vidq.min_buffers_needed = 2;
mq->mmr_blade = uv_cpu_to_blade_id(cpu);
- nid = cpu_to_node(cpu);
+ nid = cpu_to_mem(cpu);
page = __alloc_pages_node(nid,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
pg_order);
/*
* Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
- * allow wait (__GFP_WAIT) for NOSLEEP page allocations. Use
+ * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use
* __GFP_NOWARN, to suppress page allocation failure warnings.
*/
#define VMW_PAGE_ALLOC_NOSLEEP (__GFP_HIGHMEM|__GFP_NOWARN)
*/
void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
{
- gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
- __GFP_NORETRY | __GFP_NO_KSWAPD;
+ gfp_t flags = __GFP_NOWARN | __GFP_DIRECT_RECLAIM | __GFP_NORETRY;
size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
void *kbuf;
{
if (fp->rx_frag_size) {
/* GFP_KERNEL allocations are used only during initialization */
- if (unlikely(gfp_mask & __GFP_WAIT))
+ if (unlikely(gfpflags_allow_blocking(gfp_mask)))
return (void *)__get_free_page(gfp_mask);
return netdev_alloc_frag(fp->rx_frag_size);
req->special = (void *)0;
if (buffer && bufflen) {
- ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
+ ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_RECLAIM);
if (ret)
goto out;
} else if (ubuffer && bufflen) {
- ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
+ ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_RECLAIM);
if (ret)
goto out;
bio = req->bio;
struct request *req;
/*
- * blk_get_request with GFP_KERNEL (__GFP_WAIT) sleeps until a
+ * blk_get_request with GFP_KERNEL (__GFP_RECLAIM) sleeps until a
* request becomes available
*/
req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL);
int write = (data_direction == DMA_TO_DEVICE);
int ret = DRIVER_ERROR << 24;
- req = blk_get_request(sdev->request_queue, write, __GFP_WAIT);
+ req = blk_get_request(sdev->request_queue, write, __GFP_RECLAIM);
if (IS_ERR(req))
return ret;
blk_rq_set_block_pc(req);
if (bufflen && blk_rq_map_kern(sdev->request_queue, req,
- buffer, bufflen, __GFP_WAIT))
+ buffer, bufflen, __GFP_RECLAIM))
goto out;
req->cmd_len = COMMAND_SIZE(cmd[0]);
#include "ion_priv.h"
static gfp_t high_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN |
- __GFP_NORETRY) & ~__GFP_WAIT;
+ __GFP_NORETRY) & ~__GFP_DIRECT_RECLAIM;
static gfp_t low_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN);
static const unsigned int orders[] = {8, 4, 0};
static const int num_orders = ARRAY_SIZE(orders);
do { \
LASSERT(!in_interrupt() || \
((size) <= LIBCFS_VMALLOC_SIZE && \
- ((mask) & __GFP_WAIT) == 0)); \
+ !gfpflags_allow_blocking(mask))); \
} while (0)
#define LIBCFS_ALLOC_POST(ptr, size) \
* heavy filesystem activity makes these fail, and we can
* use compound pages.
*/
- gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+ gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
/*
* The minimum size of the eager buffers is a groups of MTU-sized
* heavy filesystem activity makes these fail, and we can
* use compound pages.
*/
- gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+ gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
egrcnt = dd->ipath_rcvegrcnt;
/* TID number offset for this port */
static void handle_exception(struct fsg_common *common)
{
- siginfo_t info;
int i;
struct fsg_buffhd *bh;
enum fsg_state old_state;
* into a high-priority EXIT exception.
*/
for (;;) {
- int sig =
- dequeue_signal_lock(current, ¤t->blocked, &info);
+ int sig = kernel_dequeue_signal(NULL);
if (!sig)
break;
if (sig != SIGUSR1) {
{
struct u132 *u132 = hcd_to_u132(hcd);
if (irqs_disabled()) {
- if (__GFP_WAIT & mem_flags) {
+ if (gfpflags_allow_blocking(mem_flags)) {
printk(KERN_ERR "invalid context for function that might sleep\n");
return -EINVAL;
}
* below the first 16MB.
*/
- flags = __GFP_DMA | __GFP_HIGH;
+ flags = __GFP_DMA | __GFP_HIGH | __GFP_KSWAPD_RECLAIM;
va->logical =
__get_free_pages(flags, --max_order);
} while (va->logical == 0 && max_order > min_order);
if (res < 0 && fl->fl_type != F_UNLCK) {
fl_type = fl->fl_type;
fl->fl_type = F_UNLCK;
- res = locks_lock_file_wait(filp, fl);
+ /* Even if this fails we want to return the remote error */
+ locks_lock_file_wait(filp, fl);
fl->fl_type = fl_type;
}
out:
goto next;
}
- page = __page_cache_alloc(mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ page = __page_cache_alloc(mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
break;
- if (add_to_page_cache_lru(page, mapping, pg_index,
- GFP_NOFS)) {
+ if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
page_cache_release(page);
goto next;
}
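
mapping_gfp_constraint() is a small new helper in include/linux/pagemap.h
that makes these mask-restriction sites explicit; it is essentially:

	static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
						   gfp_t gfp_mask)
	{
		return mapping_gfp_mask(mapping) & gfp_mask;
	}

Behavior is unchanged: callers still AND the mapping's mask with a
constraint such as ~__GFP_FS or GFP_KERNEL.
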
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
- return mapping_gfp_mask(mapping) & ~__GFP_FS;
+ return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
/* extent-tree.c */
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
/* readahead state */
- INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
bits |= EXTENT_FIRST_DELALLOC;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
prealloc = alloc_extent_state(mask);
BUG_ON(!prealloc);
}
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
btrfs_debug_check_extent_io_range(tree, start, end);
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Best effort, don't worry if extent state allocation fails
* here for the first iteration. We might have a cached state
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
first_iteration = false;
goto again;
u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
- if ((mask & __GFP_WAIT) &&
+ if (gfpflags_allow_blocking(mask) &&
page->mapping->host->i_size > 16 * 1024 * 1024) {
u64 len;
while (start <= end) {
}
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) &
- ~(__GFP_FS | __GFP_HIGHMEM));
+ mapping_gfp_constraint(inode->i_mapping,
+ ~(__GFP_FS | __GFP_HIGHMEM)));
return inode;
}
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
return dev;
}
int ret = 0; /* Will call free_more_memory() */
gfp_t gfp_mask;
- gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+ gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
/*
* XXX: __getblk_slow() can not really deal with failure and
#define CACHEFILES_DEBUG_KLEAVE 2
#define CACHEFILES_DEBUG_KDEBUG 4
-#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC)
/*
* node records
int ret1;
struct address_space *mapping = inode->i_mapping;
struct page *page = find_or_create_page(mapping, 0,
- mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page) {
ret = VM_FAULT_OOM;
goto out;
if (i_size_read(inode) == 0)
return;
page = find_or_create_page(mapping, 0,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
return;
if (PageUptodate(page)) {
struct page *page, *tpage;
unsigned int expected_index;
int rc;
- gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
INIT_LIST_HEAD(tmplist);
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, gfp);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
return ispipe;
}
-static int zap_process(struct task_struct *start, int exit_code)
+static int zap_process(struct task_struct *start, int exit_code, int flags)
{
struct task_struct *t;
int nr = 0;
+ /* ignore all signals except SIGKILL, see prepare_signal() */
+ start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
- t = start;
- do {
+ for_each_thread(start, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
if (t != current && t->mm) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
nr++;
}
- } while_each_thread(start, t);
+ }
return nr;
}
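
For reference, for_each_thread() (include/linux/sched.h) walks the thread
group through signal->thread_head and so keeps working when the group
leader exits, unlike the old do/while_each_thread() pattern; roughly:

	#define __for_each_thread(signal, t)	\
		list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)

	#define for_each_thread(p, t)		\
		__for_each_thread((p)->signal, t)
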
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
mm->core_state = core_state;
- nr = zap_process(tsk, exit_code);
tsk->signal->group_exit_task = tsk;
- /* ignore all signals except SIGKILL, see prepare_signal() */
- tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+ nr = zap_process(tsk, exit_code, 0);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
}
spin_unlock_irq(&tsk->sighand->siglock);
continue;
if (g->flags & PF_KTHREAD)
continue;
- p = g;
- do {
- if (p->mm) {
- if (unlikely(p->mm == mm)) {
- lock_task_sighand(p, &flags);
- nr += zap_process(p, exit_code);
- p->signal->flags = SIGNAL_GROUP_EXIT;
- unlock_task_sighand(p, &flags);
- }
- break;
+
+ for_each_thread(g, p) {
+ if (unlikely(!p->mm))
+ continue;
+ if (unlikely(p->mm == mm)) {
+ lock_task_sighand(p, &flags);
+ nr += zap_process(p, exit_code,
+ SIGNAL_GROUP_EXIT);
+ unlock_task_sighand(p, &flags);
}
- } while_each_thread(g, p);
+ break;
+ }
}
rcu_read_unlock();
done:
/*
* bio_alloc() is guaranteed to return a bio when called with
- * __GFP_WAIT and we request a valid number of vectors.
+ * __GFP_RECLAIM and we request a valid number of vectors.
*/
bio = bio_alloc(GFP_KERNEL, nr_vecs);
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ if (test_opt(inode->i_sb, BARRIER))
+ ret = generic_file_fsync(file, start, end, datasync);
+ else
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
int err = 0;
page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
if (!page)
return -ENOMEM;
page = list_entry(pages->prev, struct page, lru);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
- GFP_KERNEL & mapping_gfp_mask(mapping)))
+ mapping_gfp_constraint(mapping, GFP_KERNEL)))
goto next_page;
}
return 0;
if (journal)
return jbd2_journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_WAIT);
+ wait & ~__GFP_DIRECT_RECLAIM);
return try_to_free_buffers(page);
}
return dclus;
}
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create)
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int cluster, offset;
+
+ cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+ offset = sector & (sbi->sec_per_clus - 1);
+ cluster = fat_bmap_cluster(inode, cluster);
+ if (cluster < 0)
+ return cluster;
+ else if (cluster) {
+ *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+ *mapped_blocks = sbi->sec_per_clus - offset;
+ if (*mapped_blocks > last_block - sector)
+ *mapped_blocks = last_block - sector;
+ }
+
+ return 0;
+}
+
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+ sector_t *last_block, int create)
+{
+ struct super_block *sb = inode->i_sb;
const unsigned long blocksize = sb->s_blocksize;
const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+ if (sector >= *last_block) {
+ if (!create)
+ return 1;
+
+ /*
+ * ->mmu_private can be accessed only on the allocation path.
+ * (caller must hold ->i_mutex)
+ */
+ *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ if (sector >= *last_block)
+ return 1;
+ }
+
+ return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+ unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
sector_t last_block;
- int cluster, offset;
*phys = 0;
*mapped_blocks = 0;
return 0;
}
- last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
- if (sector >= last_block) {
- if (!create)
+ if (!from_bmap) {
+ if (is_exceed_eof(inode, sector, &last_block, create))
return 0;
-
- /*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
- */
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
- >> blocksize_bits;
+ } else {
+ last_block = inode->i_blocks >>
+ (inode->i_sb->s_blocksize_bits - 9);
if (sector >= last_block)
return 0;
}
- cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
- offset = sector & (sbi->sec_per_clus - 1);
- cluster = fat_bmap_cluster(inode, cluster);
- if (cluster < 0)
- return cluster;
- else if (cluster) {
- *phys = fat_clus_to_blknr(sbi, cluster) + offset;
- *mapped_blocks = sbi->sec_per_clus - offset;
- if (*mapped_blocks > last_block - sector)
- *mapped_blocks = last_block - sector;
- }
- return 0;
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ phys);
}
*bh = NULL;
iblock = *pos >> sb->s_blocksize_bits;
- err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+ err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
if (err || !phys)
return -1; /* beyond EOF or error */
extern void fat_cache_inval_inode(struct inode *inode);
extern int fat_get_cluster(struct inode *inode, int cluster,
int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create);
+ unsigned long *mapped_blocks, int create, bool from_bmap);
/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
{
return hash_32(logstart, FAT_HASH_BITS);
}
+extern int fat_add_cluster(struct inode *inode);
/* fat/misc.c */
extern __printf(3, 4) __cold
#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from the sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ loff_t ondisksize; /* block aligned on-disk size in bytes */
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* No support for dir */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ ondisksize = inode->i_blocks << 9;
+ if ((offset + len) <= ondisksize)
+ goto error;
+
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - ondisksize;
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation. We are not zeroing out the clusters */
+ while (nr_cluster-- > 0) {
+ err = fat_add_cluster(inode);
+ if (err)
+ goto error;
+ }
+ } else {
+ if ((offset + len) <= i_size_read(inode))
+ goto error;
+
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ }
+
+error:
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
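
A hedged userspace sketch exercising this path (the mount point and file
name are illustrative only):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>
	#include <linux/falloc.h>

	int main(void)
	{
		int fd = open("/mnt/fat/testfile", O_RDWR | O_CREAT, 0644);

		/* KEEP_SIZE: allocate 1 MiB of clusters without changing
		 * i_size and without zeroing them; unwritten blocks may be
		 * released again at inode eviction (see fat_free_eofblocks
		 * below). */
		fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);

		/* Without the flag this is an expanding, zeroing truncate. */
		fallocate(fd, 0, 0, 1 << 20);

		close(fd);
		return 0;
	}
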
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
},
};
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
{
int err, cluster;
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
unsigned long mapped_blocks;
- sector_t phys;
+ sector_t phys, last_block;
int err, offset;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
if (phys) {
return -EIO;
}
+ last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
- if (!offset) {
+ /*
+ * Allocate a new cluster only when both hold:
+ * 1) there are no more available blocks, and
+ * 2) iblock is not part of an already fallocated region.
+ */
+ if (!offset && !(iblock < last_block)) {
/* TODO: multiple cluster allocation would be desirable. */
err = fat_add_cluster(inode);
if (err)
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
return ret;
}
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int err;
+ sector_t bmap;
+ unsigned long mapped_blocks;
+
+ BUG_ON(create != 0);
+
+ err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true);
+ if (err)
+ return err;
+
+ if (bmap) {
+ map_bh(bh_result, sb, bmap);
+ max_blocks = min(mapped_blocks, max_blocks);
+ }
+
+ bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+
+ return 0;
+}
+
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
sector_t blocknr;
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
- blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
EXPORT_SYMBOL_GPL(fat_build_inode);
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+ /* Release unwritten fallocated blocks on inode eviction. */
+ if ((inode->i_blocks << 9) >
+ round_up(MSDOS_I(inode)->mmu_private,
+ MSDOS_SB(inode->i_sb)->cluster_size)) {
+ int err;
+
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/i_logstart
+ * for the zero-byte file. So, restore the original
+ * state during evict and commit it to avoid
+ * any corruption on the next access to the cluster
+ * chain for the file.
+ */
+ err = __fat_write_inode(inode, inode_needs_sync(inode));
+ if (err) {
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused "
+ "fallocated blocks, inode could be "
+ "corrupted. Please run fsck");
+ }
+
+ }
+}
+
static void fat_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
- }
+ } else
+ fat_free_eofblocks(inode);
+
invalidate_inode_buffers(inode);
clear_inode(inode);
fat_cache_inval_inode(inode);
iput(old_inode);
old_inode = inode;
- filemap_fdatawait(mapping);
+ /*
+ * We keep the error status of individual mapping so that
+ * applications can catch the writeback error using fsync(2).
+ * See filemap_fdatawait_keep_errors() for details.
+ */
+ filemap_fdatawait_keep_errors(mapping);
cond_resched();
/* radix tree insertion won't use the preallocation pool unless it's
* told it may not wait */
- INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
switch (cookie->def->type) {
case FSCACHE_COOKIE_TYPE_INDEX:
/*
* decide whether a page can be released, possibly by cancelling a store to it
- * - we're allowed to sleep if __GFP_WAIT is flagged
+ * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged
*/
bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
struct page *page,
* allocator as the work threads writing to the cache may all end up
* sleeping on memory allocation, so we may need to impose a timeout
* too. */
- if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
+ if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) {
fscache_stat(&fscache_n_store_vmscan_busy);
return false;
}
_debug("fscache writeout timeout page: %p{%lx}",
page, page->index);
- gfp &= ~__GFP_WAIT;
+ gfp &= ~__GFP_DIRECT_RECLAIM;
goto try_again;
}
EXPORT_SYMBOL(__fscache_maybe_release_page);
delete_from_page_cache(page);
}
+static inline void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+ struct vm_area_struct *vma;
+
+ /*
+ * end == 0 indicates that the entire range after
+ * start should be unmapped.
+ */
+ vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+ unsigned long v_offset;
+
+ /*
+ * Can the expression below overflow on 32-bit arches?
+ * No, because the interval tree returns us only those vmas
+ * which overlap the truncated area starting at pgoff,
+ * and no vma on a 32-bit arch can span beyond the 4GB.
+ */
+ if (vma->vm_pgoff < start)
+ v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+ else
+ v_offset = 0;
+
+ if (end) {
+ end = ((end - start) << PAGE_SHIFT) +
+ vma->vm_start + v_offset;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ } else
+ end = vma->vm_end;
+
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
+ }
+}
/*
* remove_inode_hugepages handles two distinct cases: truncation and hole
* punch. There are subtle differences in operation for each case.
-
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
u32 hash;
+ bool rsv_on_error;
hash = hugetlb_fault_mutex_hash(h, current->mm,
&pseudo_vma,
mapping, next, 0);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ /*
+ * If page is mapped, it was faulted in after being
+ * unmapped in caller. Unmap (again) now after taking
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the page.
+ */
+ if (page_mapped(page)) {
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ next * pages_per_huge_page(h),
+ (next + 1) * pages_per_huge_page(h));
+ }
+
lock_page(page);
if (page->index >= end) {
unlock_page(page);
}
/*
- * If page is mapped, it was faulted in after being
- * unmapped. Do nothing in this race case. In the
- * normal case page is not mapped.
+ * We must free the huge page and remove from page
+ * cache (remove_huge_page) BEFORE removing the
+ * region/reserve map (hugetlb_unreserve_pages).
+ * In rare out of memory conditions, removal of the
+ * region/reserve map could fail. Before freeing
+ * the page, note PagePrivate which is used in case
+ * of error.
*/
- if (!page_mapped(page)) {
- bool rsv_on_error = !PagePrivate(page);
- /*
- * We must free the huge page and remove
- * from page cache (remove_huge_page) BEFORE
- * removing the region/reserve map
- * (hugetlb_unreserve_pages). In rare out
- * of memory conditions, removal of the
- * region/reserve map could fail. Before
- * free'ing the page, note PagePrivate which
- * is used in case of error.
- */
- remove_huge_page(page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(
- inode, next,
- next + 1, 1)))
- hugetlb_fix_reserve_counts(
- inode, rsv_on_error);
- }
+ rsv_on_error = !PagePrivate(page);
+ remove_huge_page(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode,
+ next, next + 1,
+ 1)))
+ hugetlb_fix_reserve_counts(inode,
+ rsv_on_error);
}
if (page->index > next)
clear_inode(inode);
}
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
- struct vm_area_struct *vma;
-
- /*
- * end == 0 indicates that the entire range after
- * start should be unmapped.
- */
- vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
- unsigned long v_offset;
-
- /*
- * Can the expression below overflow on 32-bit arches?
- * No, because the interval tree returns us only those vmas
- * which overlap the truncated area starting at pgoff,
- * and no vma on a 32-bit arch can span beyond the 4GB.
- */
- if (vma->vm_pgoff < start)
- v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
- else
- v_offset = 0;
-
- if (end) {
- end = ((end - start) << PAGE_SHIFT) +
- vma->vm_start + v_offset;
- if (end > vma->vm_end)
- end = vma->vm_end;
- } else
- end = vma->vm_end;
-
- unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
- }
-}
-
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
{
struct hstate *h = hstate_inode(inode);
loff_t hpage_size = huge_page_size(h);
+ unsigned long hpage_shift = huge_page_shift(h);
loff_t hole_start, hole_end;
/*
if (hole_end > hole_start) {
struct address_space *mapping = inode->i_mapping;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(hugetlb_falloc_waitq);
+ /*
+ * Page faults on the area to be hole punched must be stopped
+		 * during the operation. Initialize the structure and have
+		 * inode->i_private point to it.
+ */
+ struct hugetlb_falloc hugetlb_falloc = {
+ .waitq = &hugetlb_falloc_waitq,
+ .start = hole_start >> hpage_shift,
+ .end = hole_end >> hpage_shift
+ };
mutex_lock(&inode->i_mutex);
+
+ /*
+ * inode->i_private will be checked in the page fault path.
+		 * The locking ensures that all writes to the structure are
+ * complete before assigning to i_private. A fault on another
+ * CPU will see the fully initialized structure.
+ */
+ spin_lock(&inode->i_lock);
+ inode->i_private = &hugetlb_falloc;
+ spin_unlock(&inode->i_lock);
+
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
hugetlb_vmdelete_list(&mapping->i_mmap,
hole_end >> PAGE_SHIFT);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, hole_start, hole_end);
+
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ wake_up_all(&hugetlb_falloc_waitq);
+ spin_unlock(&inode->i_lock);
+
mutex_unlock(&inode->i_mutex);
}
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
inode->i_ctime = CURRENT_TIME;
- spin_lock(&inode->i_lock);
- inode->i_private = NULL;
- spin_unlock(&inode->i_lock);
out:
mutex_unlock(&inode->i_mutex);
return error;
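
The consumer of inode->i_private is the hugetlb page-fault path, which is not part of this hunk. As a sketch of the expected check (the fields follow the hugetlb_falloc initializer above, but the function name and its exact placement and locking inside the fault path are assumptions here):

	#include <linux/fs.h>
	#include <linux/wait.h>

	/* hypothetical helper: wait if a hole punch covering @idx is running */
	static void hugetlb_wait_for_hole_punch(struct inode *inode, pgoff_t idx)
	{
		struct hugetlb_falloc *hf;

		spin_lock(&inode->i_lock);
		hf = inode->i_private;
		if (hf && idx >= hf->start && idx < hf->end) {
			DEFINE_WAIT(wait);

			prepare_to_wait(hf->waitq, &wait, TASK_UNINTERRUPTIBLE);
			spin_unlock(&inode->i_lock);
			schedule();		/* woken by wake_up_all() above */
			finish_wait(hf->waitq, &wait);
			return;
		}
		spin_unlock(&inode->i_lock);
	}
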
* @journal: journal for operation
* @page: to try and free
* @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS are set, we wait for commit
+ * code to release the buffers.
*
*
* For all the buffers on this page,
siginitset(&hupmask, sigmask(SIGHUP));
allow_signal(SIGKILL);
allow_signal(SIGSTOP);
- allow_signal(SIGCONT);
allow_signal(SIGHUP);
c->gc_task = current;
/* Put_super will send a SIGKILL and then wait on the sem.
*/
while (signal_pending(current) || freezing(current)) {
- siginfo_t info;
unsigned long signr;
if (try_to_freeze())
goto again;
-	signr = dequeue_signal_lock(current, &current->blocked, &info);
+ signr = kernel_dequeue_signal(NULL);
switch(signr) {
case SIGSTOP:
jffs2_dbg(1, "%s(): SIGSTOP received\n",
__func__);
- set_current_state(TASK_STOPPED);
- schedule();
+ kernel_signal_stop();
break;
case SIGKILL:
if ((c->flash_size % c->sector_size) != 0) {
c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
- };
+ }
c->wbuf_ofs = 0xFFFFFFFF;
c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
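
For reference, the two helpers substituted above encapsulate boilerplate that kernel threads used to open-code. A rough sketch of their behaviour (an approximation, not the exact kernel/signal.c definitions; the real kernel_dequeue_signal() also accepts a caller-supplied siginfo):

	#include <linux/sched.h>
	#include <linux/signal.h>

	/* sketch: dequeue one pending, unblocked signal for current */
	static int kernel_dequeue_signal_sketch(void)
	{
		struct task_struct *task = current;
		siginfo_t info;
		int ret;

		spin_lock_irq(&task->sighand->siglock);
		ret = dequeue_signal(task, &task->blocked, &info);
		spin_unlock_irq(&task->sighand->siglock);
		return ret;
	}

	/* sketch: park the thread in the stopped state until SIGCONT */
	static void kernel_signal_stop_sketch(void)
	{
		set_current_state(TASK_STOPPED);
		schedule();
	}
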
unsigned int max_pages;
int i;
- max_pages = min(nr_pages, BIO_MAX_PAGES);
+ max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOFS, max_pages);
BUG_ON(!bio);
unsigned int max_pages;
int i;
- max_pages = min(nr_pages, BIO_MAX_PAGES);
+ max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOFS, max_pages);
BUG_ON(!bio);
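
The min() to min_t() changes above are forced by type checking: nr_pages is a size_t while BIO_MAX_PAGES is a plain int, and the kernel's min() emits a compile-time warning on mismatched types. min_t() casts both sides to the named type first. Simplified forms of the two macros (abbreviated from include/linux/kernel.h):

	#define min(x, y) ({				\
		typeof(x) _min1 = (x);			\
		typeof(y) _min2 = (y);			\
		(void) (&_min1 == &_min2);		/* warns if types differ */ \
		_min1 < _min2 ? _min1 : _min2; })

	#define min_t(type, x, y) ({			\
		type _min1 = (type)(x);			\
		type _min2 = (type)(y);			\
		_min1 < _min2 ? _min1 : _min2; })
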
filler_t *filler = super->s_devops->readpage;
struct page *page;
- BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+ BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS));
if (use_filler)
page = read_cache_page(mapping, index, filler, sb);
else {
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
- gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
map_bh.b_state = 0;
map_bh.b_size = 0;
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
- gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping);
+ gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
map_bh.b_state = 0;
map_bh.b_size = 0;
int page_symlink(struct inode *inode, const char *symname, int len)
{
return __page_symlink(inode, symname, len,
- !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
+ !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
}
EXPORT_SYMBOL(page_symlink);
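
All of these conversions are mechanical: mapping_gfp_constraint() is just the open-coded AND it replaces, wrapped in a helper so the intent reads at the call site:

	/* from include/linux/pagemap.h, as introduced alongside these changes */
	static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
						   gfp_t gfp_mask)
	{
		return mapping_gfp_mask(mapping) & gfp_mask;
	}
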
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
/* Always try to initiate a 'commit' if relevant, but only
- * wait for it if __GFP_WAIT is set. Even then, only wait 1
- * second and only if the 'bdi' is not congested.
+ * wait for it if the caller allows blocking. Even then,
+ * only wait 1 second and only if the 'bdi' is not congested.
* Waiting indefinitely can cause deadlocks when the NFS
* server is on this machine, when a new TCP connection is
* needed and in other rare cases. There is no particular
if (mapping) {
struct nfs_server *nfss = NFS_SERVER(mapping->host);
nfs_commit_inode(mapping->host, 0);
- if ((gfp & __GFP_WAIT) &&
+ if (gfpflags_allow_blocking(gfp) &&
!bdi_write_congested(&nfss->backing_dev_info)) {
wait_on_page_bit_killable_timeout(page, PG_private,
HZ);
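
gfpflags_allow_blocking() replaces open-coded tests of __GFP_WAIT now that the "caller may sleep" information lives in __GFP_DIRECT_RECLAIM; it reduces to a single flag test:

	/* from include/linux/gfp.h, as introduced alongside these changes */
	static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
	{
		return (bool)(gfp_flags & __GFP_DIRECT_RECLAIM);
	}
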
/**
* nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
* @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
*/
static unsigned long
-nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
- const struct nilfs_palloc_group_desc *desc)
+nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc,
+ spinlock_t *lock)
{
unsigned long nfree;
- spin_lock(nilfs_mdt_bgl_lock(inode, group));
+ spin_lock(lock);
nfree = le32_to_cpu(desc->pg_nfrees);
- spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+ spin_unlock(lock);
return nfree;
}
/**
* nilfs_palloc_group_desc_add_entries - adjust count of free entries
- * @inode: inode of metadata file using this allocator
- * @group: group number
* @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
* @n: delta to be added
*/
-static void
-nilfs_palloc_group_desc_add_entries(struct inode *inode,
- unsigned long group,
- struct nilfs_palloc_group_desc *desc,
- u32 n)
+static u32
+nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
+ spinlock_t *lock, u32 n)
{
- spin_lock(nilfs_mdt_bgl_lock(inode, group));
+ u32 nfree;
+
+ spin_lock(lock);
le32_add_cpu(&desc->pg_nfrees, n);
- spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+ nfree = le32_to_cpu(desc->pg_nfrees);
+ spin_unlock(lock);
+ return nfree;
}
/**
return ret;
}
+/**
+ * nilfs_palloc_delete_block - delete a block on the persistent allocator file
+ * @inode: inode of metadata file using this allocator
+ * @blkoff: block offset
+ * @prev: nilfs_bh_assoc struct of the last used buffer
+ * @lock: spin lock protecting @prev
+ */
+static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff,
+ struct nilfs_bh_assoc *prev,
+ spinlock_t *lock)
+{
+ spin_lock(lock);
+ if (prev->bh && blkoff == prev->blkoff) {
+ brelse(prev->bh);
+ prev->bh = NULL;
+ }
+ spin_unlock(lock);
+ return nilfs_mdt_delete_block(inode, blkoff);
+}
+
/**
* nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
* @inode: inode of metadata file using this allocator
&cache->prev_bitmap, &cache->lock);
}
+/**
+ * nilfs_palloc_delete_bitmap_block - delete a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ */
+static int nilfs_palloc_delete_bitmap_block(struct inode *inode,
+ unsigned long group)
+{
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_delete_block(inode,
+ nilfs_palloc_bitmap_blkoff(inode,
+ group),
+ &cache->prev_bitmap, &cache->lock);
+}
+
/**
* nilfs_palloc_get_entry_block - get buffer head of an entry block
* @inode: inode of metadata file using this allocator
&cache->prev_entry, &cache->lock);
}
+/**
+ * nilfs_palloc_delete_entry_block - delete an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry
+ */
+static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr)
+{
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_delete_block(inode,
+ nilfs_palloc_entry_blkoff(inode, nr),
+ &cache->prev_entry, &cache->lock);
+}
+
/**
* nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
* @inode: inode of metadata file using this allocator
/**
* nilfs_palloc_find_available_slot - find available slot in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @target: offset number of an entry in the group (start point)
* @bitmap: bitmap of the group
+ * @target: offset number of an entry in the group (start point)
* @bsize: size in bits
+ * @lock: spin lock protecting @bitmap
*/
-static int nilfs_palloc_find_available_slot(struct inode *inode,
- unsigned long group,
+static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
- unsigned char *bitmap,
- int bsize)
-{
- int curr, pos, end, i;
-
- if (target > 0) {
- end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
- if (end > bsize)
- end = bsize;
- pos = nilfs_find_next_zero_bit(bitmap, end, target);
- if (pos < end &&
- !nilfs_set_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
- return pos;
- } else
- end = 0;
-
- for (i = 0, curr = end;
- i < bsize;
- i += BITS_PER_LONG, curr += BITS_PER_LONG) {
- /* wrap around */
- if (curr >= bsize)
- curr = 0;
- while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
- != ~0UL) {
- end = curr + BITS_PER_LONG;
- if (end > bsize)
- end = bsize;
- pos = nilfs_find_next_zero_bit(bitmap, end, curr);
- if ((pos < end) &&
- !nilfs_set_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group), pos,
- bitmap))
+ unsigned bsize,
+ spinlock_t *lock)
+{
+ int pos, end = bsize;
+
+ if (likely(target < bsize)) {
+ pos = target;
+ do {
+ pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+ if (pos >= end)
+ break;
+ if (!nilfs_set_bit_atomic(lock, pos, bitmap))
return pos;
- }
+ } while (++pos < end);
+
+ end = target;
+ }
+
+ /* wrap around */
+ for (pos = 0; pos < end; pos++) {
+ pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+ if (pos >= end)
+ break;
+ if (!nilfs_set_bit_atomic(lock, pos, bitmap))
+ return pos;
}
+
return -ENOSPC;
}
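
A condensed userspace model of the rewritten scan (forward from @target, then wrap around to the front; the atomicity of nilfs_set_bit_atomic() and the retry on a lost race are omitted for brevity):

	#include <limits.h>
	#include <stdio.h>

	/* toy stand-in for nilfs_find_next_zero_bit() */
	static int find_next_zero(const unsigned char *map, int end, int pos)
	{
		for (; pos < end; pos++)
			if (!(map[pos / CHAR_BIT] & (1 << (pos % CHAR_BIT))))
				return pos;
		return end;
	}

	static void set_bit_(unsigned char *map, int pos)
	{
		map[pos / CHAR_BIT] |= 1 << (pos % CHAR_BIT);
	}

	/* two-phase scan: [target, bsize), then wrap to [0, target) */
	static int find_available_slot(unsigned char *map, int target, int bsize)
	{
		int pos, end = bsize;

		if (target < bsize) {
			pos = find_next_zero(map, end, target);
			if (pos < end) {
				set_bit_(map, pos);
				return pos;
			}
			end = target;
		}
		pos = find_next_zero(map, end, 0);
		if (pos < end) {
			set_bit_(map, pos);
			return pos;
		}
		return -1;	/* -ENOSPC in the kernel version */
	}

	int main(void)
	{
		unsigned char map[2] = { 0xff, 0x0f };		/* bits 0..11 set */
		printf("%d\n", find_available_slot(map, 4, 16));/* prints 12 */
		return 0;
	}
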
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, maxgroup, ngroups;
unsigned long group_offset, maxgroup_offset;
- unsigned long n, entries_per_group, groups_per_desc_block;
+ unsigned long n, entries_per_group;
unsigned long i, j;
+ spinlock_t *lock;
int pos, ret;
ngroups = nilfs_palloc_groups_count(inode);
maxgroup = ngroups - 1;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
entries_per_group = nilfs_palloc_entries_per_group(inode);
- groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
for (i = 0; i < ngroups; i += n) {
if (group >= ngroups) {
n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
maxgroup);
for (j = 0; j < n; j++, desc++, group++) {
- if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
- > 0) {
+ lock = nilfs_mdt_bgl_lock(inode, group);
+ if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
ret = nilfs_palloc_get_bitmap_block(
inode, group, 1, &bitmap_bh);
if (ret < 0)
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
- inode, group, group_offset, bitmap,
- entries_per_group);
+ bitmap, group_offset,
+ entries_per_group, lock);
if (pos >= 0) {
/* found a free entry */
nilfs_palloc_group_desc_add_entries(
- inode, group, desc, -1);
+ desc, lock, -1);
req->pr_entry_nr =
entries_per_group * group + pos;
kunmap(desc_bh->b_page);
unsigned long group, group_offset;
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
+ spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
desc_kaddr = kmap(req->pr_desc_bh->b_page);
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
+ lock = nilfs_mdt_bgl_lock(inode, group);
- if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap))
- printk(KERN_WARNING "%s: entry number %llu already freed\n",
- __func__, (unsigned long long)req->pr_entry_nr);
+ if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
- nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+ nilfs_palloc_group_desc_add_entries(desc, lock, 1);
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
void *desc_kaddr, *bitmap_kaddr;
unsigned char *bitmap;
unsigned long group, group_offset;
+ spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
desc_kaddr = kmap(req->pr_desc_bh->b_page);
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
- if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap))
- printk(KERN_WARNING "%s: entry number %llu already freed\n",
- __func__, (unsigned long long)req->pr_entry_nr);
+ lock = nilfs_mdt_bgl_lock(inode, group);
+
+ if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
- nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+ nilfs_palloc_group_desc_add_entries(desc, lock, 1);
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
req->pr_desc_bh = NULL;
}
-/**
- * nilfs_palloc_group_is_in - judge if an entry is in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @nr: serial number of the entry (e.g. inode number)
- */
-static int
-nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
-{
- __u64 first, last;
-
- first = group * nilfs_palloc_entries_per_group(inode);
- last = first + nilfs_palloc_entries_per_group(inode) - 1;
- return (nr >= first) && (nr <= last);
-}
-
/**
* nilfs_palloc_freev - deallocate a set of persistent objects
* @inode: inode of metadata file using this allocator
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, group_offset;
- int i, j, n, ret;
+ __u64 group_min_nr, last_nrs[8];
+ const unsigned long epg = nilfs_palloc_entries_per_group(inode);
+ const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
+ unsigned entry_start, end, pos;
+ spinlock_t *lock;
+ int i, j, k, ret;
+ u32 nfree;
for (i = 0; i < nitems; i = j) {
+ int change_group = false;
+ int nempties = 0, n = 0;
+
group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
if (ret < 0)
brelse(desc_bh);
return ret;
}
- desc_kaddr = kmap(desc_bh->b_page);
- desc = nilfs_palloc_block_get_group_desc(
- inode, group, desc_bh, desc_kaddr);
+
+ /* Get the first entry number of the group */
+ group_min_nr = (__u64)group * epg;
+
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
- for (j = i, n = 0;
- (j < nitems) && nilfs_palloc_group_is_in(inode, group,
- entry_nrs[j]);
- j++) {
- nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
- if (!nilfs_clear_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap)) {
- printk(KERN_WARNING
- "%s: entry number %llu already freed\n",
- __func__,
- (unsigned long long)entry_nrs[j]);
+ lock = nilfs_mdt_bgl_lock(inode, group);
+
+ j = i;
+ entry_start = rounddown(group_offset, epb);
+ do {
+ if (!nilfs_clear_bit_atomic(lock, group_offset,
+ bitmap)) {
+				nilfs_warning(inode->i_sb, __func__,
+					      "entry number %llu already freed: ino=%lu\n",
+					      (unsigned long long)entry_nrs[j],
+					      (unsigned long)inode->i_ino);
} else {
n++;
}
- }
- nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
+
+ j++;
+ if (j >= nitems || entry_nrs[j] < group_min_nr ||
+ entry_nrs[j] >= group_min_nr + epg) {
+ change_group = true;
+ } else {
+ group_offset = entry_nrs[j] - group_min_nr;
+ if (group_offset >= entry_start &&
+ group_offset < entry_start + epb) {
+ /* This entry is in the same block */
+ continue;
+ }
+ }
+
+ /* Test if the entry block is empty or not */
+ end = entry_start + epb;
+ pos = nilfs_find_next_bit(bitmap, end, entry_start);
+ if (pos >= end) {
+ last_nrs[nempties++] = entry_nrs[j - 1];
+ if (nempties >= ARRAY_SIZE(last_nrs))
+ break;
+ }
+
+ if (change_group)
+ break;
+
+ /* Go on to the next entry block */
+ entry_start = rounddown(group_offset, epb);
+ } while (true);
kunmap(bitmap_bh->b_page);
- kunmap(desc_bh->b_page);
+ mark_buffer_dirty(bitmap_bh);
+ brelse(bitmap_bh);
+ for (k = 0; k < nempties; k++) {
+ ret = nilfs_palloc_delete_entry_block(inode,
+ last_nrs[k]);
+ if (ret && ret != -ENOENT) {
+ nilfs_warning(inode->i_sb, __func__,
+ "failed to delete block of entry %llu: ino=%lu, err=%d\n",
+ (unsigned long long)last_nrs[k],
+ (unsigned long)inode->i_ino, ret);
+ }
+ }
+
+ desc_kaddr = kmap_atomic(desc_bh->b_page);
+ desc = nilfs_palloc_block_get_group_desc(
+ inode, group, desc_bh, desc_kaddr);
+ nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
+ kunmap_atomic(desc_kaddr);
mark_buffer_dirty(desc_bh);
- mark_buffer_dirty(bitmap_bh);
nilfs_mdt_mark_dirty(inode);
-
- brelse(bitmap_bh);
brelse(desc_bh);
+
+ if (nfree == nilfs_palloc_entries_per_group(inode)) {
+ ret = nilfs_palloc_delete_bitmap_block(inode, group);
+ if (ret && ret != -ENOENT) {
+ nilfs_warning(inode->i_sb, __func__,
+ "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n",
+ group,
+ (unsigned long)inode->i_ino, ret);
+ }
+ }
}
return 0;
}
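
The new deletion logic in nilfs_palloc_freev() hinges on one test: after clearing the bits for a batch of entries, an entry block is deletable iff no bit remains set in its slice of the bitmap. Modeled standalone, with nilfs_find_next_bit reduced to a simple loop:

	/* model: the entry block starting at @entry_start is deletable iff
	 * no bit is set in bitmap[entry_start, entry_start + epb) */
	static int entry_block_is_empty(const unsigned char *bitmap,
					unsigned entry_start, unsigned epb)
	{
		unsigned pos, end = entry_start + epb;

		for (pos = entry_start; pos < end; pos++)
			if (bitmap[pos / 8] & (1 << (pos % 8)))
				return 0;	/* a live entry remains */
		return 1;
	}
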
#define nilfs_set_bit_atomic ext2_set_bit_atomic
#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
#define nilfs_find_next_zero_bit find_next_zero_bit_le
+#define nilfs_find_next_bit find_next_bit_le
/**
* struct nilfs_bh_assoc - block offset and buffer head association
int level, __u64 *keyp, __u64 *ptrp)
{
struct nilfs_btree_node *node, *right;
- __u64 newkey;
- __u64 newptr;
int nchildren, n, move, ncblk;
node = nilfs_btree_get_nonroot_node(path, level);
if (!buffer_dirty(path[level].bp_sib_bh))
mark_buffer_dirty(path[level].bp_sib_bh);
- newkey = nilfs_btree_node_get_key(right, 0);
- newptr = path[level].bp_newreq.bpr_ptr;
-
if (move) {
path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
nilfs_btree_node_insert(right, path[level].bp_index,
__u64 key, __u64 ptr,
const __u64 *keys, const __u64 *ptrs, int n)
{
- struct buffer_head *bh;
+ struct buffer_head *bh = NULL;
union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
struct nilfs_bmap_stats stats;
int ret;
int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
{
struct nilfs_dat_entry *entry;
- __u64 start;
sector_t blocknr;
void *kaddr;
int ret;
kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
- start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
kunmap_atomic(kaddr);
goto failed;
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
root = NILFS_I(dir)->i_root;
ii = NILFS_I(inode);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
nilfs_set_inode_flags(inode);
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
return 0;
failed_unmap:
#include "page.h"
#include "mdt.h"
+#include <trace/events/nilfs2.h>
#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(inode);
+
+ trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block);
+
return 0;
}
get_bh(bh);
submit_bh(mode, bh);
ret = 0;
+
+ trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
out:
get_bh(bh);
*out_bh = bh;
}
/* Default GFP flags using highmem */
-#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
+#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM)
int nilfs_mdt_get_block(struct inode *, unsigned long, int,
void (*init_block)(struct inode *,
struct nilfs_recovery_info *ri)
{
struct buffer_head *bh_sum = NULL;
- struct nilfs_segment_summary *sum;
+ struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start;
sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
unsigned long nsalvaged_blocks = 0;
struct nilfs_recovery_info *ri)
{
struct buffer_head *bh_sum = NULL;
- struct nilfs_segment_summary *sum;
+ struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start, pseg_end, sr_pseg_start = 0;
sector_t seg_start, seg_end; /* range of full segment (block number) */
sector_t b, end;
NILFS_ST_DONE,
};
+#define CREATE_TRACE_POINTS
+#include <trace/events/nilfs2.h>
+
+/*
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set() and nilfs_sc_cstage_get() are
+ * wrappers around the stage count (nilfs_sc_info->sc_stage.scnt). Users of
+ * the variable must go through them, because every transition of the stage
+ * count must emit a trace event (trace_nilfs2_collection_stage_transition).
+ *
+ * nilfs_sc_cstage_get() is not strictly required for that purpose, since it
+ * produces no tracepoint event; it is provided just to make the intention
+ * clear.
+ */
+static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci)
+{
+ sci->sc_stage.scnt++;
+ trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt)
+{
+ sci->sc_stage.scnt = next_scnt;
+ trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci)
+{
+ return sci->sc_stage.scnt;
+}
+
/* State flags of collection */
#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
{
struct the_nilfs *nilfs;
int ret = nilfs_prepare_segment_lock(ti);
+ struct nilfs_transaction_info *trace_ti;
if (unlikely(ret < 0))
return ret;
- if (ret > 0)
+ if (ret > 0) {
+ trace_ti = current->journal_info;
+
+ trace_nilfs2_transaction_transition(sb, trace_ti,
+ trace_ti->ti_count, trace_ti->ti_flags,
+ TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
+ }
sb_start_intwrite(sb);
ret = -ENOSPC;
goto failed;
}
+
+ trace_ti = current->journal_info;
+ trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count,
+ trace_ti->ti_flags,
+ TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
failed:
ti->ti_flags |= NILFS_TI_COMMIT;
if (ti->ti_count > 0) {
ti->ti_count--;
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
return 0;
}
if (nilfs->ns_writer) {
nilfs_segctor_do_flush(sci, 0);
}
up_read(&nilfs->ns_segctor_sem);
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
+
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_SYNC)
BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
if (ti->ti_count > 0) {
ti->ti_count--;
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
return;
}
up_read(&nilfs->ns_segctor_sem);
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
+
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
kmem_cache_free(nilfs_transaction_cachep, ti);
current->journal_info = ti;
for (;;) {
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK);
+
down_write(&nilfs->ns_segctor_sem);
if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
break;
}
if (gcflag)
ti->ti_flags |= NILFS_TI_GC;
+
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK);
}
static void nilfs_transaction_unlock(struct super_block *sb)
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
+
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK);
}
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
size_t ndone;
int err = 0;
- switch (sci->sc_stage.scnt) {
+ switch (nilfs_sc_cstage_get(sci)) {
case NILFS_ST_INIT:
/* Pre-processes */
sci->sc_stage.flags = 0;
sci->sc_nblk_inc = 0;
sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
if (mode == SC_LSEG_DSYNC) {
- sci->sc_stage.scnt = NILFS_ST_DSYNC;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC);
goto dsync_mode;
}
}
sci->sc_stage.dirty_file_ptr = NULL;
sci->sc_stage.gc_inode_ptr = NULL;
if (mode == SC_FLUSH_DAT) {
- sci->sc_stage.scnt = NILFS_ST_DAT;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DAT);
goto dat_stage;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_GC:
if (nilfs_doing_gc()) {
head = &sci->sc_gc_inodes;
}
sci->sc_stage.gc_inode_ptr = NULL;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_FILE:
head = &sci->sc_dirty_files;
ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
}
sci->sc_stage.dirty_file_ptr = NULL;
if (mode == SC_FLUSH_FILE) {
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- sci->sc_stage.scnt++;
+ nilfs_sc_cstage_inc(sci);
sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
/* Fall through */
case NILFS_ST_IFILE:
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++;
+ nilfs_sc_cstage_inc(sci);
/* Creating a checkpoint */
err = nilfs_segctor_create_checkpoint(sci);
if (unlikely(err))
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_SUFILE:
err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
sci->sc_nfreesegs, &ndone);
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_DAT:
dat_stage:
err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
if (unlikely(err))
break;
if (mode == SC_FLUSH_DAT) {
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_SR:
if (mode == SC_LSEG_SR) {
/* Appending a super root */
}
/* End of a logical segment */
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
case NILFS_ST_DSYNC:
dsync_mode:
if (unlikely(err))
break;
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
case NILFS_ST_DONE:
return 0;
goto failed;
/* The current segment is filled up */
- if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
+ if (mode != SC_LSEG_SR ||
+ nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE)
break;
nilfs_clear_logs(&sci->sc_segbufs);
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
int err;
- sci->sc_stage.scnt = NILFS_ST_INIT;
+ nilfs_sc_cstage_set(sci, NILFS_ST_INIT);
sci->sc_cno = nilfs->ns_cno;
err = nilfs_segctor_collect_dirty_files(sci, nilfs);
goto failed;
/* Avoid empty segment */
- if (sci->sc_stage.scnt == NILFS_ST_DONE &&
+ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE &&
nilfs_segbuf_empty(sci->sc_curseg)) {
nilfs_segctor_abort_construction(sci, nilfs, 1);
goto out;
nilfs_segctor_fill_in_file_bmap(sci);
if (mode == SC_LSEG_SR &&
- sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
+ nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
err = nilfs_segctor_fill_in_checkpoint(sci);
if (unlikely(err))
goto failed_to_write;
if (unlikely(err))
goto failed_to_write;
- if (sci->sc_stage.scnt == NILFS_ST_DONE ||
+ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE ||
nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
/*
* At this point, we avoid double buffering
if (err)
goto failed_to_write;
}
- } while (sci->sc_stage.scnt != NILFS_ST_DONE);
+ } while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE);
out:
nilfs_segctor_drop_written_files(sci, nilfs);
static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
{
int mode = 0;
- int err;
spin_lock(&sci->sc_state_lock);
mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
spin_unlock(&sci->sc_state_lock);
if (mode) {
- err = nilfs_segctor_do_construct(sci, mode);
+ nilfs_segctor_do_construct(sci, mode);
spin_lock(&sci->sc_state_lock);
sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
/**
* struct nilfs_cstage - Context of collection stage
- * @scnt: Stage count
+ * @scnt: Stage count, must be accessed via wrappers:
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get()
* @flags: State flags
* @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
* @gc_inode_ptr: Pointer on the list of gc-inodes
#include "mdt.h"
#include "sufile.h"
+#include <trace/events/nilfs2.h>
+
/**
* struct nilfs_sufile_info - on-memory private data of sufile
* @mi: on-memory private data of metadata file
size_t susz = NILFS_MDT(sufile)->mi_entry_size;
__u64 segnum, maxsegnum, last_alloc;
void *kaddr;
- unsigned long nsegments, ncleansegs, nsus, cnt;
+ unsigned long nsegments, nsus, cnt;
int ret, j;
down_write(&NILFS_MDT(sufile)->mi_sem);
goto out_sem;
kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
- ncleansegs = le64_to_cpu(header->sh_ncleansegs);
last_alloc = le64_to_cpu(header->sh_last_alloc);
kunmap_atomic(kaddr);
break; /* never happens */
}
}
+ trace_nilfs2_segment_usage_check(sufile, segnum, cnt);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
&su_bh);
if (ret < 0)
nilfs_mdt_mark_dirty(sufile);
brelse(su_bh);
*segnump = segnum;
+
+ trace_nilfs2_segment_usage_allocated(sufile, segnum);
+
goto out_header;
}
NILFS_SUI(sufile)->ncleansegs++;
nilfs_mdt_mark_dirty(sufile);
+
+ trace_nilfs2_segment_usage_freed(sufile, segnum);
}
/**
struct nilfs_super_block *nsbp;
sector_t blocknr, newblocknr;
unsigned long offset;
- int sb2i = -1; /* array index of the secondary superblock */
+ int sb2i; /* array index of the secondary superblock */
int ret = 0;
/* nilfs->ns_sem must be locked by the caller. */
} else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
sb2i = 0;
blocknr = nilfs->ns_sbh[0]->b_blocknr;
+ } else {
+ sb2i = -1;
+ blocknr = 0;
}
if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
goto out; /* super block location is unchanged */
*/
rcu_barrier();
- if (nilfs_inode_cachep)
- kmem_cache_destroy(nilfs_inode_cachep);
- if (nilfs_transaction_cachep)
- kmem_cache_destroy(nilfs_transaction_cachep);
- if (nilfs_segbuf_cachep)
- kmem_cache_destroy(nilfs_segbuf_cachep);
- if (nilfs_btree_path_cache)
- kmem_cache_destroy(nilfs_btree_path_cache);
+ kmem_cache_destroy(nilfs_inode_cachep);
+ kmem_cache_destroy(nilfs_transaction_cachep);
+ kmem_cache_destroy(nilfs_segbuf_cachep);
+ kmem_cache_destroy(nilfs_btree_path_cache);
}
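
Dropping the NULL checks is safe because kmem_cache_destroy() now bails out on a NULL cache itself, mirroring kfree(); its prologue is effectively:

	void kmem_cache_destroy(struct kmem_cache *s)
	{
		if (unlikely(!s))
			return;
		/* ... actual teardown ... */
	}
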
static int __init nilfs_init_cachep(void)
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
inode = igrab(mark->inode);
if (inode) {
+ /*
+ * IN_ALL_EVENTS represents all of the mask bits
+ * that we expose to userspace. There is at
+ * least one bit (FS_EVENT_ON_CHILD) which is
+ * used only internally to the kernel.
+ */
+ u32 mask = mark->mask & IN_ALL_EVENTS;
seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
- mark->mask, mark->ignored_mask);
+ mask, mark->ignored_mask);
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
int ret;
unsigned flags = 0;
- /* don't allow invalid bits: we don't want flags set */
+ /*
+ * We share a lot of code with fs/dnotify. We also share
+ * the bit layout between inotify's IN_* and the fsnotify
+ * FS_*. This check ensures that only the inotify IN_*
+ * bits get passed in and set in watches/events.
+ */
+ if (unlikely(mask & ~ALL_INOTIFY_BITS))
+ return -EINVAL;
+ /*
+ * Require at least one valid bit set in the mask.
+ * Without _something_ set, we would have no events to
+ * watch for.
+ */
if (unlikely(!(mask & ALL_INOTIFY_BITS)))
return -EINVAL;
}
}
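
From userspace, the two checks split the EINVAL cases: a mask containing bits the kernel does not know about now fails explicitly, as does a mask with no event bits at all. A hypothetical illustration (0x00010000 is assumed to lie outside ALL_INOTIFY_BITS at the time of this change):

	#include <stdio.h>
	#include <sys/inotify.h>

	int main(void)
	{
		int fd = inotify_init();

		if (fd < 0)
			return 1;

		/* a bit the kernel does not recognize: fails the first check */
		if (inotify_add_watch(fd, "/tmp", 0x00010000) < 0)
			perror("unknown bit");		/* expect EINVAL */

		/* no valid bits at all: fails the second check */
		if (inotify_add_watch(fd, "/tmp", 0) < 0)
			perror("empty mask");		/* expect EINVAL */
		return 0;
	}
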
err = add_to_page_cache_lru(*cached_page, mapping,
- index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(err)) {
if (err == -EEXIST)
continue;
struct ocfs2_extent_block *eb;
u32 range;
- /*
- * In normal tree rotation process, we will never touch the
- * tree branch above subtree_index and ocfs2_extend_rotate_transaction
- * doesn't reserve the credits for them either.
- *
- * But we do have a special case here which will update the rightmost
- * records for all the bh in the path.
- * So we have to allocate extra credits and access them.
- */
- ret = ocfs2_extend_trans(handle, subtree_index);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
- ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
orig_credits, left_path);
if (ret) {
mlog_errno(ret);
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
-
ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
- /*
- * There's two ways we handle this depending on
- * whether path is the only existing one.
- */
- ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
*/
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The merge code will need to create an empty
* extent to take the place of the newly
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* The merge left us with an empty extent, remove it. */
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
goto out;
}
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
}
if (ctxt->c_split_covers_rec) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ ret = 0;
+ goto out;
+ }
+
/*
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
ocfs2_journal_dirty(handle, tl_bh);
- /* TODO: Perhaps we can calculate the bulk of the
- * credits up front rather than extending like
- * this. */
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
rec = tl->tl_recs[i];
start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
le32_to_cpu(rec.t_start));
goto bail;
}
}
+
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
i--;
}
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
if (cancel)
cancel_delayed_work(&osb->osb_truncate_log_wq);
- queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
}
}
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
return status;
}
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- * "So what we do is to permit the ->get_blocks function to populate
- * bh.b_size with the size of IO which is permitted at this offset and
- * this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
- u32 cpos = 0;
- int alloc_locked = 0;
- u64 p_blkno, inode_blocks, contig_blocks;
- unsigned int ext_flags;
- unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
- unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
- unsigned long len = bh_result->b_size;
- unsigned int clusters_to_alloc = 0, contig_clusters = 0;
-
- cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
-
- /* This function won't even be called if the request isn't all
- * nicely aligned and of the right size, so there's no need
- * for us to check any of that. */
-
- inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* This figures out the size of the next contiguous block, and
- * our logical offset */
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- if (ret) {
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
-
- /* We should already CoW the refcounted extent in case of create. */
- BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
-
- /* allocate blocks if no p_blkno is found, and create == 1 */
- if (!p_blkno && create) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail;
- }
-
- alloc_locked = 1;
-
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* fill hole, allocate blocks can't be larger than the size
- * of the hole */
- clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
- contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
- contig_blocks);
- if (clusters_to_alloc > contig_clusters)
- clusters_to_alloc = contig_clusters;
-
- /* allocate extent and insert them into the extent tree */
- ret = ocfs2_extend_allocation(inode, cpos,
- clusters_to_alloc, 0);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog_errno(ret);
- goto bail;
- }
-
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
-
- /*
- * get_more_blocks() expects us to describe a hole by clearing
- * the mapped bit on bh_result().
- *
- * Consider an unwritten extent as a hole.
- */
- if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- map_bh(bh_result, inode->i_sb, p_blkno);
- else
- clear_buffer_mapped(bh_result);
-
- /* make sure we don't map more than max_blocks blocks here as
- that's all the kernel will handle at this point. */
- if (max_blocks < contig_blocks)
- contig_blocks = max_blocks;
- bh_result->b_size = contig_blocks << blocksize_bits;
-bail:
- if (alloc_locked)
- ocfs2_inode_unlock(inode, 1);
- return ret;
-}
-
-/*
- * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. We use the rw_lock DLM lock
- * to protect io on one node from truncation on another.
- */
-static void ocfs2_dio_end_io(struct kiocb *iocb,
- loff_t offset,
- ssize_t bytes,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- int level;
-
- /* this io's submitter should not have unlocked this before we could */
- BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
-
- if (ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
-
- mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
- /* Let rw unlock to be done later to protect append direct io write */
- if (offset + bytes <= i_size_read(inode)) {
- ocfs2_iocb_clear_rw_locked(iocb);
-
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
- }
-}
-
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
if (!page_has_buffers(page))
return try_to_free_buffers(page);
}
-static int ocfs2_is_overwrite(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- int ret = 0;
- u32 v_cpos = 0;
- u32 p_cpos = 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- return 1;
-
- return 0;
-}
-
-static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset,
- u64 zero_len, int cluster_align)
-{
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- int ret = 0;
-
- if (offset <= i_size_read(inode) || cluster_align)
- return 0;
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- u64 s = i_size_read(inode);
- sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
- (do_div(s, osb->s_clustersize) >> 9);
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
- zero_len >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
- }
-
- return ret;
-}
-
-static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- u64 zero_start, zero_len, total_zero_len;
- u32 p_cpos = 0, clusters_to_add;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- u32 size_div, offset_div;
- int ret = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- offset_div = do_div(o, osb->s_clustersize);
- size_div = do_div(s, osb->s_clustersize);
- }
-
- if (offset <= i_size_read(inode))
- return 0;
-
- clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
- ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
- total_zero_len = offset - i_size_read(inode);
- if (clusters_to_add)
- total_zero_len -= offset_div;
-
- /* Allocate clusters to fill out holes, and this is only needed
- * when we add more than one clusters. Otherwise the cluster will
- * be allocated during direct IO */
- if (clusters_to_add > 1) {
- ret = ocfs2_extend_allocation(inode,
- OCFS2_I(inode)->ip_clusters,
- clusters_to_add - 1, 0);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- while (total_zero_len) {
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
- size_div;
- zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
- size_div;
- zero_len = min(total_zero_len, zero_len);
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- zero_start >> 9, zero_len >> 9,
- GFP_NOFS, false);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- total_zero_len -= zero_len;
- v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
-
- /* Only at first iteration can be cluster not aligned.
- * So set size_div to 0 for the rest */
- size_div = 0;
- }
-
-out:
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
- struct iov_iter *iter,
- loff_t offset)
-{
- ssize_t ret = 0;
- ssize_t written = 0;
- bool orphaned = false;
- int is_overwrite = 0;
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
- size_t count = iter->count;
- journal_t *journal = osb->journal->j_journal;
- u64 zero_len_head, zero_len_tail;
- int cluster_align_head, cluster_align_tail;
- loff_t final_size = offset + count;
- int append_write = offset >= i_size_read(inode) ? 1 : 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
- cluster_align_head = !zero_len_head;
-
- zero_len_tail = osb->s_clustersize -
- do_div(s, osb->s_clustersize);
- if ((offset - i_size_read(inode)) < zero_len_tail)
- zero_len_tail = offset - i_size_read(inode);
- cluster_align_tail = !zero_len_tail;
- }
-
- /*
- * when final_size > inode->i_size, inode->i_size will be
- * updated after direct write, so add the inode to orphan
- * dir first.
- */
- if (final_size > i_size_read(inode)) {
- ret = ocfs2_add_inode_to_orphan(osb, inode);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- orphaned = true;
- }
-
- if (append_write) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- /* zeroing out the previously allocated cluster tail
- * that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
- zero_len_tail, cluster_align_tail);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- } else {
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
- offset);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
- if (is_overwrite < 0) {
- mlog_errno(is_overwrite);
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- ocfs2_inode_unlock(inode, 1);
- }
-
- written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- offset, ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
- if ((written < 0) && (written != -EIOCBQUEUED)) {
- loff_t i_size = i_size_read(inode);
-
- if (offset + count > i_size) {
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- if (i_size == i_size_read(inode)) {
- ret = ocfs2_truncate_file(inode, di_bh,
- i_size);
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
- goto clean_orphan;
- }
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
-
- ret = jbd2_journal_force_commit(journal);
- if (ret < 0)
- mlog_errno(ret);
- }
- } else if (written > 0 && append_write && !is_overwrite &&
- !cluster_align_head) {
- /* zeroing out the allocated cluster head */
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-
- ret = ocfs2_inode_lock(inode, NULL, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 0);
- goto clean_orphan;
- }
-
- BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- (u64)p_cpos << (osb->s_clustersize_bits - 9),
- zero_len_head >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 0);
- }
-
-clean_orphan:
- if (orphaned) {
- int tmp_ret;
- int update_isize = written > 0 ? 1 : 0;
- loff_t end = update_isize ? offset + written : 0;
-
- tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- goto out;
- }
-
- tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
- update_isize, end);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- brelse(di_bh);
- goto out;
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
- tmp_ret = jbd2_journal_force_commit(journal);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(tmp_ret);
- }
- }
-
-out:
- if (ret >= 0)
- ret = written;
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
-
- /*
- * Fallback to buffered I/O if we see an inode without
- * extents.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return 0;
-
- /* Fallback to buffered I/O if we are appending and
- * concurrent O_DIRECT writes are allowed.
- */
- if (i_size_read(inode) <= offset && !full_coherency)
- return 0;
-
- if (iov_iter_rw(iter) == READ)
- return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, offset,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- else
- return ocfs2_direct_IO_write(iocb, iter, offset);
-}
-
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
u32 cpos,
unsigned int *start,
#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+struct ocfs2_unwritten_extent {
+ struct list_head ue_node;
+ struct list_head ue_ip_node;
+ u32 ue_cpos;
+ u32 ue_phys;
+};
+
/*
* Describe the state of a single cluster to be written to.
*/
* filled.
*/
unsigned c_new;
- unsigned c_unwritten;
+ unsigned c_clear_unwritten;
unsigned c_needs_zero;
};
/* First cluster allocated in a nonsparse extend */
u32 w_first_new_cpos;
+ /* Type of caller. Must be one of buffer, mmap, direct. */
+ ocfs2_write_type_t w_type;
+
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
/*
struct buffer_head *w_di_bh;
struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+ struct list_head w_unwritten_list;
};
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_free_unwritten_list(struct inode *inode,
+ struct list_head *head)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
+
+ list_for_each_entry_safe(ue, tmp, head, ue_node) {
+ list_del(&ue->ue_node);
+ spin_lock(&oi->ip_lock);
+ list_del(&ue->ue_ip_node);
+ spin_unlock(&oi->ip_lock);
+ kfree(ue);
+ }
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+ struct ocfs2_write_ctxt *wc)
{
+ ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh);
kfree(wc);
static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
struct ocfs2_super *osb, loff_t pos,
- unsigned len, struct buffer_head *di_bh)
+ unsigned len, ocfs2_write_type_t type,
+ struct buffer_head *di_bh)
{
u32 cend;
struct ocfs2_write_ctxt *wc;
wc->w_clen = cend - wc->w_cpos + 1;
get_bh(di_bh);
wc->w_di_bh = di_bh;
+ wc->w_type = type;
if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
wc->w_large_pages = 1;
wc->w_large_pages = 0;
ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+ INIT_LIST_HEAD(&wc->w_unwritten_list);
*wcp = wc;
to = user_pos + user_len;
struct page *tmppage;
- ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+ if (wc->w_target_page)
+ ocfs2_zero_new_buffers(wc->w_target_page, from, to);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
- if (page_has_buffers(tmppage)) {
+ if (tmppage && page_has_buffers(tmppage)) {
if (ocfs2_should_order_data(inode))
ocfs2_jbd2_file_inode(wc->w_handle, inode);
wc->w_num_pages = 1;
start = target_index;
}
+ end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
for(i = 0; i < wc->w_num_pages; i++) {
index = start + i;
- if (index == target_index && mmap_page) {
+ if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_MMAP) {
/*
* ocfs2_pagemkwrite() is a little different
* and wants us to directly use the page
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
wc->w_target_locked = true;
+ } else if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_DIRECT) {
+ /* Direct write has no mapping page. */
+ wc->w_pages[i] = NULL;
+ continue;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
 * Prepare a single cluster to be written into the file.
*/
static int ocfs2_write_cluster(struct address_space *mapping,
- u32 phys, unsigned int unwritten,
+ u32 *phys, unsigned int new,
+ unsigned int clear_unwritten,
unsigned int should_zero,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_write_ctxt *wc, u32 cpos,
loff_t user_pos, unsigned user_len)
{
- int ret, i, new;
- u64 v_blkno, p_blkno;
+ int ret, i;
+ u64 p_blkno;
struct inode *inode = mapping->host;
struct ocfs2_extent_tree et;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- new = phys == 0 ? 1 : 0;
if (new) {
u32 tmp_pos;
*/
tmp_pos = cpos;
ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
- &tmp_pos, 1, 0, wc->w_di_bh,
- wc->w_handle, data_ac,
- meta_ac, NULL);
+ &tmp_pos, 1, !clear_unwritten,
+ wc->w_di_bh, wc->w_handle,
+ data_ac, meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
mlog_errno(ret);
goto out;
}
- } else if (unwritten) {
+ } else if (clear_unwritten) {
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
wc->w_di_bh);
ret = ocfs2_mark_extent_written(inode, &et,
- wc->w_handle, cpos, 1, phys,
+ wc->w_handle, cpos, 1, *phys,
meta_ac, &wc->w_dealloc);
if (ret < 0) {
mlog_errno(ret);
}
}
- if (should_zero)
- v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
- else
- v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
-
/*
* The only reason this should fail is due to an inability to
* find the extent added.
*/
- ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
- NULL);
+ ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
if (ret < 0) {
mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
- "at logical block %llu",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_blkno);
+ "at logical cluster %u",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
goto out;
}
- BUG_ON(p_blkno == 0);
+ BUG_ON(*phys == 0);
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
+ if (!should_zero)
+ p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
for(i = 0; i < wc->w_num_pages; i++) {
int tmpret;
+ /* This is the direct io target page. */
+ if (wc->w_pages[i] == NULL) {
+ p_blkno++;
+ continue;
+ }
+
tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
wc->w_pages[i], cpos,
user_pos, user_len,
if ((cluster_off + local_len) > osb->s_clustersize)
local_len = osb->s_clustersize - cluster_off;
- ret = ocfs2_write_cluster(mapping, desc->c_phys,
- desc->c_unwritten,
+ ret = ocfs2_write_cluster(mapping, &desc->c_phys,
+ desc->c_new,
+ desc->c_clear_unwritten,
desc->c_needs_zero,
data_ac, meta_ac,
wc, desc->c_cpos, pos, local_len);
}
}
+/*
+ * Check whether this extent is marked UNWRITTEN by a direct io in flight.
+ * If so, we need neither do the zeroing work nor clear the UNWRITTEN flag
+ * here; both will be taken care of when that direct io completes.
+ * If instead this is a new extent allocated by direct io, record it in
+ * the ip_unwritten_list.
+ */
+static int ocfs2_unwritten_check(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ struct ocfs2_write_cluster_desc *desc)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
+ int ret = 0;
+
+ if (!desc->c_needs_zero)
+ return 0;
+
+retry:
+ spin_lock(&oi->ip_lock);
+ /* No need to zero, no matter buffered or direct: the task that owns
+ * this entry is already zeroing the cluster, and will clear UNWRITTEN
+ * after all cluster io has finished. */
+ list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
+ if (desc->c_cpos == ue->ue_cpos) {
+ BUG_ON(desc->c_new);
+ desc->c_needs_zero = 0;
+ desc->c_clear_unwritten = 0;
+ goto unlock;
+ }
+ }
+
+ if (wc->w_type != OCFS2_WRITE_DIRECT)
+ goto unlock;
+
+ if (new == NULL) {
+ spin_unlock(&oi->ip_lock);
+ new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
+ GFP_NOFS);
+ if (new == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto retry;
+ }
+ /* This direct write will do the zeroing. */
+ new->ue_cpos = desc->c_cpos;
+ new->ue_phys = desc->c_phys;
+ desc->c_clear_unwritten = 0;
+ list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+ list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+ new = NULL;
+unlock:
+ spin_unlock(&oi->ip_lock);
+out:
+ kfree(new);
+ return ret;
+}
+
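
The function above relies on a drop-lock, allocate, retry idiom: kmalloc(GFP_NOFS) may sleep, so it cannot be called while ip_lock is held. A minimal standalone sketch of the same idiom, with purely illustrative names:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct tracked_item {
	struct list_head node;
	u32 key;
};

/* Add key to the list unless it is already present. The spinlock only
 * protects the list; the allocation happens with the lock dropped. */
static int track_key(spinlock_t *lock, struct list_head *head, u32 key)
{
	struct tracked_item *it, *new = NULL;

retry:
	spin_lock(lock);
	list_for_each_entry(it, head, node)
		if (it->key == key)
			goto unlock;	/* already tracked */

	if (new == NULL) {
		/* Cannot sleep under the spinlock: drop it, allocate,
		 * and re-scan, since the list may have changed. */
		spin_unlock(lock);
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (new == NULL)
			return -ENOMEM;
		goto retry;
	}

	new->key = key;
	list_add_tail(&new->node, head);
	new = NULL;
unlock:
	spin_unlock(lock);
	kfree(new);	/* no-op unless someone beat us to the insert */
	return 0;
}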
/*
* Populate each single-cluster write descriptor in the write context
* with information about the i/o to be done.
if (phys == 0) {
desc->c_new = 1;
desc->c_needs_zero = 1;
+ desc->c_clear_unwritten = 1;
*clusters_to_alloc = *clusters_to_alloc + 1;
}
if (ext_flags & OCFS2_EXT_UNWRITTEN) {
- desc->c_unwritten = 1;
+ desc->c_clear_unwritten = 1;
desc->c_needs_zero = 1;
}
+ ret = ocfs2_unwritten_check(inode, wc, desc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
num_clusters--;
}
if (ret)
mlog_errno(ret);
- wc->w_first_new_cpos =
- ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+ /* There is no wc if this is called from direct io. */
+ if (wc)
+ wc->w_first_new_cpos =
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
return ret;
}
return ret;
}
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
{
int try_free = 1, ret1;
try_again:
- ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+ ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
if (ret) {
mlog_errno(ret);
return ret;
}
}
- if (ocfs2_sparse_alloc(osb))
- ret = ocfs2_zero_tail(inode, di_bh, pos);
- else
- ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
- wc);
- if (ret) {
- mlog_errno(ret);
- goto out;
+ /* Direct io changes i_size late, so do not zero the tail here. */
+ if (type != OCFS2_WRITE_DIRECT) {
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ len, wc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
}
ret = ocfs2_check_range_for_refcount(inode, pos, len);
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(long long)i_size_read(inode),
le32_to_cpu(di->i_clusters),
- pos, len, flags, mmap_page,
+ pos, len, type, mmap_page,
clusters_to_alloc, extents_to_split);
/*
credits = ocfs2_calc_extend_credits(inode->i_sb,
&di->id2.i_list);
-
- }
+ } else if (type == OCFS2_WRITE_DIRECT)
+ /* A direct write need not start a transaction if no extents are allocated. */
+ goto success;
/*
* We have to zero sparse allocated clusters, unwritten extent clusters,
* and non-sparse clusters we just extended. For non-sparse writes,
* we know zeros will only be needed in the first and/or last cluster.
*/
- if (clusters_to_alloc || extents_to_split ||
- (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
- wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+ if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+ wc->w_desc[wc->w_clen - 1].c_needs_zero))
cluster_of_pages = 1;
else
cluster_of_pages = 0;
ocfs2_free_alloc_context(meta_ac);
success:
- *pagep = wc->w_target_page;
+ if (pagep)
+ *pagep = wc->w_target_page;
*fsdata = wc;
return 0;
out_quota:
ocfs2_commit_trans(osb, handle);
out:
- ocfs2_free_write_ctxt(wc);
+ ocfs2_free_write_ctxt(inode, wc);
if (data_ac) {
ocfs2_free_alloc_context(data_ac);
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
- fsdata, di_bh, NULL);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
+ pagep, fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
goto out_fail;
handle_t *handle = wc->w_handle;
struct page *tmppage;
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- copied = ret;
- mlog_errno(ret);
- goto out;
+ BUG_ON(!list_empty(&wc->w_unwritten_list));
+
+ if (handle) {
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
}
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
goto out_write_size;
}
- if (unlikely(copied < len)) {
+ if (unlikely(copied < len) && wc->w_target_page) {
if (!PageUptodate(wc->w_target_page))
copied = 0;
ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
start+len);
}
- flush_dcache_page(wc->w_target_page);
+ if (wc->w_target_page)
+ flush_dcache_page(wc->w_target_page);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
+ /* This is the direct io target page. */
+ if (tmppage == NULL)
+ continue;
+
if (tmppage == wc->w_target_page) {
from = wc->w_target_from;
to = wc->w_target_to;
}
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(wc->w_handle, inode);
+ if (handle && ocfs2_should_order_data(inode))
+ ocfs2_jbd2_file_inode(handle, inode);
block_commit_write(tmppage, from, to);
}
}
out_write_size:
- pos += copied;
- if (pos > i_size_read(inode)) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- inode->i_blocks = ocfs2_inode_sector_count(inode);
- di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
- ocfs2_journal_dirty(handle, wc->w_di_bh);
+ /* Direct io does not update i_size here. */
+ if (wc->w_type != OCFS2_WRITE_DIRECT) {
+ pos += copied;
+ if (pos > i_size_read(inode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ }
+ if (handle)
+ ocfs2_journal_dirty(handle, wc->w_di_bh);
out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
*/
ocfs2_unlock_pages(wc);
- ocfs2_commit_trans(osb, handle);
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
return ret;
}
+struct ocfs2_dio_write_ctxt {
+ struct list_head dw_zero_list;
+ unsigned dw_zero_count;
+ int dw_orphaned;
+ pid_t dw_writer_pid;
+};
+
+static struct ocfs2_dio_write_ctxt *
+ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
+{
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+
+ if (bh->b_private)
+ return bh->b_private;
+
+ dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
+ if (dwc == NULL)
+ return NULL;
+ INIT_LIST_HEAD(&dwc->dw_zero_list);
+ dwc->dw_zero_count = 0;
+ dwc->dw_orphaned = 0;
+ dwc->dw_writer_pid = task_pid_nr(current);
+ bh->b_private = dwc;
+ *alloc = 1;
+
+ return dwc;
+}
+
+static void ocfs2_dio_free_write_ctx(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc)
+{
+ ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
+ kfree(dwc);
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ * "So what we do is to permit the ->get_blocks function to populate
+ * bh.b_size with the size of IO which is permitted at this offset and
+ * this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_write_ctxt *wc;
+ struct ocfs2_write_cluster_desc *desc = NULL;
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+ struct buffer_head *di_bh = NULL;
+ u64 p_blkno;
+ loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned len, total_len = bh_result->b_size;
+ int ret = 0, first_get_block = 0;
+
+ len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
+ len = min(total_len, len);
+
+ mlog(0, "get block of %lu at %llu:%u req %u\n",
+ inode->i_ino, pos, len, total_len);
+
+ /* This is the fast path for re-write. */
+ ret = ocfs2_get_block(inode, iblock, bh_result, create);
+
+ if (buffer_mapped(bh_result) &&
+ !buffer_new(bh_result) &&
+ ret == 0)
+ goto out;
+
+ /* Clear state set by ocfs2_get_block. */
+ bh_result->b_state = 0;
+
+ dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
+ if (unlikely(dwc == NULL)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
+ !dwc->dw_orphaned) {
+ /*
+ * When we are going to allocate extents beyond the file size, add
+ * the inode to the orphan dir first, so that the space can be
+ * reclaimed if the system crashes during the write.
+ */
+ ret = ocfs2_add_inode_to_orphan(osb, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dwc->dw_orphaned = 1;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (first_get_block) {
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ total_len, NULL);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
+ OCFS2_WRITE_DIRECT, NULL,
+ (void **)&wc, di_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ desc = &wc->w_desc[0];
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
+ BUG_ON(p_blkno == 0);
+ p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
+
+ map_bh(bh_result, inode->i_sb, p_blkno);
+ bh_result->b_size = len;
+ if (desc->c_needs_zero)
+ set_buffer_new(bh_result);
+
+ /* The end_io handler may sleep, which must not happen in irq
+ * context, so defer completion to the dio work queue. */
+ set_buffer_defer_completion(bh_result);
+
+ if (!list_empty(&wc->w_unwritten_list)) {
+ struct ocfs2_unwritten_extent *ue = NULL;
+
+ ue = list_first_entry(&wc->w_unwritten_list,
+ struct ocfs2_unwritten_extent,
+ ue_node);
+ BUG_ON(ue->ue_cpos != desc->c_cpos);
+ /* The physical address may be 0, fill it. */
+ ue->ue_phys = desc->c_phys;
+
+ list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+ dwc->dw_zero_count++;
+ }
+
+ ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+ BUG_ON(ret != len);
+ ret = 0;
+unlock:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ if (ret < 0)
+ ret = -EIO;
+ return ret;
+}
+
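The physical block computed in ocfs2_dio_get_block() combines the cluster base with the block offset inside the cluster. A worked example of that arithmetic, under an assumed geometry of 4 KiB blocks and 32 KiB clusters (so bpc = 8):

#include <linux/types.h>

/* Worked example of the p_blkno math above; the numbers here are only
 * an assumed geometry, not anything ocfs2 mandates. */
static u64 cluster_block_example(void)
{
	u64 bpc = 8;		/* blocks per cluster */
	u64 phys_cluster = 5;	/* desc->c_phys */
	u64 iblock = 21;	/* logical block within the file */

	/* first block of physical cluster 5: 5 * 8 = 40 */
	u64 p_blkno = phys_cluster * bpc;

	/* block offset inside the cluster: 21 & 7 = 5 */
	p_blkno += iblock & (bpc - 1);

	return p_blkno;		/* 45 */
}
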
+static void ocfs2_dio_end_io_write(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc,
+ loff_t offset,
+ ssize_t bytes)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_unwritten_extent *ue = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = NULL;
+ loff_t end = offset + bytes;
+ int ret = 0, credits = 0, locked = 0;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /* We clear unwritten extents, delete the orphan entry and update
+ * i_size here. If none of these is needed, we can skip all this. */
+ if (list_empty(&dwc->dw_zero_list) &&
+ end <= i_size_read(inode) &&
+ !dwc->dw_orphaned)
+ goto out;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* ocfs2_file_write_iter will hold i_mutex, so there is no need to
+ * take it again if we are in that context. */
+ if (dwc->dw_writer_pid != task_pid_nr(current)) {
+ mutex_lock(&inode->i_mutex);
+ locked = 1;
+ }
+
+ /* Delete the orphan entry before acquiring i_mutex. */
+ if (dwc->dw_orphaned) {
+ BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
+
+ end = end > i_size_read(inode) ? end : 0;
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
+ !!end, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+
+ di = (struct ocfs2_dinode *)di_bh;
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+ &data_ac, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto unlock;
+ }
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto commit;
+ }
+
+ list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_mark_extent_written(inode, &et, handle,
+ ue->ue_cpos, 1,
+ ue->ue_phys,
+ meta_ac, &dealloc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ if (end > i_size_read(inode)) {
+ ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+commit:
+ ocfs2_commit_trans(osb, handle);
+unlock:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ ocfs2_run_deallocs(osb, &dealloc);
+ if (locked)
+ mutex_unlock(&inode->i_mutex);
+ ocfs2_dio_free_write_ctx(inode, dwc);
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+}
+
+/*
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+ loff_t offset,
+ ssize_t bytes,
+ void *private)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ int level;
+
+ /* this io's submitter should not have unlocked this before we could */
+ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+ if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
+ }
+
+ if (private)
+ ocfs2_dio_end_io_write(inode, private, offset, bytes);
+
+ ocfs2_iocb_clear_rw_locked(iocb);
+
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+}
+
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file)->i_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ loff_t end = offset + iter->count;
+ get_block_t *get_block;
+
+ /*
+ * Fall back to buffered I/O if we see an inode without
+ * extents (inline data).
+ */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ /* Fall back to buffered I/O if we do not support append dio. */
+ if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+ return 0;
+
+ if (iov_iter_rw(iter) == READ)
+ get_block = ocfs2_get_block;
+ else
+ get_block = ocfs2_dio_get_block;
+
+ return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, offset, get_block,
+ ocfs2_dio_end_io, NULL, 0);
+}
+
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
.readpages = ocfs2_readpages,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+typedef enum {
+ OCFS2_WRITE_BUFFER = 0,
+ OCFS2_WRITE_DIRECT,
+ OCFS2_WRITE_MMAP,
+} ocfs2_write_type_t;
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
unsigned hr_unclean_stop:1,
hr_aborted_start:1,
hr_item_pinned:1,
- hr_item_dropped:1;
+ hr_item_dropped:1,
+ hr_node_deleted:1;
/* protected by the hr_callback_sem */
struct task_struct *hr_task;
set_user_nice(current, MIN_NICE);
/* Pin node */
- o2nm_depend_this_node();
+ ret = o2nm_depend_this_node();
+ if (ret) {
+ mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
+ reg->hr_node_deleted = 1;
+ wake_up(&o2hb_steady_queue);
+ return 0;
+ }
while (!kthread_should_stop() &&
!reg->hr_unclean_stop && !reg->hr_aborted_start) {
spin_unlock(&o2hb_live_lock);
ret = wait_event_interruptible(o2hb_steady_queue,
- atomic_read(&reg->hr_steady_iterations) == 0);
+ atomic_read(&reg->hr_steady_iterations) == 0 ||
+ reg->hr_node_deleted);
if (ret) {
atomic_set(&reg->hr_steady_iterations, 0);
reg->hr_aborted_start = 1;
goto out3;
}
+ if (reg->hr_node_deleted) {
+ ret = -EINVAL;
+ goto out3;
+ }
+
/* Ok, we were woken. Make sure it wasn't by drop_item() */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
+ u8 old_owner = res->owner;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
status = DLM_DENIED;
goto bail;
}
+
+ if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
+ mlog(0, "last convert request returned DLM_RECOVERING, but "
+ "owner has already queued and sent ast to me. res %.*s, "
+ "(cookie=%u:%llu, type=%d, conv=%d)\n",
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.convert_type);
+ status = DLM_NORMAL;
+ goto bail;
+ }
+
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->convert_pending = 0;
- /* if it failed, move it back to granted queue */
+ /* if it failed, move it back to the granted queue.
+ * if the master returned DLM_NORMAL and then went down before
+ * sending the ast, the lock may already have been moved to the
+ * granted queue; reset status to DLM_RECOVERING and retry the convert */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
+ } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
+ (old_owner != res->owner)) {
+ mlog(0, "res %.*s is in recovering or has been recovered.\n",
+ res->lockname.len, res->lockname.name);
+ status = DLM_RECOVERING;
}
bail:
spin_unlock(&res->spinlock);
dlm_lock_get(lock);
if (lock->convert_pending) {
/* move converting lock back to granted */
- BUG_ON(i != DLM_CONVERTING_LIST);
mlog(0, "node died with convert pending "
"on %.*s. move back to granted list.\n",
res->lockname.len, res->lockname.name);
return ret;
}
-/*
- * Will look for holes and unwritten extents in the range starting at
- * pos for count bytes (inclusive).
- */
-static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
- size_t count)
-{
- int ret = 0;
- unsigned int extent_flags;
- u32 cpos, clusters, extent_len, phys_cpos;
- struct super_block *sb = inode->i_sb;
-
- cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
- clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
-
- while (clusters) {
- ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
- &extent_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = 1;
- break;
- }
-
- if (extent_len > clusters)
- extent_len = clusters;
-
- clusters -= extent_len;
- cpos += extent_len;
- }
-out:
- return ret;
-}
-
static int ocfs2_write_remove_suid(struct inode *inode)
{
int ret;
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos,
- size_t count,
- int appending,
- int *direct_io,
- int *has_refcount)
+ size_t count)
{
int ret = 0, meta_level = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
loff_t end;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* We start with a read level meta lock and only jump to an ex
pos,
count,
&meta_level);
- if (has_refcount)
- *has_refcount = 1;
- if (direct_io)
- *direct_io = 0;
}
if (ret < 0) {
goto out_unlock;
}
- /*
- * Skip the O_DIRECT checks if we don't need
- * them.
- */
- if (!direct_io || !(*direct_io))
- break;
-
- /*
- * There's no sane way to do direct writes to an inode
- * with inline data.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Allowing concurrent direct writes means
- * i_size changes wouldn't be synchronized, so
- * one node could wind up truncating another
- * nodes writes.
- */
- if (end > i_size_read(inode) && !full_coherency) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Fallback to old way if the feature bit is not set.
- */
- if (end > i_size_read(inode) &&
- !ocfs2_supports_append_dio(osb)) {
- *direct_io = 0;
- break;
- }
-
- /*
- * We don't fill holes during direct io, so
- * check for them here. If any are found, the
- * caller will have to retake some cluster
- * locks and initiate the io as buffered.
- */
- ret = ocfs2_check_range_for_holes(inode, pos, count);
- if (ret == 1) {
- /*
- * Fallback to old way if the feature bit is not set.
- * Otherwise try dio first and then complete the rest
- * request through buffer io.
- */
- if (!ocfs2_supports_append_dio(osb))
- *direct_io = 0;
- ret = 0;
- } else if (ret < 0)
- mlog_errno(ret);
break;
}
out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
- pos, appending, count,
- direct_io, has_refcount);
+ pos, count);
if (meta_level >= 0)
ocfs2_inode_unlock(inode, meta_level);
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
- int direct_io, appending, rw_level;
- int can_do_direct, has_refcount = 0;
+ int direct_io, rw_level;
ssize_t written = 0;
ssize_t ret;
- size_t count = iov_iter_count(from), orig_count;
+ size_t count = iov_iter_count(from);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
int unaligned_dio = 0;
- int dropped_dio = 0;
int append_write = ((iocb->ki_pos + count) >=
i_size_read(inode) ? 1 : 0);
if (count == 0)
return 0;
- appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
mutex_lock(&inode->i_mutex);
-relock:
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
ocfs2_inode_unlock(inode, 1);
}
- orig_count = iov_iter_count(from);
ret = generic_write_checks(iocb, from);
if (ret <= 0) {
if (ret)
}
count = ret;
- can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
- &can_do_direct, &has_refcount);
+ ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
if (ret < 0) {
mlog_errno(ret);
goto out;
if (direct_io && !is_sync_kiocb(iocb))
unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
- /*
- * We can't complete the direct I/O as requested, fall back to
- * buffered I/O.
- */
- if (direct_io && !can_do_direct) {
- ocfs2_rw_unlock(inode, rw_level);
-
- rw_level = -1;
-
- direct_io = 0;
- iocb->ki_flags &= ~IOCB_DIRECT;
- iov_iter_reexpand(from, orig_count);
- dropped_dio = 1;
- goto relock;
- }
-
if (unaligned_dio) {
/*
* Wait on previous unaligned aio to complete before
goto no_sync;
if (((file->f_flags & O_DSYNC) && !direct_io) ||
- IS_SYNC(inode) || dropped_dio) {
+ IS_SYNC(inode)) {
ret = filemap_fdatawrite_range(file->f_mapping,
iocb->ki_pos - written,
iocb->ki_pos - 1);
mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
+ mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
+ "Clear inode of %llu, inode has unwritten extents\n",
+ (unsigned long long)oi->ip_blkno);
ocfs2_extent_map_trunc(inode, 0);
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
+ /* Record unwritten extents during direct io. */
+ struct list_head ip_unwritten_list;
+
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
#define OCFS2_INODE_OPEN_DIRECT 0x00000020
/* Tell the inode wipe code it's not in orphan dir */
#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
+/* Entry in orphan dir with 'dio-' prefix */
+#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
/* At this point, we know that no more recovery threads can be
* launched, so wait for any recovery completion work to
* complete. */
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
/*
* Now that recovery is shut down, and the osb is about to be
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
- queue_work(ocfs2_wq, &journal->j_recovery_work);
+ queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
spin_unlock(&journal->j_lock);
}
mutex_lock(&os->os_lock);
ocfs2_queue_orphan_scan(osb);
if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
mutex_unlock(&os->os_lock);
}
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
}
}
struct dir_context ctx;
struct inode *head;
struct ocfs2_super *osb;
+ enum ocfs2_orphan_reco_type orphan_reco_type;
};
static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
if (name_len == 2 && !strncmp("..", name, 2))
return 0;
+ /* do not include dio entries during an orphan scan */
+ if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
+ (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+ OCFS2_DIO_ORPHAN_PREFIX_LEN)))
+ return 0;
+
/* Skip bad inodes so that recovery can continue */
iter = ocfs2_iget(p->osb, ino,
OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
if (IS_ERR(iter))
return 0;
+ if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+ OCFS2_DIO_ORPHAN_PREFIX_LEN))
+ OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY;
+
/* Skip inodes which have already been added to the recovery list, as
* dio may happen concurrently with unlink/rename */
if (OCFS2_I(iter)->ip_next_orphan) {
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
int slot,
- struct inode **head)
+ struct inode **head,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
int status;
struct inode *orphan_dir_inode = NULL;
struct ocfs2_orphan_filldir_priv priv = {
.ctx.actor = ocfs2_orphan_filldir,
.osb = osb,
- .head = *head
+ .head = *head,
+ .orphan_reco_type = orphan_reco_type
};
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
trace_ocfs2_recover_orphans(slot);
ocfs2_mark_recovering_orphan_dir(osb, slot);
- ret = ocfs2_queue_orphans(osb, slot, &inode);
+ ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type);
ocfs2_clear_recovering_orphan_dir(osb, slot);
/* Error here should be noted, but we want to continue with as
iter = oi->ip_next_orphan;
oi->ip_next_orphan = NULL;
- mutex_lock(&inode->i_mutex);
- ret = ocfs2_rw_lock(inode, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto next;
- }
- /*
- * We need to take and drop the inode lock to
- * force read inode from disk.
- */
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret) {
- mlog_errno(ret);
- goto unlock_rw;
- }
+ if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
+ mutex_lock(&inode->i_mutex);
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock_mutex;
+ }
+ /*
+ * We need to take and drop the inode lock to
+ * force read inode from disk.
+ */
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock_rw;
+ }
- di = (struct ocfs2_dinode *)di_bh->b_data;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
- if (inode->i_nlink == 0) {
+ if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) {
+ ret = ocfs2_truncate_file(inode, di_bh,
+ i_size_read(inode));
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto unlock_inode;
+ }
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode,
+ di_bh, 0, 0);
+ if (ret)
+ mlog_errno(ret);
+ }
+unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ di_bh = NULL;
+unlock_rw:
+ ocfs2_rw_unlock(inode, 1);
+unlock_mutex:
+ mutex_unlock(&inode->i_mutex);
+
+ /* clear dio flag in ocfs2_inode_info */
+ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
+ } else {
spin_lock(&oi->ip_lock);
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
spin_unlock(&oi->ip_lock);
}
- if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
- (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
- ret = ocfs2_truncate_file(inode, di_bh,
- i_size_read(inode));
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
- goto unlock_inode;
- }
-
- ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
- if (ret)
- mlog_errno(ret);
- } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
-unlock_inode:
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
-unlock_rw:
- ocfs2_rw_unlock(inode, 1);
-next:
- mutex_unlock(&inode->i_mutex);
iput(inode);
inode = iter;
}
struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
} else {
osb->local_alloc_state = OCFS2_LA_DISABLED;
}
- queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
OCFS2_LA_ENABLE_INTERVAL);
goto out_unlock;
}
if (page->index == last_index)
len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
- &fsdata, di_bh, page);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+ &locked_page, &fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
/* An orphan dir name is an 8 byte value, printed as a hex string */
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
-#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
-#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
return status;
}
- return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
+ if (status < 0) {
+ u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
+ int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
+ inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
+ if (tmp)
+ mlog_errno(tmp);
+ }
+
+ return status;
}
static int ocfs2_mkdir(struct inode *dir,
#ifndef OCFS2_NAMEI_H
#define OCFS2_NAMEI_H
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
+
extern const struct inode_operations ocfs2_dir_iops;
struct dentry *ocfs2_get_parent(struct dentry *child);
struct ocfs2_refcount_tree *osb_ref_tree_lru;
struct mutex system_file_mutex;
+
+ /*
+ * OCFS2 needs to schedule several different types of work which
+ * require cluster locking, disk I/O, recovery waits, etc. Since these
+ * types of work tend to be heavy we avoid using the kernel events
+ * workqueue and schedule on our own.
+ */
+ struct workqueue_struct *ocfs2_wq;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
TRACE_EVENT(ocfs2_prepare_inode_for_write,
TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
- int appending, unsigned long count,
- int *direct_io, int *has_refcount),
- TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+ unsigned long count),
+ TP_ARGS(ino, saved_pos, count),
TP_STRUCT__entry(
__field(unsigned long long, ino)
__field(unsigned long long, saved_pos)
- __field(int, appending)
__field(unsigned long, count)
- __field(int, direct_io)
- __field(int, has_refcount)
),
TP_fast_assign(
__entry->ino = ino;
__entry->saved_pos = saved_pos;
- __entry->appending = appending;
__entry->count = count;
- __entry->direct_io = direct_io ? *direct_io : -1;
- __entry->has_refcount = has_refcount ? *has_refcount : -1;
),
- TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
- __entry->saved_pos, __entry->appending, __entry->count,
- __entry->direct_io, __entry->has_refcount)
+ TP_printk("%llu %llu %lu", __entry->ino,
+ __entry->saved_pos, __entry->count)
);
DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
dqgrab(dquot);
/* First entry on list -> queue work */
if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
- queue_work(ocfs2_wq, &osb->dquot_drop_work);
+ queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
goto out;
}
status = ocfs2_lock_global_qf(oinfo, 1);
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- if (cluster > clusters)
+ if (cluster >= clusters)
break;
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
if (!status) {
- hint = ocfs2_group_from_res(res);
+ if (ocfs2_is_cluster_bitmap(ac->ac_inode))
+ hint = res->sr_bg_blkno;
+ else
+ hint = ocfs2_group_from_res(res);
goto set_hint;
}
if (status < 0 && status != -ENOSPC) {
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
-/* OCFS2 needs to schedule several different types of work which
- * require cluster locking, disk I/O, recovery waits, etc. Since these
- * types of work tend to be heavy we avoid using the kernel events
- * workqueue and schedule on our own. */
-struct workqueue_struct *ocfs2_wq = NULL;
-
static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
if (status < 0)
goto out2;
- ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
- if (!ocfs2_wq) {
- status = -ENOMEM;
- goto out3;
- }
-
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
if (!ocfs2_debugfs_root) {
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
- goto out4;
+ goto out3;
}
ocfs2_set_locking_protocol();
status = register_quota_format(&ocfs2_quota_format);
if (status < 0)
- goto out4;
+ goto out3;
status = register_filesystem(&ocfs2_fs_type);
if (!status)
return 0;
unregister_quota_format(&ocfs2_quota_format);
-out4:
- destroy_workqueue(ocfs2_wq);
- debugfs_remove(ocfs2_debugfs_root);
out3:
+ debugfs_remove(ocfs2_debugfs_root);
ocfs2_free_mem_caches();
out2:
exit_ocfs2_uptodate_cache();
static void __exit ocfs2_exit(void)
{
- if (ocfs2_wq) {
- flush_workqueue(ocfs2_wq);
- destroy_workqueue(ocfs2_wq);
- }
-
unregister_quota_format(&ocfs2_quota_format);
debugfs_remove(ocfs2_debugfs_root);
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
+ INIT_LIST_HEAD(&oi->ip_unwritten_list);
oi->ip_dir_start_lookup = 0;
mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem);
}
cleancache_init_shared_fs(sb);
+ osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+ if (!osb->ocfs2_wq) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ }
+
bail:
return status;
}
{
/* This function assumes that the caller has the main osb resource */
+ /* ocfs2_initialize_super has already created this workqueue */
+ if (osb->ocfs2_wq) {
+ flush_workqueue(osb->ocfs2_wq);
+ destroy_workqueue(osb->ocfs2_wq);
+ }
+
ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
#ifndef OCFS2_SUPER_H
#define OCFS2_SUPER_H
-extern struct workqueue_struct *ocfs2_wq;
-
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
static inline void task_name(struct seq_file *m, struct task_struct *p)
{
char *buf;
+ size_t size;
char tcomm[sizeof(p->comm)];
+ int ret;
get_task_comm(tcomm, p);
seq_puts(m, "Name:\t");
- buf = m->buf + m->count;
- /* Ignore error for now */
- buf += string_escape_str(tcomm, buf, m->size - m->count,
- ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ size = seq_get_buf(m, &buf);
+ ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ seq_commit(m, ret < size ? ret : -1);
- m->count = buf - m->buf;
seq_putc(m, '\n');
}
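
The rewrite above leans on the seq_get_buf()/seq_commit() pairing: take direct access to the unused tail of the seq_file buffer, write into it, then commit the number of bytes actually produced, or -1 to flag an overflow. A minimal sketch of that contract (the helper name is made up):

#include <linux/kernel.h>
#include <linux/seq_file.h>

/* Emit one byte as two hex digits straight into the seq_file buffer. */
static void seq_emit_hex_byte(struct seq_file *m, u8 byte)
{
	char *buf;
	size_t size = seq_get_buf(m, &buf);

	if (size >= 2) {
		buf[0] = hex_asc_hi(byte);
		buf[1] = hex_asc_lo(byte);
		seq_commit(m, 2);	/* two bytes consumed */
	} else {
		seq_commit(m, -1);	/* no room: mark overflow */
	}
}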
* pseudo flags for the well known (anonymous) memory mapped pages
*
* Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapped() is not enough.
+ * simple test in page_mapcount() is not enough.
*/
- if (!PageSlab(page) && page_mapped(page))
+ if (!PageSlab(page) && page_mapcount(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
ptes >> 10,
pmds >> 10,
swap << (PAGE_SHIFT-10));
+ hugetlb_report_usage(m, mm);
}
unsigned long task_vsize(struct mm_struct *mm)
unsigned long anonymous;
unsigned long anonymous_thp;
unsigned long swap;
+ unsigned long shared_hugetlb;
+ unsigned long private_hugetlb;
u64 pss;
u64 swap_pss;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- unsigned long size, bool young, bool dirty)
+ bool compound, bool young, bool dirty)
{
- int mapcount;
+ int i, nr = compound ? HPAGE_PMD_NR : 1;
+ unsigned long size = nr * PAGE_SIZE;
if (PageAnon(page))
mss->anonymous += size;
/* Accumulate the size in pages that have been accessed. */
if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
- mapcount = page_mapcount(page);
- if (mapcount >= 2) {
- u64 pss_delta;
- if (dirty || PageDirty(page))
- mss->shared_dirty += size;
- else
- mss->shared_clean += size;
- pss_delta = (u64)size << PSS_SHIFT;
- do_div(pss_delta, mapcount);
- mss->pss += pss_delta;
- } else {
+ /*
+ * page_count(page) == 1 guarantees the page is mapped exactly once.
+ * If any subpage of the compound page were mapped with a PTE, it
+ * would elevate page_count().
+ */
+ if (page_count(page) == 1) {
if (dirty || PageDirty(page))
mss->private_dirty += size;
else
mss->private_clean += size;
mss->pss += (u64)size << PSS_SHIFT;
+ return;
+ }
+
+ for (i = 0; i < nr; i++, page++) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2) {
+ if (dirty || PageDirty(page))
+ mss->shared_dirty += PAGE_SIZE;
+ else
+ mss->shared_clean += PAGE_SIZE;
+ mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+ } else {
+ if (dirty || PageDirty(page))
+ mss->private_dirty += PAGE_SIZE;
+ else
+ mss->private_clean += PAGE_SIZE;
+ mss->pss += PAGE_SIZE << PSS_SHIFT;
+ }
}
}
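
The PSS accounting above divides each page's size by its mapcount, keeping PSS_SHIFT bits of fraction until the final report. For instance, a 4 KiB page shared by three processes contributes roughly 1365 bytes to each of their PSS totals, as this small sketch shows:

#include <linux/types.h>

#define PSS_SHIFT_EX 12	/* mirrors PSS_SHIFT in fs/proc/task_mmu.c */

static u64 pss_example(void)
{
	u64 pss = 0;
	int mapcount = 3;	/* page shared by three processes */

	/* accumulate with PSS_SHIFT bits of fractional precision */
	pss += ((u64)4096 << PSS_SHIFT_EX) / mapcount;

	/* reported value: 4096 / 3 = 1365 bytes for this process */
	return pss >> PSS_SHIFT_EX;
}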
if (!page)
return;
- smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (IS_ERR_OR_NULL(page))
return;
mss->anonymous_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, HPAGE_PMD_SIZE,
- pmd_young(*pmd), pmd_dirty(*pmd));
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
return 0;
seq_putc(m, '\n');
}
+#ifdef CONFIG_HUGETLB_PAGE
+static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page = NULL;
+
+ if (pte_present(*pte)) {
+ page = vm_normal_page(vma, addr, *pte);
+ } else if (is_swap_pte(*pte)) {
+ swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+ if (is_migration_entry(swpent))
+ page = migration_entry_to_page(swpent);
+ }
+ if (page) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2)
+ mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+ else
+ mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+ }
+ return 0;
+}
+#endif /* HUGETLB_PAGE */
+
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
+#ifdef CONFIG_HUGETLB_PAGE
+ .hugetlb_entry = smaps_hugetlb_range,
+#endif
.mm = vma->vm_mm,
.private = &mss,
};
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
+ "Shared_Hugetlb: %8lu kB\n"
+ "Private_Hugetlb: %8lu kB\n"
"Swap: %8lu kB\n"
"SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
mss.referenced >> 10,
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
+ mss.shared_hugetlb >> 10,
+ mss.private_hugetlb >> 10,
mss.swap >> 10,
(unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
pte_t ptent = *pte;
if (pte_present(ptent)) {
+ ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
ptent = pte_wrprotect(ptent);
ptent = pte_clear_soft_dirty(ptent);
+ ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
}
-
- set_pte_at(vma->vm_mm, addr, pte, ptent);
}
+#else
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+}
+#endif
+#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = *pmdp;
+ pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
-
#else
-
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte)
-{
-}
-
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
spinlock_t *ptl;
struct page *page;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty_pmd(vma, addr, pmd);
goto out;
int err = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
pte_t *orig_pte;
pte_t *pte;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
#include <linux/cred.h>
#include <linux/mm.h>
#include <linux/printk.h>
+#include <linux/string_helpers.h>
#include <asm/uaccess.h>
#include <asm/page.h>
static void *seq_buf_alloc(unsigned long size)
{
void *buf;
+ gfp_t gfp = GFP_KERNEL;
/*
- * __GFP_NORETRY to avoid oom-killings with high-order allocations -
- * it's better to fall back to vmalloc() than to kill things.
+ * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
+ * it's better to fall back to vmalloc() than to kill things. For small
+ * allocations, just use GFP_KERNEL which will oom kill, thus no need
+ * for vmalloc fallback.
*/
- buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+ if (size > PAGE_SIZE)
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ buf = kmalloc(size, gfp);
if (!buf && size > PAGE_SIZE)
buf = vmalloc(size);
return buf;
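
A buffer obtained this way may come from either allocator, so the matching free path has to tell the two apart; the kernel's kvfree() wraps exactly this test. A sketch of the counterpart:

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Free a buffer that may have come from kmalloc() or vmalloc(). */
static void seq_buf_free(const void *buf)
{
	if (is_vmalloc_addr(buf))
		vfree(buf);
	else
		kfree(buf);
}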
*/
void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
- char *end = m->buf + m->size;
- char *p;
- char c;
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int ret;
- for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
- if (!strchr(esc, c)) {
- *p++ = c;
- continue;
- }
- if (p + 3 < end) {
- *p++ = '\\';
- *p++ = '0' + ((c & 0300) >> 6);
- *p++ = '0' + ((c & 070) >> 3);
- *p++ = '0' + (c & 07);
- continue;
- }
- seq_set_overflow(m);
- return;
- }
- m->count = p - m->buf;
+ ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc);
+ seq_commit(m, ret < size ? ret : -1);
}
EXPORT_SYMBOL(seq_escape);
{
const u8 *ptr = buf;
int i, linelen, remaining = len;
+ char *buffer;
+ size_t size;
int ret;
if (rowsize != 16 && rowsize != 32)
break;
}
+ size = seq_get_buf(m, &buffer);
ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
- m->buf + m->count, m->size - m->count,
- ascii);
- if (ret >= m->size - m->count) {
- seq_set_overflow(m);
- } else {
- m->count += ret;
- seq_putc(m, '\n');
- }
+ buffer, size, ascii);
+ seq_commit(m, ret < size ? ret : -1);
+
+ seq_putc(m, '\n');
}
}
EXPORT_SYMBOL(seq_hex_dump);
break;
error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(error)) {
page_cache_release(page);
if (error == -EEXIST)
static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
{
- filemap_fdatawait(bdev->bd_inode->i_mapping);
+ /*
+ * We keep the error status of individual mapping so that
+ * applications can catch the writeback error using fsync(2).
+ * See filemap_fdatawait_keep_errors() for details.
+ */
+ filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
}
/*
unsigned long freed;
int error;
- if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
return 0;
INIT_LIST_HEAD(&isol.buffers);
return NULL;
cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
- debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, flag);
return cpu_addr;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
-#endif
-
#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
{
return 0;
}
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return 0;
-}
#ifndef __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
struct drm_sg_mem *sg; /**< Scatter gather memory */
unsigned int num_crtcs; /**< Number of CRTCs on this device */
- sigset_t sigmask;
struct {
int context;
* sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit
* @value: value to sign extend
* @index: 0 based bit index (0<=index<32) to sign bit
+ *
+ * This is safe to use for 16- and 8-bit types as well.
*/
static inline __s32 sign_extend32(__u32 value, int index)
{
return (__s32)(value << shift) >> shift;
}
+/**
+ * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit
+ * @value: value to sign extend
+ * @index: 0 based bit index (0<=index<64) to sign bit
+ */
+static inline __s64 sign_extend64(__u64 value, int index)
+{
+ __u8 shift = 63 - index;
+ return (__s64)(value << shift) >> shift;
+}
+
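As with its 32-bit sibling, the usual application is decoding a narrow two's-complement field out of a wider register. An illustrative use, treating bit 11 as the sign bit of a 12-bit field:

#include <linux/bitops.h>
#include <linux/types.h>

/* Decode a 12-bit two's-complement field from a raw register value.
 * sign_extend64(0x800, 11) == -2048; sign_extend64(0x7ff, 11) == 2047. */
static s64 decode_12bit_field(u64 raw)
{
	return sign_extend64(raw & 0xfff, 11);
}
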
static inline unsigned fls_long(unsigned long l)
{
if (sizeof(l) == 4)
/* For more detailed tracepoint output */
#define COMPACT_NO_SUITABLE_PAGE 5
#define COMPACT_NOT_SUITABLE_ZONE 6
-/* When adding new state, please change compaction_status_string, too */
+#define COMPACT_CONTENDED 7
+/* When adding new states, please adjust include/trace/events/compaction.h */
/* Used to signal whether compaction detected need_sched() or lock contention */
/* No contention detected */
#if GCC_VERSION >= 40600
/*
- * Tell the optimizer that something else uses this function or variable.
+ * When used with Link Time Optimization, gcc can optimize away C functions or
+ * variables which are referenced only from assembly code. __visible tells the
+ * optimizer that something else uses this function or variable, thus preventing
+ * this.
*/
#define __visible __attribute__((externally_visible))
#endif
+
+#if GCC_VERSION >= 40900 && !defined(__CHECKER__)
+/*
+ * __assume_aligned(n, k): Tell the optimizer that the returned
+ * pointer can be assumed to be k modulo n. The second argument is
+ * optional (default 0), so we use a variadic macro to make the
+ * shorthand.
+ *
+ * Beware: Do not apply this to functions which may return
+ * ERR_PTRs. Also, it is probably unwise to apply it to functions
+ * returning extra information in the low bits (but in that case the
+ * compiler should see some alignment anyway, when the return value is
+ * massaged by 'flags = ptr & 3; ptr &= ~3;').
+ */
+#define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
+#endif
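
A hypothetical application of the attribute, promising the optimizer that an allocator only hands out 64-byte-aligned pointers (both function names below are illustrative):

/* Returned pointers are 0 modulo 64. */
void *alloc_cacheline_aligned(unsigned long size) __assume_aligned(64);

/* Returned pointers are 16 modulo 64, per the optional second argument. */
void *alloc_offset_aligned(unsigned long size) __assume_aligned(64, 16);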
+
/*
* GCC 'asm goto' miscompiles certain code sequences:
*
#define __visible
#endif
+/*
+ * Assume alignment of return value.
+ */
+#ifndef __assume_aligned
+#define __assume_aligned(a, ...)
+#endif
+
+
/* Are two types/vars the same type (ignoring qualifiers)? */
#ifndef __same_type
# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
extern void rebuild_sched_domains(void);
-extern void cpuset_print_task_mems_allowed(struct task_struct *p);
+extern void cpuset_print_current_mems_allowed(void);
/*
* read_mems_allowed_begin is required when making decisions involving
*/
static inline unsigned int read_mems_allowed_begin(void)
{
+ if (!cpusets_enabled())
+ return 0;
+
return read_seqcount_begin(&current->mems_allowed_seq);
}
*/
static inline bool read_mems_allowed_retry(unsigned int seq)
{
+ if (!cpusets_enabled())
+ return false;
+
return read_seqcount_retry(&current->mems_allowed_seq, seq);
}
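
The intended read-side pattern for this pair is a retry loop: sample the sequence count, read mems_allowed, and retry if a concurrent update slipped in. A sketch:

#include <linux/cpuset.h>
#include <linux/nodemask.h>
#include <linux/sched.h>

/* Check whether nid is in current->mems_allowed, retrying if the
 * nodemask changes underneath us. */
static bool node_currently_allowed(int nid)
{
	unsigned int seq;
	bool allowed;

	do {
		seq = read_mems_allowed_begin();
		allowed = node_isset(nid, current->mems_allowed);
	} while (read_mems_allowed_retry(seq));

	return allowed;
}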
partition_sched_domains(1, NULL, NULL);
}
-static inline void cpuset_print_task_mems_allowed(struct task_struct *p)
+static inline void cpuset_print_current_mems_allowed(void)
{
}
--- /dev/null
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Return the initial CRC64 ECMA seed value.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
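
Intended use of the interface is seed-then-compute; the include path below assumes the header lands as linux/crc64_ecma.h:

#include <linux/crc64_ecma.h>
#include <linux/types.h>

/* Checksum a buffer in one shot with the default seed. */
static u64 checksum_buffer(const u8 *data, u32 len)
{
	u64 seed = crc64_ecma_seed();

	return crc64_ecma(data, len, seed);
}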
int nelems, int dir);
extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt);
+ dma_addr_t dma_addr, void *virt,
+ gfp_t flags);
extern void debug_dma_free_coherent(struct device *dev, size_t size,
void *virt, dma_addr_t addr);
}
static inline void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ gfp_t flags)
{
}
#ifndef _LINUX_DMA_MAPPING_H
#define _LINUX_DMA_MAPPING_H
+#include <linux/sizes.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/err.h>
static inline unsigned int dma_get_max_seg_size(struct device *dev)
{
- return dev->dma_parms ? dev->dma_parms->max_segment_size : 65536;
+ if (dev->dma_parms && dev->dma_parms->max_segment_size)
+ return dev->dma_parms->max_segment_size;
+ return SZ_64K;
}
static inline unsigned int dma_set_max_seg_size(struct device *dev,
if (dev->dma_parms) {
dev->dma_parms->max_segment_size = size;
return 0;
- } else
- return -EIO;
+ }
+ return -EIO;
}
static inline unsigned long dma_get_seg_boundary(struct device *dev)
{
- return dev->dma_parms ?
- dev->dma_parms->segment_boundary_mask : 0xffffffff;
+ if (dev->dma_parms && dev->dma_parms->segment_boundary_mask)
+ return dev->dma_parms->segment_boundary_mask;
+ return DMA_BIT_MASK(32);
}
static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
if (dev->dma_parms) {
dev->dma_parms->segment_boundary_mask = mask;
return 0;
- } else
- return -EIO;
+ }
+ return -EIO;
}
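
Both setters fail with -EIO unless the driver has supplied the dma_parms storage first, which is why the defaults above matter. A sketch of the driver side (names are illustrative):

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/sizes.h>

/* parms is typically embedded in the driver's private state. */
static int my_driver_init_dma(struct device *dev,
			      struct device_dma_parameters *parms)
{
	int ret;

	dev->dma_parms = parms;

	ret = dma_set_max_seg_size(dev, SZ_1M);
	if (ret)
		return ret;

	/* segments must not cross a 4 GiB boundary */
	return dma_set_seg_boundary(dev, 0xffffffffUL);
}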
#ifndef dma_max_pfn
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
+extern void filemap_fdatawait_keep_errors(struct address_space *);
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
loff_t lend);
extern int filemap_write_and_wait(struct address_space *mapping);
#define ___GFP_HIGHMEM 0x02u
#define ___GFP_DMA32 0x04u
#define ___GFP_MOVABLE 0x08u
-#define ___GFP_WAIT 0x10u
+#define ___GFP_RECLAIMABLE 0x10u
#define ___GFP_HIGH 0x20u
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
#define ___GFP_NOMEMALLOC 0x10000u
#define ___GFP_HARDWALL 0x20000u
#define ___GFP_THISNODE 0x40000u
-#define ___GFP_RECLAIMABLE 0x80000u
+#define ___GFP_ATOMIC 0x80000u
#define ___GFP_NOACCOUNT 0x100000u
#define ___GFP_NOTRACK 0x200000u
-#define ___GFP_NO_KSWAPD 0x400000u
+#define ___GFP_DIRECT_RECLAIM 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
#define ___GFP_WRITE 0x1000000u
+#define ___GFP_KSWAPD_RECLAIM 0x2000000u
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
/*
- * GFP bitmasks..
- *
- * Zone modifiers (see linux/mmzone.h - low three bits)
+ * Physical address zone modifiers (see linux/mmzone.h - low four bits)
*
* Do not put any conditional on these. If necessary modify the definitions
* without the underscores and use them consistently. The definitions here may
#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
-#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* Page is movable */
+#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
+
+/*
+ * Page mobility and placement hints
+ *
+ * These flags provide hints about how mobile the page is. Pages with similar
+ * mobility are placed within the same pageblocks to minimise problems due
+ * to external fragmentation.
+ *
+ * __GFP_MOVABLE (also a zone modifier) indicates that the page can be
+ * moved by page migration during memory compaction or can be reclaimed.
+ *
+ * __GFP_RECLAIMABLE is used for slab allocations that specify
+ * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
+ *
+ * __GFP_WRITE indicates the caller intends to dirty the page. Where possible,
+ * these pages will be spread between local zones to avoid all the dirty
+ * pages being in one zone (fair zone allocation policy).
+ *
+ * __GFP_HARDWALL enforces the cpuset memory allocation policy.
+ *
+ * __GFP_THISNODE forces the allocation to be satisified from the requested
+ * node with no fallbacks or placement policy enforcements.
+ */
+#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
+#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE)
+#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL)
+#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)
+
/*
- * Action modifiers - doesn't change the zoning
+ * Watermark modifiers -- controls access to emergency reserves
+ *
+ * __GFP_HIGH indicates that the caller is high-priority and that granting
+ * the request is necessary before the system can make forward progress.
+ * For example, creating an IO context to clean pages.
+ *
+ * __GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
+ * high priority. Users are typically interrupt handlers. This may be
+ * used in conjunction with __GFP_HIGH.
+ *
+ * __GFP_MEMALLOC allows access to all memory. This should only be used when
+ * the caller guarantees the allocation will allow more memory to be freed
+ * very shortly e.g. process exiting or swapping. Users should either
+ * be the MM or co-ordinate closely with the VM (e.g. swap over NFS).
+ *
+ * __GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
+ * This takes precedence over the __GFP_MEMALLOC flag if both are set.
+ *
+ * __GFP_NOACCOUNT ignores the accounting for kmemcg limit enforcement.
+ */
+#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC)
+#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH)
+#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
+#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
+#define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT)
+
+/*
+ * Reclaim modifiers
+ *
+ * __GFP_IO can start physical IO.
+ *
+ * __GFP_FS can call down to the low-level FS. Clearing the flag avoids the
+ * allocator recursing into the filesystem which might already be holding
+ * locks.
+ *
+ * __GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
+ * This flag can be cleared to avoid unnecessary delays when a fallback
+ * option is available.
+ *
+ * __GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
+ * the low watermark is reached and have it reclaim pages until the high
+ * watermark is reached. A caller may wish to clear this flag when fallback
+ * options are available and the reclaim is likely to disrupt the system. The
+ * canonical example is THP allocation where a fallback is cheap but
+ * reclaim/compaction may cause indirect stalls.
+ *
+ * __GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
*
* __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt
- * _might_ fail. This depends upon the particular VM implementation.
+ * _might_ fail. This depends upon the particular VM implementation.
*
* __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
- * cannot handle allocation failures. New users should be evaluated carefully
- * (and the flag should be used only when there is no reasonable failure policy)
- * but it is definitely preferable to use the flag rather than opencode endless
- * loop around allocator.
+ * cannot handle allocation failures. New users should be evaluated carefully
+ * (and the flag should be used only when there is no reasonable failure
+ * policy) but it is definitely preferable to use the flag rather than
+ * opencode endless loop around allocator.
*
* __GFP_NORETRY: The VM implementation must not retry indefinitely and will
- * return NULL when direct reclaim and memory compaction have failed to allow
- * the allocation to succeed. The OOM killer is not called with the current
- * implementation.
- *
- * __GFP_MOVABLE: Flag that this page will be movable by the page migration
- * mechanism or reclaimed
+ * return NULL when direct reclaim and memory compaction have failed to allow
+ * the allocation to succeed. The OOM killer is not called with the current
+ * implementation.
*/
-#define __GFP_WAIT ((__force gfp_t)___GFP_WAIT) /* Can wait and reschedule? */
-#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) /* Should access emergency pools? */
-#define __GFP_IO ((__force gfp_t)___GFP_IO) /* Can start physical IO? */
-#define __GFP_FS ((__force gfp_t)___GFP_FS) /* Can call down to low-level FS? */
-#define __GFP_COLD ((__force gfp_t)___GFP_COLD) /* Cache-cold page required */
-#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) /* Suppress page allocation failure warning */
-#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */
-#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */
-#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */
-#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */
-#define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */
-#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */
-#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves.
- * This takes precedence over the
- * __GFP_MEMALLOC flag if both are
- * set
- */
-#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
-#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
-#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
-#define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT) /* Don't account to kmemcg */
-#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */
-
-#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
-#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
-#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
+#define __GFP_IO ((__force gfp_t)___GFP_IO)
+#define __GFP_FS ((__force gfp_t)___GFP_FS)
+#define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
+#define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
+#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
+#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT)
+#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL)
+#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY)
/*
- * This may seem redundant, but it's a way of annotating false positives vs.
- * allocations that simply cannot be supported (e.g. page tables).
+ * Action modifiers
+ *
+ * __GFP_COLD indicates that the caller does not expect the page to be used
+ * in the near future. Where possible, a cache-cold page will be returned.
+ *
+ * __GFP_NOWARN suppresses allocation failure reports.
+ *
+ * __GFP_COMP adds compound page metadata.
+ *
+ * __GFP_ZERO returns a zeroed page on success.
+ *
+ * __GFP_NOTRACK avoids tracking with kmemcheck.
+ *
+ * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of
+ * distinguishing in the source between false positives and allocations that
+ * cannot be supported (e.g. page tables).
+ *
+ * __GFP_OTHER_NODE is for allocations that are on a remote node but that
+ * should not be accounted for as a remote allocation in vmstat. A
+ * typical user would be khugepaged collapsing a huge page on a remote
+ * node.
*/
+#define __GFP_COLD ((__force gfp_t)___GFP_COLD)
+#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN)
+#define __GFP_COMP ((__force gfp_t)___GFP_COMP)
+#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO)
+#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK)
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
+#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE)
-#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
+/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 26
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
-/* This equals 0, but use constants in case they ever change */
-#define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH)
-/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
-#define GFP_ATOMIC (__GFP_HIGH)
-#define GFP_NOIO (__GFP_WAIT)
-#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
-#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \
+/*
+ * Commonly used GFP flag combinations. It is recommended that subsystems
+ * start with one of these combinations and then set/clear __GFP_FOO flags
+ * as necessary.
+ *
+ * GFP_ATOMIC users cannot sleep and need the allocation to succeed. A lower
+ * watermark is applied to allow access to "atomic reserves".
+ *
+ * GFP_KERNEL is typical for kernel-internal allocations. The caller requires
+ * ZONE_NORMAL or a lower zone for direct access but can enter direct reclaim.
+ *
+ * GFP_NOWAIT is for kernel allocations that should not stall for direct
+ * reclaim, start physical IO or use any filesystem callback.
+ *
+ * GFP_NOIO will use direct reclaim to discard clean pages or slab pages
+ * that do not require the starting of any physical IO.
+ *
+ * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
+ *
+ * GFP_USER is for userspace allocations that also need to be directly
+ * accessible by the kernel or hardware. It is typically used by hardware
+ * for buffers that are mapped to userspace (e.g. graphics) that the hardware
+ * still must DMA to. cpuset limits are enforced for these allocations.
+ *
+ * GFP_DMA exists for historical reasons and should be avoided where possible.
+ * The flag indicates that the caller requires that the lowest zone be
+ * used (ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
+ * it would require careful auditing as some users really require it and
+ * others use the flag to avoid lowmem reserves in ZONE_DMA and treat the
+ * lowest zone as a type of emergency reserve.
+ *
+ * GFP_DMA32 is similar to GFP_DMA except that the caller requires a 32-bit
+ * address.
+ *
+ * GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
+ * do not need to be directly accessible by the kernel but that cannot
+ * move once in use. An example may be a hardware allocation that maps
+ * data directly into userspace but has no addressing limitations.
+ *
+ * GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
+ * need direct access to but can use kmap() when access is required. They
+ * are expected to be movable via page reclaim or page migration. Typically,
+ * pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE.
+ *
+ * GFP_TRANSHUGE is used for THP allocations. They are compound allocations
+ * that will fail quickly if memory is not available and will not wake
+ * kswapd on failure.
+ */
+#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
+#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
+#define GFP_NOIO (__GFP_RECLAIM)
+#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO)
+#define GFP_TEMPORARY (__GFP_RECLAIM | __GFP_IO | __GFP_FS | \
__GFP_RECLAIMABLE)
-#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
+#define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
+#define GFP_DMA __GFP_DMA
+#define GFP_DMA32 __GFP_DMA32
#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE)
-#define GFP_IOFS (__GFP_IO | __GFP_FS)
-#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
- __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
- __GFP_NO_KSWAPD)
+#define GFP_IOFS (__GFP_IO | __GFP_FS | __GFP_KSWAPD_RECLAIM)
+#define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \
+ ~__GFP_KSWAPD_RECLAIM)
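
A hedged sketch of the recommendation above: start from a standard
combination and adjust individual __GFP_FOO bits. The foo_alloc() helper is
hypothetical, not from the patch.

	#include <linux/slab.h>

	static void *foo_alloc(size_t len, bool in_fs_path)
	{
		gfp_t gfp = GFP_KERNEL;

		if (in_fs_path)
			gfp &= ~__GFP_FS;	/* == GFP_NOFS */
		/* caller has a fallback, so fail quietly */
		return kmalloc(len, gfp | __GFP_NOWARN);
	}
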
-/* This mask makes up all the page movable related flags */
+/* Convert GFP flags to their corresponding migrate type */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
+#define GFP_MOVABLE_SHIFT 3
-/* Control page allocator reclaim behavior */
-#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
- __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
- __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
-
-/* Control slab gfp mask during early boot */
-#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))
-
-/* Control allocation constraints */
-#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
-
-/* Do not use these with a slab allocator */
-#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
-
-/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
- platforms, used as appropriate on others */
-
-#define GFP_DMA __GFP_DMA
-
-/* 4GB DMA on some platforms */
-#define GFP_DMA32 __GFP_DMA32
-
-/* Convert GFP flags to their corresponding migrate type */
static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
{
- WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
+ VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
+ BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
+ BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
if (unlikely(page_group_by_mobility_disabled))
return MIGRATE_UNMOVABLE;
/* Group based on mobility */
- return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
- ((gfp_flags & __GFP_RECLAIMABLE) != 0);
+ return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
+}
+#undef GFP_MOVABLE_MASK
+#undef GFP_MOVABLE_SHIFT
+
+static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
+{
+ return gfp_flags & __GFP_DIRECT_RECLAIM;
}
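
A sketch (not from the patch) of the intended use: a helper callable from
both process and atomic context checks the caller-supplied mask before doing
anything that can sleep. foo_get_buf() is hypothetical.

	#include <linux/slab.h>

	static void *foo_get_buf(size_t len, gfp_t gfp)
	{
		/* true only when __GFP_DIRECT_RECLAIM is set, e.g. GFP_KERNEL */
		if (gfpflags_allow_blocking(gfp))
			might_sleep();
		return kmalloc(len, gfp);
	}
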
#ifdef CONFIG_HIGHMEM
unsigned long addr,
pmd_t *pmd,
unsigned int flags);
+extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr);
extern int zap_huge_pmd(struct mmu_gather *tlb,
struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr);
extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec);
-extern int move_huge_pmd(struct vm_area_struct *vma,
+extern bool move_huge_pmd(struct vm_area_struct *vma,
struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
#endif
};
-enum page_check_address_pmd_flag {
- PAGE_CHECK_ADDRESS_PMD_FLAG,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
-};
extern pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag,
spinlock_t **ptl);
+extern int pmd_freeable(pmd_t pmd);
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
#endif /* CONFIG_DEBUG_VM */
extern unsigned long transparent_hugepage_flags;
-extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+
+extern void prep_transhuge_page(struct page *page);
+extern void free_transhuge_page(struct page *page);
+
+int split_huge_page_to_list(struct page *page, struct list_head *list);
static inline int split_huge_page(struct page *page)
{
return split_huge_page_to_list(page, NULL);
}
-extern void __split_huge_page_pmd(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd);
-#define split_huge_page_pmd(__vma, __address, __pmd) \
+void deferred_split_huge_page(struct page *page);
+
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address);
+
+#define split_huge_pmd(__vma, __pmd, __address) \
do { \
pmd_t *____pmd = (__pmd); \
- if (unlikely(pmd_trans_huge(*____pmd))) \
- __split_huge_page_pmd(__vma, __address, \
- ____pmd); \
+ if (pmd_trans_huge(*____pmd)) \
+ __split_huge_pmd(__vma, __pmd, __address); \
} while (0)
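
An illustrative caller (editorial sketch; foo_fixup_pmd() is hypothetical):
split any huge PMD first so the subsequent walk only ever sees page-table
pages.

	static void foo_fixup_pmd(struct vm_area_struct *vma, pmd_t *pmd,
				  unsigned long addr)
	{
		split_huge_pmd(vma, pmd, addr);	/* no-op unless pmd_trans_huge() */
		/* ... now safe to pte_offset_map_lock() and walk the ptes ... */
	}
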
-#define wait_split_huge_page(__anon_vma, __pmd) \
- do { \
- pmd_t *____pmd = (__pmd); \
- anon_vma_lock_write(__anon_vma); \
- anon_vma_unlock_write(__anon_vma); \
- BUG_ON(pmd_trans_splitting(*____pmd) || \
- pmd_trans_huge(*____pmd)); \
- } while (0)
-extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd);
+
#if HPAGE_PMD_ORDER >= MAX_ORDER
#error "hugepages can't be allocated by the buddy allocator"
#endif
unsigned long start,
unsigned long end,
long adjust_next);
-extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl);
/* mmap_sem must be held on entry */
-static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
if (pmd_trans_huge(*pmd))
return __pmd_trans_huge_lock(pmd, vma, ptl);
else
- return 0;
+ return false;
}
static inline int hpage_nr_pages(struct page *page)
{
{
return 0;
}
-#define split_huge_page_pmd(__vma, __address, __pmd) \
- do { } while (0)
-#define wait_split_huge_page(__anon_vma, __pmd) \
- do { } while (0)
-#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
+static inline void deferred_split_huge_page(struct page *page) {}
+#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
static inline int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
long adjust_next)
{
}
-static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
- return 0;
+ return false;
}
static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);
+/*
+ * hugetlb_falloc is used to prevent page faults during fallocate hole punch
+ * operations. During hole punch, inode->i_private points to this struct.
+ */
+struct hugetlb_falloc {
+ wait_queue_head_t *waitq; /* Page faults waiting on hole punch */
+ pgoff_t start; /* Start of fallocate hole */
+ pgoff_t end; /* End of fallocate hole */
+};
+
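
A hedged sketch of the consumer described in the comment above: a hugetlbfs
fault that lands inside an in-progress hole punch waits for it to finish.
Illustrative only; the locking that guards i_private is elided here.

	struct hugetlb_falloc *hf = inode->i_private;

	if (hf && idx >= hf->start && idx < hf->end)
		/* the hole-punch path clears i_private when it completes */
		wait_event(*hf->waitq, inode->i_private != hf);
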
extern spinlock_t hugetlb_lock;
extern int hugetlb_max_hstate __read_mostly;
#define for_each_hstate(h) \
#define hugepages_supported() (HPAGE_SHIFT != 0)
#endif
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);
+
+static inline void hugetlb_count_add(long l, struct mm_struct *mm)
+{
+ atomic_long_add(l, &mm->hugetlb_usage);
+}
+
+static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
+{
+ atomic_long_sub(l, &mm->hugetlb_usage);
+}
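
The intended call sites (a sketch, not part of the patch) bracket map and
unmap of a huge page, where h is the page's hstate:

	hugetlb_count_add(pages_per_huge_page(h), vma->vm_mm);	/* on map */
	hugetlb_count_sub(pages_per_huge_page(h), vma->vm_mm);	/* on unmap */
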
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
#define alloc_huge_page(v, a, r) NULL
{
return &mm->page_table_lock;
}
+
+static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
+{
+}
+
+static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
+{
+}
#endif /* CONFIG_HUGETLB_PAGE */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
return NULL;
- return (struct hugetlb_cgroup *)page[2].lru.next;
+ return (struct hugetlb_cgroup *)page[2].private;
}
static inline
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
return -1;
- page[2].lru.next = (void *)h_cg;
+ page[2].private = (unsigned long)h_cg;
return 0;
}
char *kasprintf(gfp_t gfp, const char *fmt, ...);
extern __printf(2, 0)
char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
+extern __printf(2, 0)
+const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args);
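
A usage sketch (editorial; foo_set_name() is hypothetical). Assuming
kvasprintf_const() may hand back constant storage the way kstrdup_const()
does, the result must be released with kfree_const() rather than kfree().

	static const char *foo_set_name(gfp_t gfp, const char *fmt, ...)
	{
		const char *name;
		va_list ap;

		va_start(ap, fmt);
		name = kvasprintf_const(gfp, fmt, ap);
		va_end(ap);
		return name;		/* free with kfree_const() */
	}
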
extern __scanf(2, 3)
int sscanf(const char *, const char *, ...);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
phys_addr_t base, phys_addr_t size,
int nid, unsigned long flags);
-int memblock_remove_range(struct memblock_type *type,
- phys_addr_t base,
- phys_addr_t size);
-
void __next_mem_range(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp);
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound);
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare);
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
+ bool lrucare, bool compound);
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+ bool compound);
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
- bool lrucare);
+void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage);
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
return mz->lru_size[lru];
}
-static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
unsigned long inactive_ratio;
unsigned long inactive;
return inactive * inactive_ratio < active;
}
+void mem_cgroup_handle_over_high(void);
+
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
static inline void mem_cgroup_oom_enable(void)
{
- WARN_ON(current->memcg_oom.may_oom);
- current->memcg_oom.may_oom = 1;
+ WARN_ON(current->memcg_may_oom);
+ current->memcg_may_oom = 1;
}
static inline void mem_cgroup_oom_disable(void)
{
- WARN_ON(!current->memcg_oom.may_oom);
- current->memcg_oom.may_oom = 0;
+ WARN_ON(!current->memcg_may_oom);
+ current->memcg_may_oom = 0;
}
static inline bool task_in_memcg_oom(struct task_struct *p)
{
- return p->memcg_oom.memcg;
+ return p->memcg_in_oom;
}
bool mem_cgroup_oom_synchronize(bool wait);
static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask,
- struct mem_cgroup **memcgp)
+ struct mem_cgroup **memcgp,
+ bool compound)
{
*memcgp = NULL;
return 0;
static inline void mem_cgroup_commit_charge(struct page *page,
struct mem_cgroup *memcg,
- bool lrucare)
+ bool lrucare, bool compound)
{
}
static inline void mem_cgroup_cancel_charge(struct page *page,
- struct mem_cgroup *memcg)
+ struct mem_cgroup *memcg,
+ bool compound)
{
}
{
}
-static inline void mem_cgroup_migrate(struct page *oldpage,
- struct page *newpage,
- bool lrucare)
+static inline void mem_cgroup_replace_page(struct page *old, struct page *new)
{
}
return true;
}
-static inline int
+static inline bool
mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
- return 1;
+ return true;
}
static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
{
}
+static inline void mem_cgroup_handle_over_high(void)
+{
+}
+
static inline void mem_cgroup_oom_enable(void)
{
}
* conditions, but because they are pretty simple, they are expected to be
* fast.
*/
-bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
- int order);
-void __memcg_kmem_commit_charge(struct page *page,
- struct mem_cgroup *memcg, int order);
-void __memcg_kmem_uncharge_pages(struct page *page, int order);
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+ struct mem_cgroup *memcg);
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
+void __memcg_kmem_uncharge(struct page *page, int order);
/*
 * helper for accessing a memcg's index. It will be used as an index in the
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
-struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr);
-
-int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned long nr_pages);
-void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
-
-/**
- * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
- * @gfp: the gfp allocation flags.
- * @memcg: a pointer to the memcg this was charged against.
- * @order: allocation order.
- *
- * returns true if the memcg where the current task belongs can hold this
- * allocation.
- *
- * We return true automatically if this allocation is not to be accounted to
- * any memcg.
- */
-static inline bool
-memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+static inline bool __memcg_kmem_bypass(gfp_t gfp)
{
if (!memcg_kmem_enabled())
return true;
-
if (gfp & __GFP_NOACCOUNT)
return true;
- /*
- * __GFP_NOFAIL allocations will move on even if charging is not
- * possible. Therefore we don't even try, and have this allocation
- * unaccounted. We could in theory charge it forcibly, but we hope
- * those allocations are rare, and won't be worth the trouble.
- */
- if (gfp & __GFP_NOFAIL)
- return true;
if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
return true;
-
- /* If the test is dying, just let it go. */
- if (unlikely(fatal_signal_pending(current)))
- return true;
-
- return __memcg_kmem_newpage_charge(gfp, memcg, order);
+ return false;
}
/**
- * memcg_kmem_uncharge_pages: uncharge pages from memcg
- * @page: pointer to struct page being freed
- * @order: allocation order.
+ * memcg_kmem_charge: charge a kmem page
+ * @page: page to charge
+ * @gfp: reclaim mode
+ * @order: allocation order
+ *
+ * Returns 0 on success, an error code on failure.
*/
-static inline void
-memcg_kmem_uncharge_pages(struct page *page, int order)
+static __always_inline int memcg_kmem_charge(struct page *page,
+ gfp_t gfp, int order)
{
- if (memcg_kmem_enabled())
- __memcg_kmem_uncharge_pages(page, order);
+ if (__memcg_kmem_bypass(gfp))
+ return 0;
+ return __memcg_kmem_charge(page, gfp, order);
}
/**
- * memcg_kmem_commit_charge: embeds correct memcg in a page
- * @page: pointer to struct page recently allocated
- * @memcg: the memcg structure we charged against
- * @order: allocation order.
- *
- * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
- * failure of the allocation. if @page is NULL, this function will revert the
- * charges. Otherwise, it will commit @page to @memcg.
+ * memcg_kmem_uncharge: uncharge a kmem page
+ * @page: page to uncharge
+ * @order: allocation order
*/
-static inline void
-memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+static __always_inline void memcg_kmem_uncharge(struct page *page, int order)
{
- if (memcg_kmem_enabled() && memcg)
- __memcg_kmem_commit_charge(page, memcg, order);
+ if (memcg_kmem_enabled())
+ __memcg_kmem_uncharge(page, order);
}
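
A sketch of the charge/uncharge pairing implied by the kernel-doc above
(error handling trimmed; this is not the actual allocator code):

	page = alloc_pages(gfp, order);
	if (page && unlikely(memcg_kmem_charge(page, gfp, order))) {
		__free_pages(page, order);
		page = NULL;
	}

	/* ... and on the free path: */
	memcg_kmem_uncharge(page, order);
	__free_pages(page, order);
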
/**
static __always_inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
- if (!memcg_kmem_enabled())
+ if (__memcg_kmem_bypass(gfp))
return cachep;
- if (gfp & __GFP_NOACCOUNT)
- return cachep;
- if (gfp & __GFP_NOFAIL)
- return cachep;
- if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
- return cachep;
- if (unlikely(fatal_signal_pending(current)))
- return cachep;
-
return __memcg_kmem_get_cache(cachep);
}
if (memcg_kmem_enabled())
__memcg_kmem_put_cache(cachep);
}
-
-static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
-{
- if (!memcg_kmem_enabled())
- return NULL;
- return __mem_cgroup_from_kmem(ptr);
-}
#else
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
return false;
}
-static inline bool
-memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
- return true;
+ return 0;
}
-static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
-{
-}
-
-static inline void
-memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+static inline void memcg_kmem_uncharge(struct page *page, int order)
{
}
static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
}
-
-static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
-{
- return NULL;
-}
#endif /* CONFIG_MEMCG_KMEM */
#endif /* _LINUX_MEMCONTROL_H */
-
#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
+#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
/* This mask defines which mm->def_flags a process can inherit its parent */
#define VM_INIT_DEF_MASK VM_NOHUGEPAGE
+/* This mask is used to clear all the VMA flags used by mlock */
+#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT))
+
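
A sketch of the intended use (not part of the patch): paths that drop mlock
state clear both flags in one mask operation.

	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;	/* drops VM_LOCKED and VM_LOCKONFAULT */
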
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
extern void kvfree(const void *addr);
-static inline void compound_lock(struct page *page)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(PageSlab(page), page);
- bit_spin_lock(PG_compound_lock, &page->flags);
-#endif
-}
-
-static inline void compound_unlock(struct page *page)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(PageSlab(page), page);
- bit_spin_unlock(PG_compound_lock, &page->flags);
-#endif
-}
-
-static inline unsigned long compound_lock_irqsave(struct page *page)
+static inline atomic_t *compound_mapcount_ptr(struct page *page)
{
- unsigned long uninitialized_var(flags);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- local_irq_save(flags);
- compound_lock(page);
-#endif
- return flags;
+ return &page[1].compound_mapcount;
}
-static inline void compound_unlock_irqrestore(struct page *page,
- unsigned long flags)
+static inline int compound_mapcount(struct page *page)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- compound_unlock(page);
- local_irq_restore(flags);
-#endif
-}
-
-static inline struct page *compound_head_by_tail(struct page *tail)
-{
- struct page *head = tail->first_page;
-
- /*
- * page->first_page may be a dangling pointer to an old
- * compound page, so recheck that it is still a tail
- * page before returning.
- */
- smp_rmb();
- if (likely(PageTail(tail)))
- return head;
- return tail;
-}
-
-/*
- * Since either compound page could be dismantled asynchronously in THP
- * or we access asynchronously arbitrary positioned struct page, there
- * would be tail flag race. To handle this race, we should call
- * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
- */
-static inline struct page *compound_head(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return compound_head_by_tail(page);
- return page;
-}
-
-/*
- * If we access compound page synchronously such as access to
- * allocated page, there is no need to handle tail flag race, so we can
- * check tail flag directly without any synchronization primitive.
- */
-static inline struct page *compound_head_fast(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return page->first_page;
- return page;
+ if (!PageCompound(page))
+ return 0;
+ page = compound_head(page);
+ return atomic_read(compound_mapcount_ptr(page)) + 1;
}
/*
static inline int page_mapcount(struct page *page)
{
+ int ret;
VM_BUG_ON_PAGE(PageSlab(page), page);
- return atomic_read(&page->_mapcount) + 1;
+
+ ret = atomic_read(&page->_mapcount) + 1;
+ if (PageCompound(page)) {
+ page = compound_head(page);
+ ret += atomic_read(compound_mapcount_ptr(page)) + 1;
+ if (PageDoubleMap(page))
+ ret--;
+ }
+ return ret;
}
static inline int page_count(struct page *page)
return atomic_read(&compound_head(page)->_count);
}
-static inline bool __compound_tail_refcounted(struct page *page)
-{
- return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
-}
-
-/*
- * This takes a head page as parameter and tells if the
- * tail page reference counting can be skipped.
- *
- * For this to be safe, PageSlab and PageHeadHuge must remain true on
- * any given page where they return true here, until all tail pins
- * have been released.
- */
-static inline bool compound_tail_refcounted(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- return __compound_tail_refcounted(page);
-}
-
-static inline void get_huge_page_tail(struct page *page)
-{
- /*
- * __split_huge_page_refcount() cannot run from under us.
- */
- VM_BUG_ON_PAGE(!PageTail(page), page);
- VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
- VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
- if (compound_tail_refcounted(page->first_page))
- atomic_inc(&page->_mapcount);
-}
-
-extern bool __get_page_tail(struct page *page);
-
static inline void get_page(struct page *page)
{
- if (unlikely(PageTail(page)))
- if (likely(__get_page_tail(page)))
- return;
+ page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
	 * requires an already elevated page->_count.
{
struct page *page = virt_to_page(x);
- /*
- * We don't need to worry about synchronization of tail flag
- * when we call virt_to_head_page() since it is only called for
- * already allocated page and this page won't be freed until
- * this virt_to_head_page() is finished. So use _fast variant.
- */
- return compound_head_fast(page);
+ return compound_head(page);
}
/*
atomic_set(&page->_count, 1);
}
-void put_page(struct page *page);
+void __put_page(struct page *page);
+
+static inline void put_page(struct page *page)
+{
+ page = compound_head(page);
+ if (put_page_testzero(page))
+ __put_page(page);
+}
+
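
With this scheme, pinning any subpage of a compound page pins its head. A
hedged sketch, where head is assumed to be the head page of an order >= 1
compound allocation:

	struct page *tail = head + 1;	/* any tail page of the same compound */

	get_page(tail);			/* elevates head->_count */
	/* ... use the memory ... */
	put_page(tail);			/* drops head->_count, may free it */
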
void put_pages_list(struct list_head *pages);
void split_page(struct page *page, unsigned int order);
/*
* Compound pages have a destructor function. Provide a
* prototype for that function and accessor functions.
- * These are _only_ valid on the head of a PG_compound page.
+ * These are _only_ valid on the head of a compound page.
*/
+typedef void compound_page_dtor(struct page *);
+
+/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */
+enum compound_dtor_id {
+ NULL_COMPOUND_DTOR,
+ COMPOUND_PAGE_DTOR,
+#ifdef CONFIG_HUGETLB_PAGE
+ HUGETLB_PAGE_DTOR,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ TRANSHUGE_PAGE_DTOR,
+#endif
+ NR_COMPOUND_DTORS,
+};
+extern compound_page_dtor * const compound_page_dtors[];
static inline void set_compound_page_dtor(struct page *page,
- compound_page_dtor *dtor)
+ enum compound_dtor_id compound_dtor)
{
- page[1].compound_dtor = dtor;
+ VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page);
+ page[1].compound_dtor = compound_dtor;
}
static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
{
- return page[1].compound_dtor;
+ VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
+ return compound_page_dtors[page[1].compound_dtor];
}
-static inline int compound_order(struct page *page)
+static inline unsigned int compound_order(struct page *page)
{
if (!PageHead(page))
return 0;
return page[1].compound_order;
}
-static inline void set_compound_order(struct page *page, unsigned long order)
+static inline void set_compound_order(struct page *page, unsigned int order)
{
page[1].compound_order = order;
}
+void free_compound_page(struct page *page);
+
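
A sketch of how a compound page's owner selects, and the free path later
invokes, a destructor (illustrative only):

	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
	/* ... on the final put, the free path effectively does: */
	get_compound_page_dtor(page)(page);	/* calls free_compound_page() */
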
#ifdef CONFIG_MMU
/*
* Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
/*
* Return true if this page is mapped into pagetables.
+ * For compound pages it returns true if any subpage is mapped.
*/
-static inline int page_mapped(struct page *page)
-{
- return atomic_read(&(page)->_mapcount) >= 0;
+static inline bool page_mapped(struct page *page)
+{
+ int i;
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) >= 0;
+ page = compound_head(page);
+ if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+ return true;
+ for (i = 0; i < hpage_nr_pages(page); i++) {
+ if (atomic_read(&page[i]._mapcount) >= 0)
+ return true;
+ }
+ return false;
}
/*
* with 0. Make sure nobody took it in use in between.
*
* It can happen if arch try to use slab for page table allocation:
- * slab code uses page->slab_cache and page->first_page (for tail
- * pages), which share storage with page->ptl.
+ * slab code uses page->slab_cache, which shares storage with page->ptl.
*/
VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
if (!ptlock_alloc(page))
static inline bool pgtable_page_ctor(struct page *page)
{
+ if (!ptlock_init(page))
+ return false;
inc_zone_page_state(page, NR_PAGETABLE);
- return ptlock_init(page);
+ return true;
}
static inline void pgtable_page_dtor(struct page *page)
extern void si_meminfo_node(struct sysinfo *val, int nid);
extern __printf(3, 4)
-void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
+void warn_alloc_failed(gfp_t gfp_mask, unsigned int order,
+ const char *fmt, ...);
extern void setup_per_cpu_pageset(void);
pgoff_t offset,
unsigned long size);
-unsigned long max_sane_readahead(unsigned long nr);
-
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
+#define FOLL_MLOCK 0x1000 /* lock present pages */
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data);
IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
#define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8)
-typedef void compound_page_dtor(struct page *);
-
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
* see PAGE_MAPPING_ANON below.
*/
void *s_mem; /* slab first object */
+ atomic_t compound_mapcount; /* first tail page */
+ /* page_deferred_list().next -- second tail page */
};
/* Second double word */
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* sl[aou]b first free object */
+ /* page_deferred_list().prev -- second tail page */
};
union {
union {
/*
- * Count of ptes mapped in
- * mms, to show when page is
- * mapped & limit reverse map
- * searches.
- *
- * Used also for tail pages
- * refcounting instead of
- * _count. Tail pages cannot
- * be mapped and keeping the
- * tail page _count zero at
- * all times guarantees
- * get_page_unless_zero() will
- * never succeed on tail
- * pages.
+ * Count of ptes mapped in mms, to show
+ * when page is mapped & limit reverse
+ * map searches.
*/
atomic_t _mapcount;
};
};
- /* Third double word block */
+ /*
+ * Third double word block
+ *
+	 * WARNING: bit 0 of the first word encodes PageTail(). That means
+	 * the other users of the storage space MUST NOT use the bit to
+	 * avoid collision and false-positive PageTail().
+ */
union {
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
#endif
};
- struct slab *slab_page; /* slab fields */
struct rcu_head rcu_head; /* Used by SLAB
* when destroying via RCU
*/
- /* First tail page of compound page */
+ /* Tail pages of compound page */
struct {
- compound_page_dtor *compound_dtor;
- unsigned long compound_order;
+ unsigned long compound_head; /* If bit zero is set */
+
+ /* First tail page only */
+#ifdef CONFIG_64BIT
+ /*
+			 * On 64-bit systems we have enough space in struct page
+			 * to encode compound_dtor and compound_order with
+			 * unsigned int. This can help the compiler generate
+			 * better or smaller code on some architectures.
+ */
+ unsigned int compound_dtor;
+ unsigned int compound_order;
+#else
+ unsigned short int compound_dtor;
+ unsigned short int compound_order;
+#endif
};
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
- pgtable_t pmd_huge_pte; /* protected by page->ptl */
+ struct {
+ unsigned long __pad; /* do not overlay pmd_huge_pte
+ * with compound_head to avoid
+ * possible bit 0 collision.
+ */
+ pgtable_t pmd_huge_pte; /* protected by page->ptl */
+ };
#endif
};
#endif
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
- struct page *first_page; /* Compound tail pages */
};
#ifdef CONFIG_MEMCG
/* address of the bounds directory */
void __user *bd_addr;
#endif
+#ifdef CONFIG_HUGETLB_PAGE
+ atomic_long_t hugetlb_usage;
+#endif
};
static inline void mm_init_cpumask(struct mm_struct *mm)
#define VIRTUAL_BUG_ON(cond) do { } while (0)
#endif
+#ifdef CONFIG_DEBUG_VM_PGFLAGS
+#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page)
+#else
+#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond)
+#endif
+
#endif
enum {
MIGRATE_UNMOVABLE,
- MIGRATE_RECLAIMABLE,
MIGRATE_MOVABLE,
+ MIGRATE_RECLAIMABLE,
MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
- MIGRATE_RESERVE = MIGRATE_PCPTYPES,
+ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
/*
* MIGRATE_CMA migration type is designed to mimic the way
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long watermark[NR_WMARK];
+ unsigned long nr_reserved_highatomic;
+
/*
- * We don't know if the memory that we're going to allocate will be freeable
- * or/and it will be released eventually, so to avoid totally wasting several
- * GB of ram we must reserve some of the lower zone memory (otherwise we risk
- * to run OOM on the lower zones despite there's tons of freeable ram
- * on the higher zones). This array is recalculated at runtime if the
- * sysctl_lowmem_reserve_ratio sysctl changes.
+	 * We don't know if the memory that we're going to allocate will be
+	 * freeable and/or released eventually, so to avoid totally wasting
+	 * several GB of ram we must reserve some of the lower zone memory
+	 * (otherwise we risk running OOM on the lower zones despite there
+	 * being tons of freeable ram on the higher zones). This array is
+	 * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
+	 * changes.
*/
long lowmem_reserve[MAX_NR_ZONES];
const char *name;
- /*
- * Number of MIGRATE_RESERVE page block. To maintain for just
- * optimization. Protected by zone->lock.
- */
- int nr_migrate_reserve_block;
-
#ifdef CONFIG_MEMORY_ISOLATION
/*
* Number of isolated pageblock. It is used to solve incorrect
* [1] : No fallback (__GFP_THISNODE)
*/
#define MAX_ZONELISTS 2
-
-
-/*
- * We cache key information from each zonelist for smaller cache
- * footprint when scanning for free pages in get_page_from_freelist().
- *
- * 1) The BITMAP fullzones tracks which zones in a zonelist have come
- * up short of free memory since the last time (last_fullzone_zap)
- * we zero'd fullzones.
- * 2) The array z_to_n[] maps each zone in the zonelist to its node
- * id, so that we can efficiently evaluate whether that node is
- * set in the current tasks mems_allowed.
- *
- * Both fullzones and z_to_n[] are one-to-one with the zonelist,
- * indexed by a zones offset in the zonelist zones[] array.
- *
- * The get_page_from_freelist() routine does two scans. During the
- * first scan, we skip zones whose corresponding bit in 'fullzones'
- * is set or whose corresponding node in current->mems_allowed (which
- * comes from cpusets) is not set. During the second scan, we bypass
- * this zonelist_cache, to ensure we look methodically at each zone.
- *
- * Once per second, we zero out (zap) fullzones, forcing us to
- * reconsider nodes that might have regained more free memory.
- * The field last_full_zap is the time we last zapped fullzones.
- *
- * This mechanism reduces the amount of time we waste repeatedly
- * reexaming zones for free memory when they just came up low on
- * memory momentarilly ago.
- *
- * The zonelist_cache struct members logically belong in struct
- * zonelist. However, the mempolicy zonelists constructed for
- * MPOL_BIND are intentionally variable length (and usually much
- * shorter). A general purpose mechanism for handling structs with
- * multiple variable length members is more mechanism than we want
- * here. We resort to some special case hackery instead.
- *
- * The MPOL_BIND zonelists don't need this zonelist_cache (in good
- * part because they are shorter), so we put the fixed length stuff
- * at the front of the zonelist struct, ending in a variable length
- * zones[], as is needed by MPOL_BIND.
- *
- * Then we put the optional zonelist cache on the end of the zonelist
- * struct. This optional stuff is found by a 'zlcache_ptr' pointer in
- * the fixed length portion at the front of the struct. This pointer
- * both enables us to find the zonelist cache, and in the case of
- * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL)
- * to know that the zonelist cache is not there.
- *
- * The end result is that struct zonelists come in two flavors:
- * 1) The full, fixed length version, shown below, and
- * 2) The custom zonelists for MPOL_BIND.
- * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
- *
- * Even though there may be multiple CPU cores on a node modifying
- * fullzones or last_full_zap in the same zonelist_cache at the same
- * time, we don't lock it. This is just hint data - if it is wrong now
- * and then, the allocator will still function, perhaps a bit slower.
- */
-
-
-struct zonelist_cache {
- unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */
- DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */
- unsigned long last_full_zap; /* when last zap'd (jiffies) */
-};
#else
#define MAX_ZONELISTS 1
-struct zonelist_cache;
#endif
/*
* allocation, the other zones are fallback zones, in decreasing
* priority.
*
- * If zlcache_ptr is not NULL, then it is just the address of zlcache,
- * as explained above. If zlcache_ptr is NULL, there is no zlcache.
- * *
* To speed the reading of the zonelist, the zonerefs contain the zone index
* of the entry being read. Helper functions to access information given
* a struct zoneref are
* zonelist_node_idx() - Return the index of the node for an entry
*/
struct zonelist {
- struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
-#ifdef CONFIG_NUMA
- struct zonelist_cache zlcache; // optional ...
-#endif
};
#ifndef CONFIG_DISCONTIGMEM
bool zone_watermark_ok(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx, int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, int alloc_flags);
+ unsigned long mark, int classzone_idx);
enum memmap_context {
MEMMAP_EARLY,
MEMMAP_HOTPLUG,
};
extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long size,
- enum memmap_context context);
+ unsigned long size);
extern void lruvec_init(struct lruvec *lruvec);
extern const struct kernel_param_ops param_ops_charp;
extern int param_set_charp(const char *val, const struct kernel_param *kp);
extern int param_get_charp(char *buffer, const struct kernel_param *kp);
+extern void param_free_charp(void *arg);
#define param_check_charp(name, p) __param_check(name, p, char *)
/* We used to allow int as well as bool. We're taking that away! */
extern int watchdog_thresh;
extern unsigned long *watchdog_cpumask_bits;
extern int sysctl_softlockup_all_cpu_backtrace;
+extern int sysctl_hardlockup_all_cpu_backtrace;
struct ctl_table;
extern int proc_watchdog(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
PG_private, /* If pagecache, has fs-private data */
PG_private_2, /* If pagecache, has fs aux data */
PG_writeback, /* Page is under writeback */
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
PG_head, /* A head page */
- PG_tail, /* A tail page */
-#else
- PG_compound, /* A compound page */
-#endif
PG_swapcache, /* Swap page: swp_entry_t in private */
PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- PG_compound_lock,
-#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
PG_young,
PG_idle,
/* SLOB */
PG_slob_free = PG_private,
+
+ /* Compound pages. Stored in first tail page's flags */
+ PG_double_map = PG_private_2,
};
#ifndef __GENERATING_BOUNDS_H
+struct page; /* forward declaration */
+
+static inline struct page *compound_head(struct page *page)
+{
+ unsigned long head = READ_ONCE(page->compound_head);
+
+ if (unlikely(head & 1))
+ return (struct page *) (head - 1);
+ return page;
+}
+
+static inline int PageTail(struct page *page)
+{
+ return READ_ONCE(page->compound_head) & 1;
+}
+
+static inline int PageCompound(struct page *page)
+{
+ return test_bit(PG_head, &page->flags) || PageTail(page);
+}
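
The encoding these helpers rely on, as a hedged sketch: struct page is at
least word-aligned, so bit 0 of a real head pointer is always clear and can
serve as the tail marker. (The real setter lives in mm/ helpers; writing the
field directly here is for illustration only.)

	tail->compound_head = (unsigned long)head | 1;	/* mark as tail */

	BUG_ON(!PageTail(tail));
	BUG_ON(compound_head(tail) != head);
	BUG_ON(PageTail(head));		/* the head keeps bit 0 clear */
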
+
+/*
+ * Page flags policies wrt compound pages
+ *
+ * PF_ANY:
+ * the page flag is relevant for small, head and tail pages.
+ *
+ * PF_HEAD:
+ * for compound pages all operations related to the page flag are applied
+ * to the head page.
+ *
+ * PF_NO_TAIL:
+ * modifications of the page flag must be done on small or head pages;
+ * checks can be done on tail pages too.
+ *
+ * PF_NO_COMPOUND:
+ * the page flag is not relevant for compound pages.
+ */
+#define PF_ANY(page, enforce) page
+#define PF_HEAD(page, enforce) compound_head(page)
+#define PF_NO_TAIL(page, enforce) ({ \
+ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \
+ compound_head(page);})
+#define PF_NO_COMPOUND(page, enforce) ({ \
+ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \
+ page;})
+
/*
* Macros to create function definitions for page flags
*/
-#define TESTPAGEFLAG(uname, lname) \
-static inline int Page##uname(const struct page *page) \
- { return test_bit(PG_##lname, &page->flags); }
+#define TESTPAGEFLAG(uname, lname, policy) \
+static inline int Page##uname(struct page *page) \
+ { return test_bit(PG_##lname, &policy(page, 0)->flags); }
-#define SETPAGEFLAG(uname, lname) \
+#define SETPAGEFLAG(uname, lname, policy) \
static inline void SetPage##uname(struct page *page) \
- { set_bit(PG_##lname, &page->flags); }
+ { set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define CLEARPAGEFLAG(uname, lname) \
+#define CLEARPAGEFLAG(uname, lname, policy) \
static inline void ClearPage##uname(struct page *page) \
- { clear_bit(PG_##lname, &page->flags); }
+ { clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __SETPAGEFLAG(uname, lname) \
+#define __SETPAGEFLAG(uname, lname, policy) \
static inline void __SetPage##uname(struct page *page) \
- { __set_bit(PG_##lname, &page->flags); }
+ { __set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __CLEARPAGEFLAG(uname, lname) \
+#define __CLEARPAGEFLAG(uname, lname, policy) \
static inline void __ClearPage##uname(struct page *page) \
- { __clear_bit(PG_##lname, &page->flags); }
+ { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTSETFLAG(uname, lname) \
+#define TESTSETFLAG(uname, lname, policy) \
static inline int TestSetPage##uname(struct page *page) \
- { return test_and_set_bit(PG_##lname, &page->flags); }
+ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTCLEARFLAG(uname, lname) \
+#define TESTCLEARFLAG(uname, lname, policy) \
static inline int TestClearPage##uname(struct page *page) \
- { return test_and_clear_bit(PG_##lname, &page->flags); }
+ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __TESTCLEARFLAG(uname, lname) \
+#define __TESTCLEARFLAG(uname, lname, policy) \
static inline int __TestClearPage##uname(struct page *page) \
- { return __test_and_clear_bit(PG_##lname, &page->flags); }
+ { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
+#define PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ SETPAGEFLAG(uname, lname, policy) \
+ CLEARPAGEFLAG(uname, lname, policy)
-#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname)
+#define __PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ __SETPAGEFLAG(uname, lname, policy) \
+ __CLEARPAGEFLAG(uname, lname, policy)
-#define TESTSCFLAG(uname, lname) \
- TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
+#define TESTSCFLAG(uname, lname, policy) \
+ TESTSETFLAG(uname, lname, policy) \
+ TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname) \
static inline int Page##uname(const struct page *page) { return 0; }
#define TESTSCFLAG_FALSE(uname) \
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
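
For concreteness, a sketch of what one generated accessor pair looks like
after expansion: PAGEFLAG(Dirty, dirty, PF_HEAD) below yields, roughly,

	static inline int PageDirty(struct page *page)
	{
		return test_bit(PG_dirty, &compound_head(page)->flags);
	}
	static inline void SetPageDirty(struct page *page)
	{
		set_bit(PG_dirty, &compound_head(page)->flags);
	}
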
-struct page; /* forward declaration */
-
-TESTPAGEFLAG(Locked, locked)
-PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
-PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
- __SETPAGEFLAG(Referenced, referenced)
-PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
-PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
-PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
- TESTCLEARFLAG(Active, active)
-__PAGEFLAG(Slab, slab)
-PAGEFLAG(Checked, checked) /* Used by some filesystems */
-PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
-PAGEFLAG(SavePinned, savepinned); /* Xen */
-PAGEFLAG(Foreign, foreign); /* Xen */
-PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
-PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
- __SETPAGEFLAG(SwapBacked, swapbacked)
-
-__PAGEFLAG(SlobFree, slob_free)
+__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Referenced, referenced, PF_HEAD)
+ TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
+ __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
+ __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
+PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
+ TESTCLEARFLAG(Active, active, PF_HEAD)
+__PAGEFLAG(Slab, slab, PF_NO_TAIL)
+__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
+PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
+
+/* Xen */
+PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
+ TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
+PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
+PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
+
+PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+ __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
/*
* Private page markings that may be used by the filesystem that owns the page
* for its own purposes.
* - PG_private and PG_private_2 cause releasepage() and co to be invoked
*/
-PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
- __CLEARPAGEFLAG(Private, private)
-PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
-PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
+ __CLEARPAGEFLAG(Private, private, PF_ANY)
+PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
+ TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
/*
* Only test-and-set exist for PG_writeback. The unconditional operators are
* risky: they bypass page accounting.
*/
-TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-PAGEFLAG(MappedToDisk, mappedtodisk)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
+ TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
-PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
#ifdef CONFIG_HIGHMEM
/*
* Must use a macro here due to header dependency issues. page_zone() is not
* available at this point.
*/
-#define PageHighMem(__p) is_highmem(page_zone(__p))
+#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
#else
PAGEFLAG_FALSE(HighMem)
#endif
#ifdef CONFIG_SWAP
-PAGEFLAG(SwapCache, swapcache)
+PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(SwapCache)
#endif
-PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
- TESTCLEARFLAG(Unevictable, unevictable)
+PAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
#ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
- TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
+PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached)
+PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached)
#endif
#ifdef CONFIG_MEMORY_FAILURE
-PAGEFLAG(HWPoison, hwpoison)
-TESTSCFLAG(HWPoison, hwpoison)
+PAGEFLAG(HWPoison, hwpoison, PF_ANY)
+TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#else
PAGEFLAG_FALSE(HWPoison)
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
-TESTPAGEFLAG(Young, young)
-SETPAGEFLAG(Young, young)
-TESTCLEARFLAG(Young, young)
-PAGEFLAG(Idle, idle)
+TESTPAGEFLAG(Young, young, PF_ANY)
+SETPAGEFLAG(Young, young, PF_ANY)
+TESTCLEARFLAG(Young, young, PF_ANY)
+PAGEFLAG(Idle, idle, PF_ANY)
#endif
/*
static inline int PageAnon(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}
*/
static inline int PageKsm(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
static inline int PageUptodate(struct page *page)
{
- int ret = test_bit(PG_uptodate, &(page)->flags);
-
+ int ret;
+ page = compound_head(page);
+ ret = test_bit(PG_uptodate, &(page)->flags);
/*
* Must ensure that the data we read out of the page is loaded
* _after_ we've loaded page->flags to check for PageUptodate.
static inline void __SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
smp_wmb();
- __set_bit(PG_uptodate, &(page)->flags);
+ __set_bit(PG_uptodate, &page->flags);
}
static inline void SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
/*
* Memory barrier must be issued before setting the PG_uptodate bit,
* so that all previous stores issued in order to bring the page
* uptodate are actually visible before PageUptodate becomes true.
*/
smp_wmb();
- set_bit(PG_uptodate, &(page)->flags);
+ set_bit(PG_uptodate, &page->flags);
}
-CLEARPAGEFLAG(Uptodate, uptodate)
+CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
int test_clear_page_writeback(struct page *page);
int __test_set_page_writeback(struct page *page, bool keep_write);
test_set_page_writeback_keepwrite(page);
}
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
-/*
- * System with lots of page flags available. This allows separate
- * flags for PageHead() and PageTail() checks of compound pages so that bit
- * tests can be used in performance sensitive paths. PageCompound is
- * generally not used in hot code paths except arch/powerpc/mm/init_64.c
- * and arch/powerpc/kvm/book3s_64_vio_hv.c which use it to detect huge pages
- * and avoid handling those in real mode.
- */
-__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
-__PAGEFLAG(Tail, tail)
-
-static inline int PageCompound(struct page *page)
-{
- return page->flags & ((1L << PG_head) | (1L << PG_tail));
-
-}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline void ClearPageCompound(struct page *page)
-{
- BUG_ON(!PageHead(page));
- ClearPageHead(page);
-}
-#endif
-
-#define PG_head_mask ((1L << PG_head))
+__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
-#else
-/*
- * Reduce page flag use as much as possible by overlapping
- * compound page flags with the flags used for page cache pages. Possible
- * because PageCompound is always set for compound pages and not for
- * pages on the LRU and/or pagecache.
- */
-TESTPAGEFLAG(Compound, compound)
-__SETPAGEFLAG(Head, compound) __CLEARPAGEFLAG(Head, compound)
-
-/*
- * PG_reclaim is used in combination with PG_compound to mark the
- * head and tail of a compound page. This saves one page flag
- * but makes it impossible to use compound pages for the page cache.
- * The PG_reclaim bit would have to be used for reclaim or readahead
- * if compound pages enter the page cache.
- *
- * PG_compound & PG_reclaim => Tail page
- * PG_compound & ~PG_reclaim => Head page
- */
-#define PG_head_mask ((1L << PG_compound))
-#define PG_head_tail_mask ((1L << PG_compound) | (1L << PG_reclaim))
-
-static inline int PageHead(struct page *page)
-{
- return ((page->flags & PG_head_tail_mask) == PG_head_mask);
-}
-
-static inline int PageTail(struct page *page)
+static inline void set_compound_head(struct page *page, struct page *head)
{
- return ((page->flags & PG_head_tail_mask) == PG_head_tail_mask);
+ WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
}
-static inline void __SetPageTail(struct page *page)
+static inline void clear_compound_head(struct page *page)
{
- page->flags |= PG_head_tail_mask;
-}
-
-static inline void __ClearPageTail(struct page *page)
-{
- page->flags &= ~PG_head_tail_mask;
+ WRITE_ONCE(page->compound_head, 0);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void ClearPageCompound(struct page *page)
{
- BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound));
- clear_bit(PG_compound, &page->flags);
+ BUG_ON(!PageHead(page));
+ ClearPageHead(page);
}
#endif
-#endif /* !PAGEFLAGS_EXTENDED */
+#define PG_head_mask ((1L << PG_head))
#ifdef CONFIG_HUGETLB_PAGE
int PageHuge(struct page *page);
return PageTail(page);
}
-#else
-
-static inline int PageTransHuge(struct page *page)
+/*
+ * PageDoubleMap indicates that the compound page is mapped with PTEs as well
+ * as PMDs.
+ *
+ * This is required for optimization of rmap operations for THP: we can
+ * postpone per-small-page mapcount accounting (and its overhead from atomic
+ * operations) until the first PMD split.
+ *
+ * For the page, PageDoubleMap means ->_mapcount in all sub-pages is offset up
+ * by one. This reference goes away with the last compound_mapcount.
+ *
+ * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
+ */
+static inline int PageDoubleMap(struct page *page)
{
- return 0;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ return test_bit(PG_double_map, &page[1].flags);
}
-static inline int PageTransCompound(struct page *page)
+static inline int TestSetPageDoubleMap(struct page *page)
{
- return 0;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ return test_and_set_bit(PG_double_map, &page[1].flags);
}
-static inline int PageTransTail(struct page *page)
+static inline int TestClearPageDoubleMap(struct page *page)
{
- return 0;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ return test_and_clear_bit(PG_double_map, &page[1].flags);
}
+
+#else
+TESTPAGEFLAG_FALSE(TransHuge)
+TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransTail)
+TESTPAGEFLAG_FALSE(DoubleMap)
+ TESTSETFLAG_FALSE(DoubleMap)
+ TESTCLEARFLAG_FALSE(DoubleMap)
#endif
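
A rough model of the accounting described above, assuming only the "offset by one" rule from the PageDoubleMap comment (an illustrative sketch, not the kernel's exact total_mapcount() implementation; nr_subpages stands in for HPAGE_PMD_NR):

/*
 * Illustrative only: _mapcount starts at -1, so each mapped sub-page
 * contributes atomic_read(&_mapcount) + 1, and a double-mapped compound
 * page has every sub-page biased up by one more.
 */
static int total_mapcount_sketch(struct page *head, int nr_subpages)
{
	int i, ret = compound_mapcount(head);

	for (i = 0; i < nr_subpages; i++)
		ret += atomic_read(&head[i]._mapcount) + 1;
	if (PageDoubleMap(head))
		ret -= nr_subpages;	/* undo the per-sub-page bias */
	return ret;
}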
/*
#define __PG_MLOCKED 0
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define __PG_COMPOUND_LOCK (1 << PG_compound_lock)
-#else
-#define __PG_COMPOUND_LOCK 0
-#endif
-
/*
* Flags checked when a page is freed. Pages being freed should not have
 * these flags set. If they are, there is a problem.
1 << PG_private | 1 << PG_private_2 | \
1 << PG_writeback | 1 << PG_reserved | \
1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
- 1 << PG_unevictable | __PG_MLOCKED | \
- __PG_COMPOUND_LOCK)
+ 1 << PG_unevictable | __PG_MLOCKED)
/*
* Flags checked when a page is prepped for return by the page allocator.
return !!(page->flags & PAGE_FLAGS_PRIVATE);
}
+#undef PF_ANY
+#undef PF_HEAD
+#undef PF_NO_TAIL
+#undef PF_NO_COMPOUND
#endif /* !__GENERATING_BOUNDS_H */
#endif /* PAGE_FLAGS_H */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Huge page sizes are variable */
-extern int pageblock_order;
+extern unsigned int pageblock_order;
#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
}
+/* Restricts the given gfp_mask to what the mapping allows. */
+static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
+ gfp_t gfp_mask)
+{
+ return mapping_gfp_mask(mapping) & gfp_mask;
+}
+
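A typical call site, sketched after the page-cache allocation pattern (illustrative):

	/* Allocate a page-cache page, dropping any GFP bits the mapping forbids. */
	page = __page_cache_alloc(mapping_gfp_constraint(mapping, GFP_KERNEL));
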
/*
* This is non-atomic. Only to be used before the mapping is activated.
* Probably needs a barrier...
*/
static inline pgoff_t page_to_pgoff(struct page *page)
{
+ pgoff_t pgoff;
+
if (unlikely(PageHeadHuge(page)))
return page->index << compound_order(page);
- else
+
+ if (likely(!PageTransTail(page)))
return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+ /*
+ * We don't initialize ->index for tail pages: calculate based on
+ * head page
+ */
+ pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff += page - compound_head(page);
+ return pgoff;
}
/*
unsigned int flags);
extern void unlock_page(struct page *page);
-static inline void __set_page_locked(struct page *page)
-{
- __set_bit(PG_locked, &page->flags);
-}
-
-static inline void __clear_page_locked(struct page *page)
-{
- __clear_bit(PG_locked, &page->flags);
-}
-
static inline int trylock_page(struct page *page)
{
+ page = compound_head(page);
return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}
static inline int wait_on_page_locked_killable(struct page *page)
{
- if (PageLocked(page))
- return wait_on_page_bit_killable(page, PG_locked);
- return 0;
+ if (!PageLocked(page))
+ return 0;
+ return wait_on_page_bit_killable(compound_head(page), PG_locked);
}
extern wait_queue_head_t *page_waitqueue(struct page *page);
static inline void wait_on_page_locked(struct page *page)
{
if (PageLocked(page))
- wait_on_page_bit(page, PG_locked);
+ wait_on_page_bit(compound_head(page), PG_locked);
}
/*
/*
* Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __set_page_locked() against it.
+ * the page is new, so we can just run __SetPageLocked() against it.
*/
static inline int add_to_page_cache(struct page *page,
struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
int error;
- __set_page_locked(page);
+ __SetPageLocked(page);
error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
if (unlikely(error))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return error;
}
* Magic number "tsta" to indicate a static timer initializer
* for the object debugging code.
*/
-#define TIMER_ENTRY_STATIC ((void *) 0x74737461)
+#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
/********** mm/debug-pagealloc.c **********/
#define PAGE_POISON 0xaa
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA)
+
/********** mm/slab.c **********/
/*
* Magic nums for obj red zoning.
#define MUTEX_DEBUG_INIT 0x11
#define MUTEX_DEBUG_FREE 0x22
+/********** lib/dma_debug.c **********/
+#define DMA_ALLOC_POISON 0xee
+
/********** lib/flex_array.c **********/
#define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */
})
/**
- * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of
- * given type safe against removal of rb_node entry
+ * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
+ * given type allowing the backing memory of @pos to be invalidated
*
* @pos: the 'type *' to use as a loop cursor.
* @n: another 'type *' to use as temporary storage
* @root: 'rb_root *' of the rbtree.
* @field: the name of the rb_node field within 'type'.
+ *
+ * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
+ * list_for_each_entry_safe() and allows the iteration to continue independent
+ * of changes to @pos by the body of the loop.
+ *
+ * Note, however, that it cannot handle other modifications that re-order the
+ * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
+ * rb_erase() may rebalance the tree, causing us to miss some nodes.
*/
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
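
The canonical use case is tearing down a whole tree without per-node rb_erase() rebalancing; a minimal sketch (the struct and field names here are hypothetical):

struct foo {
	struct rb_node node;
	/* payload ... */
};

static void free_foo_tree(struct rb_root *root)
{
	struct foo *pos, *n;

	/*
	 * Post-order visits children before their parent, and @n is fetched
	 * before the loop body runs, so freeing @pos is safe.
	 */
	rbtree_postorder_for_each_entry_safe(pos, n, root, node)
		kfree(pos);
	*root = RB_ROOT;
}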
TTU_UNMAP = 1, /* unmap mode */
TTU_MIGRATION = 2, /* migration mode */
TTU_MUNLOCK = 4, /* munlock mode */
+ TTU_FREE = 8, /* free mode */
TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
struct anon_vma *page_get_anon_vma(struct page *page);
+/* bitflags for do_page_add_anon_rmap() */
+#define RMAP_EXCLUSIVE 0x01
+#define RMAP_COMPOUND 0x02
+
/*
* rmap interfaces called when adding or removing pte of page
*/
void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
-void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
void page_add_file_rmap(struct page *);
-void page_remove_rmap(struct page *);
+void page_remove_rmap(struct page *, bool);
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
-static inline void page_dup_rmap(struct page *page)
+static inline void page_dup_rmap(struct page *page, bool compound)
{
- atomic_inc(&page->_mapcount);
+ atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
}
/*
void __user *buffer,
size_t *lenp, loff_t *ppos);
extern unsigned int softlockup_panic;
+extern unsigned int hardlockup_panic;
void lockup_detector_init(void);
#else
static inline void touch_softlockup_watchdog(void)
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
-
+#ifdef CONFIG_MEMCG
+ unsigned memcg_may_oom:1;
+#endif
#ifdef CONFIG_MEMCG_KMEM
unsigned memcg_kmem_skip_account:1;
#endif
unsigned long sas_ss_sp;
size_t sas_ss_size;
- int (*notifier)(void *priv);
- void *notifier_data;
- sigset_t *notifier_mask;
+
struct callback_head *task_works;
struct audit_context *audit_context;
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_MEMCG
- struct memcg_oom_info {
- struct mem_cgroup *memcg;
- gfp_t gfp_mask;
- int order;
- unsigned int may_oom:1;
- } memcg_oom;
+ struct mem_cgroup *memcg_in_oom;
+ gfp_t memcg_oom_gfp_mask;
+ int memcg_oom_order;
+
+ /* number of pages to reclaim on returning to userland */
+ unsigned int memcg_nr_pages_over_high;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
extern void flush_signal_handlers(struct task_struct *, int force_default);
extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
-static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
+static inline int kernel_dequeue_signal(siginfo_t *info)
{
- unsigned long flags;
+ struct task_struct *tsk = current;
+ siginfo_t __info;
int ret;
- spin_lock_irqsave(&tsk->sighand->siglock, flags);
- ret = dequeue_signal(tsk, mask, info);
- spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+ spin_lock_irq(&tsk->sighand->siglock);
+ ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
+ spin_unlock_irq(&tsk->sighand->siglock);
return ret;
}
-extern void block_all_signals(int (*notifier)(void *priv), void *priv,
- sigset_t *mask);
-extern void unblock_all_signals(void);
+static inline void kernel_signal_stop(void)
+{
+	spin_lock_irq(&current->sighand->siglock);
+ if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+ __set_current_state(TASK_STOPPED);
+	spin_unlock_irq(&current->sighand->siglock);
+
+ schedule();
+}
+
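A sketch of the intended callers: kernel threads that service a signal themselves instead of returning to userspace (the SIGKILL/allow_signal() pairing below is illustrative):

	/* In a kthread that earlier opted in via allow_signal(SIGKILL): */
	if (signal_pending(current)) {
		kernel_dequeue_signal(NULL);	/* recalculates TIF_SIGPENDING */
		return -EINTR;
	}
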
extern void release_task(struct task_struct * p);
extern int send_sig_info(int, struct siginfo *, struct task_struct *);
extern int force_sigsegv(int, struct task_struct *);
static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
{
- might_sleep_if(pri & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(pri));
if (skb_cloned(skb))
return pskb_expand_head(skb, 0, 0, pri);
*/
static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
{
- might_sleep_if(pri & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(pri));
if (skb_shared(skb)) {
struct sk_buff *nskb = skb_clone(skb, pri);
static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
gfp_t pri)
{
- might_sleep_if(pri & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(pri));
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, pri);
* struct kmem_cache related prototypes
*/
void __init kmem_cache_init(void);
-int slab_is_available(void);
+bool slab_is_available(void);
struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
unsigned long,
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif
+/*
+ * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
+ * Intended for arches that get misalignment faults even for 64 bit integer
+ * aligned buffers.
+ */
+#ifndef ARCH_SLAB_MINALIGN
+#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
+#endif
+
+/*
+ * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned
+ * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN
+ * aligned pointers.
+ */
+#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN)
+#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN)
+#define __assume_page_alignment __assume_aligned(PAGE_SIZE)
+
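These macros are assumed to expand to the compiler's assume_aligned function attribute (available since GCC 4.9), which lets the optimizer rely on the allocator's alignment guarantee at every call site. A standalone illustration of the underlying attribute (my_alloc is a made-up name):

/* The optimizer may now assume my_alloc() returns 16-byte aligned memory. */
void *my_alloc(unsigned long size) __attribute__((assume_aligned(16)));
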
/*
* Kmalloc array related definitions
*/
}
#endif /* !CONFIG_SLOB */
-void *__kmalloc(size_t size, gfp_t flags);
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment;
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment;
void kmem_cache_free(struct kmem_cache *, void *);
/*
bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
#ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node);
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
#else
static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
#endif
#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
+extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment;
#ifdef CONFIG_NUMA
extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
- int node, size_t size);
+ int node, size_t size) __assume_slab_alignment;
#else
static __always_inline void *
kmem_cache_alloc_node_trace(struct kmem_cache *s,
}
#endif /* CONFIG_TRACING */
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order);
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
#ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
#else
static __always_inline void *
kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
return __kmalloc_node(size, flags, node);
}
-/*
- * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
- * Intended for arches that get misalignment faults even for 64 bit integer
- * aligned buffers.
- */
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
-#endif
-
struct memcg_cache_array {
struct rcu_head rcu;
struct kmem_cache *entries[0];
extern char *kstrdup(const char *s, gfp_t gfp);
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */
};
-#define SWAP_CLUSTER_MAX 32UL
+#define SWAP_CLUSTER_MAX 256UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
/*
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
extern void swap_setup(void);
extern void add_page_to_unevictable_list(struct page *page);
return 0;
}
-#define reuse_swap_page(page) (page_mapcount(page) == 1)
+#define reuse_swap_page(page) \
+ (!PageTransCompound(page) && page_mapcount(page) == 1)
static inline int try_to_free_swap(struct page *page)
{
asmlinkage long sys_membarrier(int cmd, int flags);
+asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
+
#endif
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/task_work.h>
+#include <linux/memcontrol.h>
struct linux_binprm;
/*
smp_mb__after_atomic();
if (unlikely(current->task_works))
task_work_run();
+
+ mem_cgroup_handle_over_high();
}
#endif /* <linux/tracehook.h> */
* struct callback_head - callback structure for use with RCU and task_work
* @next: next update requests in a list
* @func: actual update function to call after the grace period.
+ *
+ * The struct is aligned to the size of a pointer. On most architectures this
+ * happens naturally due to ABI requirements, but some architectures (like
+ * CRIS) have a weird ABI and we need to ask for it explicitly.
+ *
+ * The alignment is required to guarantee that bits 0 and 1 of @next will be
+ * clear under normal conditions -- as long as we use call_rcu(),
+ * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue the callback.
+ *
+ * This guarantee is important for a few reasons:
+ *  - future call_rcu_lazy() will make use of lower bits in the pointer;
+ *  - the structure shares storage space in struct page with @compound_head,
+ *    which encodes PageTail() in bit 0. The guarantee is needed to avoid
+ *    false-positive PageTail().
*/
struct callback_head {
struct callback_head *next;
void (*func)(struct callback_head *head);
-};
+} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head
typedef void (*rcu_callback_t)(struct rcu_head *head);
#endif /* ARCH_HAS_NOCACHE_UACCESS */
-/**
- * probe_kernel_address(): safely attempt to read from a location
- * @addr: address to read from - its type is type typeof(retval)*
- * @retval: read into this variable
- *
- * Safely read from address @addr into variable @revtal. If a kernel fault
- * happens, handle that and return -EFAULT.
- * We ensure that the __get_user() is executed in atomic context so that
- * do_page_fault() doesn't attempt to take mmap_sem. This makes
- * probe_kernel_address() suitable for use within regions where the caller
- * already holds mmap_sem, or other locks which nest inside mmap_sem.
- * This must be a macro because __get_user() needs to know the types of the
- * args.
- *
- * We don't include enough header files to be able to do the set_fs(). We
- * require that the probe_kernel_address() caller will do that.
- */
-#define probe_kernel_address(addr, retval) \
- ({ \
- long ret; \
- mm_segment_t old_fs = get_fs(); \
- \
- set_fs(KERNEL_DS); \
- pagefault_disable(); \
- ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \
- pagefault_enable(); \
- set_fs(old_fs); \
- ret; \
- })
-
/*
* probe_kernel_read(): safely attempt to read from a location
* @dst: pointer to the buffer that shall take the data
extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
+/**
+ * probe_kernel_address(): safely attempt to read from a location
+ * @addr: address to read from
+ * @retval: read into this variable
+ *
+ * Returns 0 on success, or -EFAULT.
+ */
+#define probe_kernel_address(addr, retval) \
+ probe_kernel_read(&retval, addr, sizeof(retval))
+
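Usage stays the same as with the old macro; for example, peeking at a kernel address that may not be mapped (illustrative; "ip" is a stand-in for whatever address is being probed):

	unsigned long insn;

	if (probe_kernel_address((unsigned long *)ip, insn))
		return;		/* the read faulted; bail out */
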
#endif /* __LINUX_UACCESS_H__ */
#endif
#ifdef CONFIG_HIGHMEM
-#define HIGHMEM_ZONE(xx) , xx##_HIGH
+#define HIGHMEM_ZONE(xx) xx##_HIGH,
#else
#define HIGHMEM_ZONE(xx)
#endif
-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
FOR_ALL_ZONES(PGALLOC),
PGFREE, PGACTIVATE, PGDEACTIVATE,
PGFAULT, PGMAJFAULT,
+ PGLAZYFREED,
FOR_ALL_ZONES(PGREFILL),
FOR_ALL_ZONES(PGSTEAL_KSWAPD),
FOR_ALL_ZONES(PGSTEAL_DIRECT),
THP_FAULT_FALLBACK,
THP_COLLAPSE_ALLOC,
THP_COLLAPSE_ALLOC_FAILED,
- THP_SPLIT,
+ THP_SPLIT_PAGE,
+ THP_SPLIT_PAGE_FAILED,
+ THP_SPLIT_PMD,
THP_ZERO_PAGE_ALLOC,
THP_ZERO_PAGE_ALLOC_FAILED,
#endif
}
#ifdef CONFIG_NUMA
-/*
- * Determine the per node value of a stat item. This function
- * is called frequently in a NUMA machine, so try to be as
- * frugal as possible.
- */
-static inline unsigned long node_page_state(int node,
- enum zone_stat_item item)
-{
- struct zone *zones = NODE_DATA(node)->node_zones;
-
- return
-#ifdef CONFIG_ZONE_DMA
- zone_page_state(&zones[ZONE_DMA], item) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
- zone_page_state(&zones[ZONE_DMA32], item) +
-#endif
-#ifdef CONFIG_HIGHMEM
- zone_page_state(&zones[ZONE_HIGHMEM], item) +
-#endif
- zone_page_state(&zones[ZONE_NORMAL], item) +
- zone_page_state(&zones[ZONE_MOVABLE], item);
-}
+extern unsigned long node_page_state(int node, enum zone_stat_item item);
extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
#else
bool zpool_has_pool(char *type);
-struct zpool *zpool_create_pool(char *type, char *name,
+struct zpool *zpool_create_pool(const char *type, const char *name,
gfp_t gfp, const struct zpool_ops *ops);
-char *zpool_get_type(struct zpool *pool);
+const char *zpool_get_type(struct zpool *pool);
void zpool_destroy_pool(struct zpool *pool);
atomic_t refcount;
struct list_head list;
- void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops,
+ void *(*create)(const char *name,
+ gfp_t gfp,
+ const struct zpool_ops *ops,
struct zpool *zpool);
void (*destroy)(void *pool);
struct zs_pool;
-struct zs_pool *zs_create_pool(char *name, gfp_t flags);
+struct zs_pool *zs_create_pool(const char *name, gfp_t flags);
void zs_destroy_pool(struct zs_pool *pool);
unsigned long zs_malloc(struct zs_pool *pool, size_t size);
An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
much faster. Usage example:
- uLong adler = adler32(0L, NULL, 0);
+ uLong adler = zlib_adler32(0L, NULL, 0);
while (read_buffer(buffer, length) != EOF) {
- adler = adler32(adler, buffer, length);
+ adler = zlib_adler32(adler, buffer, length);
}
if (adler != original_adler) error();
*/
*/
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
- if (sk->sk_allocation & __GFP_WAIT)
+ if (gfpflags_allow_blocking(sk->sk_allocation))
		return &current->task_frag;
return &sk->sk_frag;
#include <linux/tracepoint.h>
#include <trace/events/gfpflags.h>
+#define COMPACTION_STATUS \
+ EM( COMPACT_DEFERRED, "deferred") \
+ EM( COMPACT_SKIPPED, "skipped") \
+ EM( COMPACT_CONTINUE, "continue") \
+ EM( COMPACT_PARTIAL, "partial") \
+ EM( COMPACT_COMPLETE, "complete") \
+ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \
+ EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \
+ EMe(COMPACT_CONTENDED, "contended")
+
+#ifdef CONFIG_ZONE_DMA
+#define IFDEF_ZONE_DMA(X) X
+#else
+#define IFDEF_ZONE_DMA(X)
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+#define IFDEF_ZONE_DMA32(X) X
+#else
+#define IFDEF_ZONE_DMA32(X)
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#define IFDEF_ZONE_HIGHMEM(X) X
+#else
+#define IFDEF_ZONE_HIGHMEM(X)
+#endif
+
+#define ZONE_TYPE \
+ IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \
+ IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \
+ EM (ZONE_NORMAL, "Normal") \
+ IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \
+ EMe(ZONE_MOVABLE,"Movable")
+
+/*
+ * First define the enums in the above macros to be exported to userspace
+ * via TRACE_DEFINE_ENUM().
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b) TRACE_DEFINE_ENUM(a);
+
+COMPACTION_STATUS
+ZONE_TYPE
+
+/*
+ * Now redefine the EM() and EMe() macros to map the enums to the strings
+ * that will be printed in the output.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) {a, b},
+#define EMe(a, b) {a, b}
+
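The EM()/EMe() dance above is the usual two-pass X-macro trick: a single list expanded once to register the enum values with the tracing core and once to build the value-to-string table. A standalone sketch of the same pattern (all names made up):

#include <stdio.h>

#define STATUS_LIST \
	EM( STAT_OK,  "ok")  \
	EMe(STAT_BAD, "bad")

/* Pass 1: emit the enumerators themselves. */
#define EM(a, b)	a,
#define EMe(a, b)	a
enum status { STATUS_LIST };
#undef EM
#undef EMe

/* Pass 2: emit a value -> name table from the very same list. */
#define EM(a, b)	{ a, b },
#define EMe(a, b)	{ a, b }
static const struct { int val; const char *name; } status_names[] = {
	STATUS_LIST
};

int main(void)
{
	printf("%s\n", status_names[STAT_BAD].name);	/* prints "bad" */
	return 0;
}
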
DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
TP_PROTO(
__entry->free_pfn,
__entry->zone_end,
__entry->sync ? "sync" : "async",
- compaction_status_string[__entry->status])
+ __print_symbolic(__entry->status, COMPACTION_STATUS))
);
TRACE_EVENT(mm_compaction_try_to_compact_pages,
TP_STRUCT__entry(
__field(int, nid)
- __field(char *, name)
+ __field(enum zone_type, idx)
__field(int, order)
__field(int, ret)
),
TP_fast_assign(
__entry->nid = zone_to_nid(zone);
- __entry->name = (char *)zone->name;
+ __entry->idx = zone_idx(zone);
__entry->order = order;
__entry->ret = ret;
),
TP_printk("node=%d zone=%-8s order=%d ret=%s",
__entry->nid,
- __entry->name,
+ __print_symbolic(__entry->idx, ZONE_TYPE),
__entry->order,
- compaction_status_string[__entry->ret])
+ __print_symbolic(__entry->ret, COMPACTION_STATUS))
);
DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished,
TP_STRUCT__entry(
__field(int, nid)
- __field(char *, name)
+ __field(enum zone_type, idx)
__field(int, order)
__field(unsigned int, considered)
__field(unsigned int, defer_shift)
TP_fast_assign(
__entry->nid = zone_to_nid(zone);
- __entry->name = (char *)zone->name;
+ __entry->idx = zone_idx(zone);
__entry->order = order;
__entry->considered = zone->compact_considered;
__entry->defer_shift = zone->compact_defer_shift;
TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu",
__entry->nid,
- __entry->name,
+ __print_symbolic(__entry->idx, ZONE_TYPE),
__entry->order,
__entry->order_failed,
__entry->considered,
{(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \
{(unsigned long)GFP_NOIO, "GFP_NOIO"}, \
{(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \
- {(unsigned long)__GFP_WAIT, "GFP_WAIT"}, \
+ {(unsigned long)__GFP_ATOMIC, "GFP_ATOMIC"}, \
{(unsigned long)__GFP_IO, "GFP_IO"}, \
{(unsigned long)__GFP_COLD, "GFP_COLD"}, \
{(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \
{(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
{(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
{(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
- {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \
+ {(unsigned long)__GFP_DIRECT_RECLAIM, "GFP_DIRECT_RECLAIM"}, \
+ {(unsigned long)__GFP_KSWAPD_RECLAIM, "GFP_KSWAPD_RECLAIM"}, \
{(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \
) : "GFP_NOWAIT"
--- /dev/null
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM huge_memory
+
+#if !defined(__HUGE_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HUGE_MEMORY_H
+
+#include <linux/tracepoint.h>
+
+#include <trace/events/gfpflags.h>
+
+#define SCAN_STATUS \
+ EM( SCAN_FAIL, "failed") \
+ EM( SCAN_SUCCEED, "succeeded") \
+ EM( SCAN_PMD_NULL, "pmd_null") \
+ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
+ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
+ EM( SCAN_PAGE_RO, "no_writable_page") \
+ EM( SCAN_NO_REFERENCED_PAGE, "no_referenced_page") \
+ EM( SCAN_PAGE_NULL, "page_null") \
+ EM( SCAN_SCAN_ABORT, "scan_aborted") \
+ EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \
+ EM( SCAN_PAGE_LRU, "page_not_in_lru") \
+ EM( SCAN_PAGE_LOCK, "page_locked") \
+ EM( SCAN_PAGE_ANON, "page_not_anon") \
+ EM( SCAN_PAGE_COMPOUND, "page_compound") \
+ EM( SCAN_ANY_PROCESS, "no_process_for_page") \
+ EM( SCAN_VMA_NULL, "vma_null") \
+ EM( SCAN_VMA_CHECK, "vma_check_failed") \
+ EM( SCAN_ADDRESS_RANGE, "not_suitable_address_range") \
+ EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \
+ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
+ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
+	EM( SCAN_CGROUP_CHARGE_FAIL,	"cgroup_charge_failed")		\
+ EMe( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte")
+
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b) TRACE_DEFINE_ENUM(a);
+
+SCAN_STATUS
+
+#undef EM
+#undef EMe
+#define EM(a, b) {a, b},
+#define EMe(a, b) {a, b}
+
+TRACE_EVENT(mm_khugepaged_scan_pmd,
+
+ TP_PROTO(struct mm_struct *mm, unsigned long pfn, bool writable,
+ bool referenced, int none_or_zero, int status, int unmapped),
+
+ TP_ARGS(mm, pfn, writable, referenced, none_or_zero, status, unmapped),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(unsigned long, pfn)
+ __field(bool, writable)
+ __field(bool, referenced)
+ __field(int, none_or_zero)
+ __field(int, status)
+ __field(int, unmapped)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->pfn = pfn;
+ __entry->writable = writable;
+ __entry->referenced = referenced;
+ __entry->none_or_zero = none_or_zero;
+ __entry->status = status;
+ __entry->unmapped = unmapped;
+ ),
+
+ TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
+ __entry->mm,
+ __entry->pfn,
+ __entry->writable,
+ __entry->referenced,
+ __entry->none_or_zero,
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->unmapped)
+);
+
+TRACE_EVENT(mm_collapse_huge_page,
+
+ TP_PROTO(struct mm_struct *mm, int isolated, int status),
+
+ TP_ARGS(mm, isolated, status),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(int, isolated)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->isolated = isolated;
+ __entry->status = status;
+ ),
+
+ TP_printk("mm=%p, isolated=%d, status=%s",
+ __entry->mm,
+ __entry->isolated,
+ __print_symbolic(__entry->status, SCAN_STATUS))
+);
+
+TRACE_EVENT(mm_collapse_huge_page_isolate,
+
+ TP_PROTO(unsigned long pfn, int none_or_zero,
+ bool referenced, bool writable, int status),
+
+ TP_ARGS(pfn, none_or_zero, referenced, writable, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(int, none_or_zero)
+ __field(bool, referenced)
+ __field(bool, writable)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = pfn;
+ __entry->none_or_zero = none_or_zero;
+ __entry->referenced = referenced;
+ __entry->writable = writable;
+ __entry->status = status;
+ ),
+
+ TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s",
+ __entry->pfn,
+ __entry->none_or_zero,
+ __entry->referenced,
+ __entry->writable,
+ __print_symbolic(__entry->status, SCAN_STATUS))
+);
+
+TRACE_EVENT(mm_collapse_huge_page_swapin,
+
+ TP_PROTO(struct mm_struct *mm, int swapped_in, int ret),
+
+ TP_ARGS(mm, swapped_in, ret),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(int, swapped_in)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->swapped_in = swapped_in;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("mm=%p, swapped_in=%d, ret=%d",
+ __entry->mm,
+ __entry->swapped_in,
+ __entry->ret)
+);
+
+#endif /* __HUGE_MEMORY_H */
+#include <trace/define_trace.h>
+
--- /dev/null
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nilfs2
+
+#if !defined(_TRACE_NILFS2_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NILFS2_H
+
+#include <linux/tracepoint.h>
+
+struct nilfs_sc_info;
+
+#define show_collection_stage(type) \
+ __print_symbolic(type, \
+ { NILFS_ST_INIT, "ST_INIT" }, \
+ { NILFS_ST_GC, "ST_GC" }, \
+ { NILFS_ST_FILE, "ST_FILE" }, \
+ { NILFS_ST_IFILE, "ST_IFILE" }, \
+ { NILFS_ST_CPFILE, "ST_CPFILE" }, \
+ { NILFS_ST_SUFILE, "ST_SUFILE" }, \
+ { NILFS_ST_DAT, "ST_DAT" }, \
+ { NILFS_ST_SR, "ST_SR" }, \
+ { NILFS_ST_DSYNC, "ST_DSYNC" }, \
+ { NILFS_ST_DONE, "ST_DONE"})
+
+TRACE_EVENT(nilfs2_collection_stage_transition,
+
+ TP_PROTO(struct nilfs_sc_info *sci),
+
+ TP_ARGS(sci),
+
+ TP_STRUCT__entry(
+ __field(void *, sci)
+ __field(int, stage)
+ ),
+
+ TP_fast_assign(
+ __entry->sci = sci;
+ __entry->stage = sci->sc_stage.scnt;
+ ),
+
+ TP_printk("sci = %p stage = %s",
+ __entry->sci,
+ show_collection_stage(__entry->stage))
+);
+
+#ifndef TRACE_HEADER_MULTI_READ
+enum nilfs2_transaction_transition_state {
+ TRACE_NILFS2_TRANSACTION_BEGIN,
+ TRACE_NILFS2_TRANSACTION_COMMIT,
+ TRACE_NILFS2_TRANSACTION_ABORT,
+ TRACE_NILFS2_TRANSACTION_TRYLOCK,
+ TRACE_NILFS2_TRANSACTION_LOCK,
+ TRACE_NILFS2_TRANSACTION_UNLOCK,
+};
+#endif
+
+#define show_transaction_state(type) \
+ __print_symbolic(type, \
+ { TRACE_NILFS2_TRANSACTION_BEGIN, "BEGIN" }, \
+ { TRACE_NILFS2_TRANSACTION_COMMIT, "COMMIT" }, \
+ { TRACE_NILFS2_TRANSACTION_ABORT, "ABORT" }, \
+ { TRACE_NILFS2_TRANSACTION_TRYLOCK, "TRYLOCK" }, \
+ { TRACE_NILFS2_TRANSACTION_LOCK, "LOCK" }, \
+ { TRACE_NILFS2_TRANSACTION_UNLOCK, "UNLOCK" })
+
+TRACE_EVENT(nilfs2_transaction_transition,
+ TP_PROTO(struct super_block *sb,
+ struct nilfs_transaction_info *ti,
+ int count,
+ unsigned int flags,
+ enum nilfs2_transaction_transition_state state),
+
+ TP_ARGS(sb, ti, count, flags, state),
+
+ TP_STRUCT__entry(
+ __field(void *, sb)
+ __field(void *, ti)
+ __field(int, count)
+ __field(unsigned int, flags)
+ __field(int, state)
+ ),
+
+ TP_fast_assign(
+ __entry->sb = sb;
+ __entry->ti = ti;
+ __entry->count = count;
+ __entry->flags = flags;
+ __entry->state = state;
+ ),
+
+ TP_printk("sb = %p ti = %p count = %d flags = %x state = %s",
+ __entry->sb,
+ __entry->ti,
+ __entry->count,
+ __entry->flags,
+ show_transaction_state(__entry->state))
+);
+
+TRACE_EVENT(nilfs2_segment_usage_check,
+ TP_PROTO(struct inode *sufile,
+ __u64 segnum,
+ unsigned long cnt),
+
+ TP_ARGS(sufile, segnum, cnt),
+
+ TP_STRUCT__entry(
+ __field(struct inode *, sufile)
+ __field(__u64, segnum)
+ __field(unsigned long, cnt)
+ ),
+
+ TP_fast_assign(
+ __entry->sufile = sufile;
+ __entry->segnum = segnum;
+ __entry->cnt = cnt;
+ ),
+
+ TP_printk("sufile = %p segnum = %llu cnt = %lu",
+ __entry->sufile,
+ __entry->segnum,
+ __entry->cnt)
+);
+
+TRACE_EVENT(nilfs2_segment_usage_allocated,
+ TP_PROTO(struct inode *sufile,
+ __u64 segnum),
+
+ TP_ARGS(sufile, segnum),
+
+ TP_STRUCT__entry(
+ __field(struct inode *, sufile)
+ __field(__u64, segnum)
+ ),
+
+ TP_fast_assign(
+ __entry->sufile = sufile;
+ __entry->segnum = segnum;
+ ),
+
+ TP_printk("sufile = %p segnum = %llu",
+ __entry->sufile,
+ __entry->segnum)
+);
+
+TRACE_EVENT(nilfs2_segment_usage_freed,
+ TP_PROTO(struct inode *sufile,
+ __u64 segnum),
+
+ TP_ARGS(sufile, segnum),
+
+ TP_STRUCT__entry(
+ __field(struct inode *, sufile)
+ __field(__u64, segnum)
+ ),
+
+ TP_fast_assign(
+ __entry->sufile = sufile;
+ __entry->segnum = segnum;
+ ),
+
+ TP_printk("sufile = %p segnum = %llu",
+ __entry->sufile,
+ __entry->segnum)
+);
+
+TRACE_EVENT(nilfs2_mdt_insert_new_block,
+ TP_PROTO(struct inode *inode,
+ unsigned long ino,
+ unsigned long block),
+
+ TP_ARGS(inode, ino, block),
+
+ TP_STRUCT__entry(
+ __field(struct inode *, inode)
+ __field(unsigned long, ino)
+ __field(unsigned long, block)
+ ),
+
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ __entry->block = block;
+ ),
+
+ TP_printk("inode = %p ino = %lu block = %lu",
+ __entry->inode,
+ __entry->ino,
+ __entry->block)
+);
+
+TRACE_EVENT(nilfs2_mdt_submit_block,
+ TP_PROTO(struct inode *inode,
+ unsigned long ino,
+ unsigned long blkoff,
+ int mode),
+
+ TP_ARGS(inode, ino, blkoff, mode),
+
+ TP_STRUCT__entry(
+ __field(struct inode *, inode)
+ __field(unsigned long, ino)
+ __field(unsigned long, blkoff)
+ __field(int, mode)
+ ),
+
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ __entry->blkoff = blkoff;
+ __entry->mode = mode;
+ ),
+
+ TP_printk("inode = %p ino = %lu blkoff = %lu mode = %x",
+ __entry->inode,
+ __entry->ino,
+ __entry->blkoff,
+ __entry->mode)
+);
+
+#endif /* _TRACE_NILFS2_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE nilfs2
+#include <trace/define_trace.h>
# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */
#endif
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */
+
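From userspace the flag pairs with the new mlock2() syscall; no libc wrapper exists at this point, so a raw syscall sketch (addr/length are placeholders, error handling omitted):

	/* Lock the range lazily: pages are pinned as they fault in, not upfront. */
	syscall(__NR_mlock2, addr, length, MLOCK_ONFAULT);
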
#define MS_ASYNC 1 /* sync memory asynchronously */
#define MS_INVALIDATE 2 /* invalidate the caches */
#define MS_SYNC 4 /* synchronous memory sync */
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
+#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
#endif /* __ASM_GENERIC_MMAN_H */
__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
#define __NR_membarrier 283
__SYSCALL(__NR_membarrier, sys_membarrier)
+#define __NR_mlock2 284
+__SYSCALL(__NR_mlock2, sys_mlock2)
#undef __NR_syscalls
-#define __NR_syscalls 284
+#define __NR_syscalls 285
/*
* All syscalls below here should go away really,
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
+#include <linux/freezer.h>
#include <asm/current.h>
#include <linux/uaccess.h>
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
rcu_read_lock();
ipc_lock_object(&msq->q_perm);
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
/* Lockless receive, part 1:
* Disable preemption. We don't hold a reference to the queue
size_t len = src->m_ts;
size_t alen;
- WARN_ON(dst == NULL);
if (src->m_ts > dst->m_ts)
return ERR_PTR(-EINVAL);
if (unlikely(audit_filter_type(type)))
return NULL;
- if (gfp_mask & __GFP_WAIT) {
+ if (gfp_mask & __GFP_DIRECT_RECLAIM) {
if (audit_pid && audit_pid == current->pid)
- gfp_mask &= ~__GFP_WAIT;
+ gfp_mask &= ~__GFP_DIRECT_RECLAIM;
else
reserve = 0;
}
while (audit_backlog_limit
&& skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+ if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
long sleep_time;
sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
idr_preload(gfp_mask);
spin_lock_bh(&cgroup_idr_lock);
- ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
spin_unlock_bh(&cgroup_idr_lock);
idr_preload_end();
return ret;
}
/**
- * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @tsk: pointer to task_struct of some task.
+ * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
*
- * Description: Prints @task's name, cpuset name, and cached copy of its
+ * Description: Prints current's name, cpuset name, and cached copy of its
* mems_allowed to the kernel log.
*/
-void cpuset_print_task_mems_allowed(struct task_struct *tsk)
+void cpuset_print_current_mems_allowed(void)
{
struct cgroup *cgrp;
rcu_read_lock();
- cgrp = task_cs(tsk)->css.cgroup;
- pr_info("%s cpuset=", tsk->comm);
+ cgrp = task_cs(current)->css.cgroup;
+ pr_info("%s cpuset=", current->comm);
pr_cont_cgroup_name(cgrp);
- pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
+ pr_cont(" mems_allowed=%*pbl\n",
+		nodemask_pr_args(&current->mems_allowed));
rcu_read_unlock();
}
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+ false);
if (err)
return err;
goto unlock;
get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr);
- mem_cgroup_commit_charge(kpage, memcg, false);
+ page_add_new_anon_rmap(kpage, vma, addr, false);
+ mem_cgroup_commit_charge(kpage, memcg, false, false);
lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
pte_unmap_unlock(ptep, ptl);
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
+ mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
+ tmp->vm_flags &=
+ ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
tmp->vm_next = tmp->vm_prev = NULL;
tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *page_head;
+ struct page *page;
int err, ro = 0;
/*
else
err = 0;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page_head = page;
- if (unlikely(PageTail(page))) {
- put_page(page);
- /* serialize against __split_huge_page_splitting() */
- local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
- page_head = compound_head(page);
- /*
- * page_head is valid pointer but we must pin
- * it before taking the PG_lock and/or
- * PG_compound_lock. The moment we re-enable
- * irqs __split_huge_page_splitting() can
- * return and the head page can be freed from
- * under us. We can't take the PG_lock and/or
- * PG_compound_lock on a page that could be
- * freed from under us.
- */
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
- local_irq_enable();
- } else {
- local_irq_enable();
- goto again;
- }
- }
-#else
- page_head = compound_head(page);
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
-#endif
-
- lock_page(page_head);
-
+ lock_page(page);
/*
- * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page_head->mapping.
+ * an unlikely race, but we do need to retry for page->mapping.
*/
- if (!page_head->mapping) {
- int shmem_swizzled = PageSwapCache(page_head);
- unlock_page(page_head);
- put_page(page_head);
+ if (!page->mapping) {
+ int shmem_swizzled = PageSwapCache(page);
+ unlock_page(page);
+ put_page(page);
if (shmem_swizzled)
goto again;
return -EFAULT;
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page_head)) {
+ if (PageAnon(page)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page_head->mapping->host;
+ key->shared.inode = page->mapping->host;
key->shared.pgoff = basepage_index(page);
}
get_futex_key_refs(key); /* implies MB (B) */
out:
- unlock_page(page_head);
- put_page(page_head);
+ unlock_page(page);
+ put_page(page);
return err;
}
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
* Version 2. See the file COPYING for more details.
*/
-#define pr_fmt(fmt) "kexec: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/capability.h>
#include <linux/mm.h>
crash_notes = __alloc_percpu(size, align);
if (!crash_notes) {
- pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+ pr_warn("Memory allocation for saving cpu register states failed\n");
return -ENOMEM;
}
return 0;
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
return;
/* no reclaim without waiting on it */
- if (!(gfp_mask & __GFP_WAIT))
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return;
/* this guy won't enter reclaim */
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
+#include <linux/console.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
bust_spinlocks(0);
+ /*
+ * We may have ended up stopping the CPU holding the lock (in
+ * smp_send_stop()) while still having some valuable data in the console
+ * buffer. Try to acquire the lock then release it regardless of the
+ * result. The release will also print the buffers out.
+ */
+ console_trylock();
+ console_unlock();
+
if (!panic_blink)
panic_blink = no_blink;
}
EXPORT_SYMBOL(param_get_charp);
-static void param_free_charp(void *arg)
+void param_free_charp(void *arg)
{
maybe_kfree_parameter(*((char **)arg));
}
+EXPORT_SYMBOL(param_free_charp);
const struct kernel_param_ops param_ops_charp = {
.set = param_set_charp,
while (to_alloc-- > 0) {
struct page *page;
- page = alloc_image_page(__GFP_HIGHMEM);
+ page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM);
memory_bm_set_bit(bm, page_to_pfn(page));
}
return nr_highmem;
struct bio *bio;
int error = 0;
- bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+ bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
bio->bi_bdev = hib_resume_bdev;
return -ENOSPC;
if (hb) {
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
+ src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
__GFP_NORETRY);
if (src) {
copy_page(src, buf);
ret = hib_wait_io(hb); /* Free pages */
if (ret)
return ret;
- src = (void *)__get_free_page(__GFP_WAIT |
+ src = (void *)__get_free_page(__GFP_RECLAIM |
__GFP_NOWARN |
__GFP_NORETRY);
if (src) {
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
if (!page) {
printk(KERN_ERR "PM: Failed to allocate LZO page\n");
ret = -ENOMEM;
last = tmp;
tmp->map = (struct swap_map_page *)
- __get_free_page(__GFP_WAIT | __GFP_HIGH);
+ __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
if (!tmp->map) {
release_swap_reader(handle);
return -ENOMEM;
for (i = 0; i < read_pages; i++) {
page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
- __GFP_WAIT | __GFP_HIGH :
- __GFP_WAIT | __GFP_NOWARN |
- __GFP_NORETRY);
+ __GFP_RECLAIM | __GFP_HIGH :
+ __GFP_RECLAIM | __GFP_NOWARN |
+ __GFP_NORETRY);
if (!page[i]) {
if (i < LZO_CMP_PAGES) {
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- node = cpu_to_mem(cpu);
+ node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
page = __alloc_pages_node(node,
return !tsk->ptrace;
}
-/*
- * Notify the system that a driver wants to block all signals for this
- * process, and wants to be notified if any signals at all were to be
- * sent/acted upon. If the notifier routine returns non-zero, then the
- * signal will be acted upon after all. If the notifier routine returns 0,
- * then then signal will be blocked. Only one block per process is
- * allowed. priv is a pointer to private data that the notifier routine
- * can use to determine if the signal should be blocked or not.
- */
-void
-block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
-{
- unsigned long flags;
-
-	spin_lock_irqsave(&current->sighand->siglock, flags);
- current->notifier_mask = mask;
- current->notifier_data = priv;
- current->notifier = notifier;
-	spin_unlock_irqrestore(&current->sighand->siglock, flags);
-}
-
-/* Notify the system that blocking has ended. */
-
-void
-unblock_all_signals(void)
-{
- unsigned long flags;
-
-	spin_lock_irqsave(&current->sighand->siglock, flags);
- current->notifier = NULL;
- current->notifier_data = NULL;
- recalc_sigpending();
-	spin_unlock_irqrestore(&current->sighand->siglock, flags);
-}
-
static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
{
struct sigqueue *q, *first = NULL;
{
int sig = next_signal(pending, mask);
- if (sig) {
- if (current->notifier) {
- if (sigismember(current->notifier_mask, sig)) {
- if (!(current->notifier)(current->notifier_data)) {
- clear_thread_flag(TIF_SIGPENDING);
- return 0;
- }
- }
- }
-
+ if (sig)
collect_signal(sig, pending, info);
- }
-
return sig;
}
sigset_t flush;
if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
- if (signal->flags & SIGNAL_GROUP_COREDUMP)
+ if (!(signal->flags & SIGNAL_GROUP_EXIT))
return sig == SIGKILL;
/*
* The process is in the middle of dying, nothing to do.
EXPORT_SYMBOL(send_sig);
EXPORT_SYMBOL(send_sig_info);
EXPORT_SYMBOL(sigprocmask);
-EXPORT_SYMBOL(block_all_signals);
-EXPORT_SYMBOL(unblock_all_signals);
-
/*
* System call entry points.
cpumask_var_t cpus;
int cpu, ret;
- might_sleep_if(gfp_flags & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(gfp_flags));
if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
preempt_disable();
goto out_unlock; /* No processes for this user */
}
do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid))
+ if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
error = set_one_prio(p, niceval, error);
} while_each_thread(g, p);
if (!uid_eq(uid, cred->uid))
goto out_unlock; /* No processes for this user */
}
do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid)) {
+ if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
cond_syscall(sys_munlock);
cond_syscall(sys_mlockall);
cond_syscall(sys_munlockall);
+cond_syscall(sys_mlock2);
cond_syscall(sys_mincore);
cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
.extra1 = &zero,
.extra2 = &one,
},
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+ {
+ .procname = "hardlockup_panic",
+ .data = &hardlockup_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_SMP
{
.procname = "softlockup_all_cpu_backtrace",
.extra1 = &zero,
.extra2 = &one,
},
+ {
+ .procname = "hardlockup_all_cpu_backtrace",
+ .data = &sysctl_hardlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif /* CONFIG_SMP */
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
+#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
static struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int hardlockup_panic =
+unsigned int __read_mostly hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static unsigned long hardlockup_allcpu_dumped;
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+ sysctl_hardlockup_all_cpu_backtrace =
+ !!simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
#endif
/*
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */
-static int is_hardlockup(void)
+static bool is_hardlockup(void)
{
unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
- return 1;
+ return true;
__this_cpu_write(hrtimer_interrupts_saved, hrint);
- return 0;
+ return false;
}
#endif
*/
if (is_hardlockup()) {
int this_cpu = smp_processor_id();
+ struct pt_regs *regs = get_irq_regs();
/* only print hardlockups once */
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- if (hardlockup_panic)
- panic("Watchdog detected hard LOCKUP on cpu %d",
- this_cpu);
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
else
- WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
- this_cpu);
+ dump_stack();
+
+ /*
+ * Perform all-CPU dump only once to avoid multiple hardlockups
+ * generating interleaving traces
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace &&
+ !test_and_set_bit(0, &hardlockup_allcpu_dumped))
+ trigger_allbutself_cpu_backtrace();
+
+ if (hardlockup_panic)
+ panic("Hard LOCKUP");
__this_cpu_write(hard_watchdog_warn, true);
return;
static int watchdog_nmi_enable(unsigned int cpu);
static void watchdog_nmi_disable(unsigned int cpu);
+static int watchdog_enable_all_cpus(void);
+static void watchdog_disable_all_cpus(void);
+
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
/*
* park all watchdog threads that are specified in 'watchdog_cpumask'
+ *
+ * This function returns an error if kthread_park() of a watchdog thread
+ * fails. In this situation, the watchdog threads of some CPUs can already
+ * be parked and the watchdog threads of other CPUs can still be runnable.
+ * Callers are expected to handle this special condition as appropriate in
+ * their context.
*/
static int watchdog_park_threads(void)
{
if (ret)
break;
}
- if (ret) {
- for_each_watchdog_cpu(cpu)
- kthread_unpark(per_cpu(softlockup_watchdog, cpu));
- }
put_online_cpus();
return ret;
if (ret == 0)
watchdog_suspended++;
+ else {
+ watchdog_disable_all_cpus();
+ pr_err("Failed to suspend lockup detectors, disabled\n");
+ watchdog_enabled = 0;
+ }
mutex_unlock(&watchdog_proc_mutex);
mutex_unlock(&watchdog_proc_mutex);
}
-static void update_watchdog_all_cpus(void)
+static int update_watchdog_all_cpus(void)
{
- watchdog_park_threads();
+ int ret;
+
+ ret = watchdog_park_threads();
+ if (ret)
+ return ret;
+
watchdog_unpark_threads();
+
+ return 0;
}
static int watchdog_enable_all_cpus(void)
* Enable/disable the lockup detectors or
* change the sample period 'on the fly'.
*/
- update_watchdog_all_cpus();
+ err = update_watchdog_all_cpus();
+
+ if (err) {
+ watchdog_disable_all_cpus();
+ pr_err("Failed to update lockup detectors, disabled\n");
+ }
}
+ if (err)
+ watchdog_enabled = 0;
+
return err;
}
-/* prepare/enable/disable routines */
-/* sysctl functions */
-#ifdef CONFIG_SYSCTL
static void watchdog_disable_all_cpus(void)
{
if (watchdog_running) {
}
}
+#ifdef CONFIG_SYSCTL
+
/*
* Update the run state of the lockup detectors.
*/
} while (cmpxchg(&watchdog_enabled, old, new) != old);
/*
- * Update the run state of the lockup detectors.
- * Restore 'watchdog_enabled' on failure.
+ * Update the run state of the lockup detectors. There is _no_
+ * need to check the value returned by proc_watchdog_update()
+ * and to restore the previous value of 'watchdog_enabled' as
+ * both lockup detectors are disabled if proc_watchdog_update()
+ * returns an error.
*/
err = proc_watchdog_update();
- if (err)
- watchdog_enabled = old;
}
out:
mutex_unlock(&watchdog_proc_mutex);
goto out;
/*
- * Update the sample period.
- * Restore 'watchdog_thresh' on failure.
+ * Update the sample period. Restore on failure.
*/
set_sample_period();
err = proc_watchdog_update();
- if (err)
+ if (err) {
watchdog_thresh = old;
+ set_sample_period();
+ }
out:
mutex_unlock(&watchdog_proc_mutex);
return err;
when they need to do a cyclic redundancy check according to the CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+	  This option provides the CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
If unsure, say N.
+config DEBUG_VM_PGFLAGS
+ bool "Debug page-flags operations"
+ depends on DEBUG_VM
+ help
+ Enables extra validation on page flags operations.
+
+ If unsure, say N.
+
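[ A sketch of the kind of check this option is expected to gate; the
  VM_BUG_ON_PGFLAGS name mirrors the existing VM_BUG_ON_PAGE pattern and is
  an assumption here, not taken from the hunk above. ]

	#ifdef CONFIG_DEBUG_VM_PGFLAGS
	#define VM_BUG_ON_PGFLAGS(cond, page)	VM_BUG_ON_PAGE(cond, page)
	#else
	#define VM_BUG_ON_PGFLAGS(cond, page)	BUILD_BUG_ON_INVALID(cond)
	#endif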
config DEBUG_VIRTUAL
bool "Debug VM translations"
depends on DEBUG_KERNEL && X86
config TEST_KSTRTOX
tristate "Test kstrto*() family of functions at runtime"
+config TEST_PRINTF
+ tristate "Test printf() family of functions at runtime"
+
config TEST_RHASHTABLE
tristate "Perform selftest on resizable hash table"
default n
If unsure, say N.
+config DMA_API_DEBUG_POISON
+ bool "Poison coherent DMA buffers"
+ depends on DMA_API_DEBUG && EXPERT
+ help
+ Poison DMA buffers returned by dma_alloc_coherent unless __GFP_ZERO
+ is explicitly specified, to catch drivers depending on zeroed buffers
+ without passing the correct flags.
+
+ Only say Y if you're prepared for almost everything to break.
+
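[ Hypothetical driver fragment showing the intended contract: with this
  option enabled, a coherent buffer allocated without __GFP_ZERO comes back
  poisoned, so a driver that silently relied on zeroed memory fails fast.
  Requesting zeroed memory explicitly keeps the old behaviour: ]

	static int example_coherent_alloc(struct device *dev, size_t size)
	{
		dma_addr_t handle;
		void *buf;

		/* Ask for zeroed memory instead of assuming it. */
		buf = dma_alloc_coherent(dev, size, &handle,
					 GFP_KERNEL | __GFP_ZERO);
		if (!buf)
			return -ENOMEM;

		/* ... use the buffer ... */

		dma_free_coherent(dev, size, buf, handle);
		return 0;
	}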
config TEST_LKM
tristate "Test module loading with 'hello world' module"
default n
global variables requires gcc 5.0 or later.
This feature consumes about 1/8 of available memory and slows
performance down by roughly 3x.
- For better error detection enable CONFIG_STACKTRACE,
- and add slub_debug=U to boot cmdline.
+ For better error detection enable CONFIG_STACKTRACE.
choice
prompt "Instrumentation type"
obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o
obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o
obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
+obj-$(CONFIG_TEST_PRINTF) += test_printf.o
ifeq ($(CONFIG_DEBUG_KOBJECT),y)
CFLAGS_kobject.o += -DDEBUG
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
--- /dev/null
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Returns the default seed for the CRC64 ECMA computation.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
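[ A minimal usage sketch (hypothetical caller): start from the default seed
  and fold a buffer in. Because the returned CRC can be passed back in as
  the seed, the checksum may also be computed incrementally over several
  chunks. ]

	#include <linux/crc64_ecma.h>

	static u64 checksum_buf(const u8 *data, u32 len)
	{
		u64 crc = crc64_ecma_seed();

		return crc64_ecma(data, len, crc);
	}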
#include <linux/sched.h>
#include <linux/ctype.h>
#include <linux/list.h>
+#include <linux/poison.h>
#include <linux/slab.h>
#include <asm/sections.h>
dir2name[entry->direction],
dir2name[ref->direction]);
+ if (ref->sg_call_ents && ref->type == dma_debug_sg &&
+ ref->sg_call_ents != entry->sg_call_ents) {
+ err_printk(ref->dev, entry, "DMA-API: device driver syncs "
+ "DMA sg list with different entry count "
+ "[map count=%d] [sync count=%d]\n",
+ entry->sg_call_ents, ref->sg_call_ents);
+ }
+
out:
put_hash_bucket(bucket, &flags);
}
EXPORT_SYMBOL(debug_dma_unmap_sg);
void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt, gfp_t flags)
{
struct dma_debug_entry *entry;
if (unlikely(virt == NULL))
return;
+ if (IS_ENABLED(CONFIG_DMA_API_DEBUG_POISON) && !(flags & __GFP_ZERO))
+ memset(virt, DMA_ALLOC_POISON, size);
+
entry = dma_entry_alloc();
if (!entry)
return;
struct ddebug_table {
struct list_head link;
- char *mod_name;
+ const char *mod_name;
unsigned int num_ddebugs;
struct _ddebug *ddebugs;
};
const char *name)
{
struct ddebug_table *dt;
- char *new_name;
+ const char *new_name;
dt = kzalloc(sizeof(*dt), GFP_KERNEL);
if (dt == NULL)
return -ENOMEM;
- new_name = kstrdup(name, GFP_KERNEL);
+ new_name = kstrdup_const(name, GFP_KERNEL);
if (new_name == NULL) {
kfree(dt);
return -ENOMEM;
static void ddebug_table_free(struct ddebug_table *dt)
{
list_del_init(&dt->link);
- kfree(dt->mod_name);
+ kfree_const(dt->mod_name);
kfree(dt);
}
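[ For reference, a sketch of the const-aware helpers' semantics, assuming
  the mm/util.c implementation that tests whether the pointer lands in the
  kernel's .rodata section; when @s is a string literal, no allocation is
  made and kfree_const() becomes a no-op. ]

	const char *kstrdup_const(const char *s, gfp_t gfp)
	{
		if (is_kernel_rodata((unsigned long)s))
			return s;	/* shared, never actually freed */

		return kstrdup(s, gfp);
	}

	void kfree_const(const void *x)
	{
		if (!is_kernel_rodata((unsigned long)x))
			kfree(x);
	}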
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/cryptohash.h>
+#include <linux/bitops.h>
/* F, G and H are basic MD4 functions: selection, majority, parity */
#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
* Rotation is separate from addition to prevent recomputation
*/
#define ROUND(f, a, b, c, d, x, s) \
- (a += f(b, c, d) + x, a = (a << s) | (a >> (32 - s)))
+ (a += f(b, c, d) + x, a = rol32(a, s))
#define K1 0
#define K2 013240474631UL
#define K3 015666365641UL
* allocation guarantee. Disallow usage from those contexts.
*/
WARN_ON_ONCE(in_interrupt());
- might_sleep_if(gfp_mask & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(gfp_mask));
preempt_disable();
struct idr_layer *pa[MAX_IDR_LEVEL + 1];
int id;
- might_sleep_if(gfp_mask & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(gfp_mask));
/* sanity checks */
if (WARN_ON_ONCE(start < 0))
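[ These conversions replace open-coded tests of the old __GFP_WAIT bit. A
  sketch of the new helper's semantics, assuming the definition that
  accompanies the __GFP_DIRECT_RECLAIM split in <linux/gfp.h>: ]

	static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
	{
		return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
	}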
if (unlikely(p == task->group_leader))
continue;
- t = p;
- do {
+ for_each_thread(p, t) {
if (unlikely(t->mm == mm))
goto found;
if (likely(t->mm))
* forked before exiting.
*/
smp_rmb();
- } while_each_thread(p, t);
+ }
}
ret = true;
found:
}
EXPORT_SYMBOL(kvasprintf);
+/*
+ * If fmt contains no % (or is exactly %s), use kstrdup_const. If fmt
+ * (or the sole vararg) points to rodata, we will then save a memory
+ * allocation and string copy. In any case, the return value should be
+ * freed using kfree_const().
+ */
+const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap)
+{
+ if (!strchr(fmt, '%'))
+ return kstrdup_const(fmt, gfp);
+ if (!strcmp(fmt, "%s"))
+ return kstrdup_const(va_arg(ap, const char*), gfp);
+ return kvasprintf(gfp, fmt, ap);
+}
+EXPORT_SYMBOL(kvasprintf_const);
+
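[ A usage sketch (hypothetical caller; kobject_set_name_vargs() below is
  the real in-tree user). The one contract to remember is that the result
  must be released with kfree_const(), never plain kfree(): ]

	static const char *make_label(gfp_t gfp, const char *fmt, ...)
	{
		va_list ap;
		const char *label;

		va_start(ap, fmt);
		label = kvasprintf_const(gfp, fmt, ap);
		va_end(ap);

		return label;	/* caller frees with kfree_const() */
	}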
char *kasprintf(gfp_t gfp, const char *fmt, ...)
{
va_list ap;
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
va_list vargs)
{
- char *s;
+ const char *s;
if (kobj->name && !fmt)
return 0;
- s = kvasprintf(GFP_KERNEL, fmt, vargs);
+ s = kvasprintf_const(GFP_KERNEL, fmt, vargs);
if (!s)
return -ENOMEM;
- /* ewww... some of these buggers have '/' in the name ... */
- strreplace(s, '/', '!');
- kfree(kobj->name);
+ /*
+ * ewww... some of these buggers have '/' in the name ... If
+ * that's the case, we need to make sure we have an actual
+ * allocated copy to modify, since kvasprintf_const may have
+ * returned something from .rodata.
+ */
+ if (strchr(s, '/')) {
+ char *t;
+
+ t = kstrdup(s, GFP_KERNEL);
+ kfree_const(s);
+ if (!t)
+ return -ENOMEM;
+ strreplace(t, '/', '!');
+ s = t;
+ }
+ kfree_const(kobj->name);
kobj->name = s;
return 0;
envp[0] = devpath_string;
envp[1] = NULL;
- name = dup_name = kstrdup(new_name, GFP_KERNEL);
+ name = dup_name = kstrdup_const(new_name, GFP_KERNEL);
if (!name) {
error = -ENOMEM;
goto out;
kobject_uevent_env(kobj, KOBJ_MOVE, envp);
out:
- kfree(dup_name);
+ kfree_const(dup_name);
kfree(devpath_string);
kfree(devpath);
kobject_put(kobj);
/* free name if we allocated it */
if (name) {
pr_debug("kobject: '%s': free name\n", name);
- kfree(name);
+ kfree_const(name);
}
}
{
struct llist_node *entry, *old_entry, *next;
- entry = head->first;
+ entry = smp_load_acquire(&head->first);
for (;;) {
if (entry == NULL)
return NULL;
old_entry = entry;
- next = entry->next;
+ next = READ_ONCE(entry->next);
entry = cmpxchg(&head->first, old_entry, next);
if (entry == old_entry)
break;
* TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, of course).
*
* @gfp indicates whether or not to wait until a free id is available (it's not
- * used for internal memory allocations); thus if passed __GFP_WAIT we may sleep
+ * used for internal memory allocations); thus if passed __GFP_RECLAIM we may sleep
* however long it takes until another thread frees an id (same semantics as a
* mempool).
*
* preloading in the interrupt anyway as all the allocations have to
* be atomic. So just do normal allocation when in interrupt.
*/
- if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) {
+ if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
struct radix_tree_preload *rtp;
/*
* with preemption not disabled.
*
* To make use of this facility, the radix tree must be initialised without
- * __GFP_WAIT being passed to INIT_RADIX_TREE().
+ * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
*/
static int __radix_tree_preload(gfp_t gfp_mask)
{
* with preemption not disabled.
*
* To make use of this facility, the radix tree must be initialised without
- * __GFP_WAIT being passed to INIT_RADIX_TREE().
+ * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
*/
int radix_tree_preload(gfp_t gfp_mask)
{
/* Warn on non-sensical use... */
- WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT));
+ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
return __radix_tree_preload(gfp_mask);
}
EXPORT_SYMBOL(radix_tree_preload);
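[ For reference, a sketch of the canonical preload pattern these comments
  describe; the insert_item() wrapper and mylock are hypothetical. ]

	static DEFINE_SPINLOCK(mylock);	/* whatever lock protects the tree */

	static int insert_item(struct radix_tree_root *root,
			       unsigned long index, void *item)
	{
		int err;

		err = radix_tree_preload(GFP_KERNEL);	/* may sleep */
		if (err)
			return err;

		spin_lock(&mylock);
		err = radix_tree_insert(root, index, item);
		spin_unlock(&mylock);

		radix_tree_preload_end();	/* re-enables preemption */
		return err;
	}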
*/
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
- if (gfp_mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(gfp_mask))
return __radix_tree_preload(gfp_mask);
/* Preloading doesn't help anything with this gfp mask, skip it */
preempt_disable();
kfree(out_test);
}
+#define string_get_size_maxbuf 16
+#define test_string_get_size_one(size, blk_size, units, exp_result) \
+ do { \
+ BUILD_BUG_ON(sizeof(exp_result) >= string_get_size_maxbuf); \
+ __test_string_get_size((size), (blk_size), (units), \
+ (exp_result)); \
+ } while (0)
+
+
+static __init void __test_string_get_size(const u64 size, const u64 blk_size,
+ const enum string_size_units units,
+ const char *exp_result)
+{
+ char buf[string_get_size_maxbuf];
+
+ string_get_size(size, blk_size, units, buf, sizeof(buf));
+ if (!memcmp(buf, exp_result, strlen(exp_result) + 1))
+ return;
+
+ buf[sizeof(buf) - 1] = '\0';
+ pr_warn("Test 'test_string_get_size_one' failed!\n");
+	pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %d)\n",
+ size, blk_size, units);
+ pr_warn("expected: '%s', got '%s'\n", exp_result, buf);
+}
+
+static __init void test_string_get_size(void)
+{
+ test_string_get_size_one(16384, 512, STRING_UNITS_2, "8.00 MiB");
+ test_string_get_size_one(8192, 4096, STRING_UNITS_10, "32.7 MB");
+ test_string_get_size_one(1, 512, STRING_UNITS_10, "512 B");
+}
+
static int __init test_string_helpers_init(void)
{
unsigned int i;
for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++)
test_string_escape("escape 1", escape1, i, TEST_STRING_2_DICT_1);
+ /* Test string_get_size() */
+ test_string_get_size();
+
return -EINVAL;
}
module_init(test_string_helpers_init);
kfree(ptr2);
}
+static noinline void __init kmalloc_oob_memset_2(void)
+{
+ char *ptr;
+ size_t size = 8;
+
+ pr_info("out-of-bounds in memset2\n");
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ memset(ptr+7, 0, 2);
+ kfree(ptr);
+}
+
+static noinline void __init kmalloc_oob_memset_4(void)
+{
+ char *ptr;
+ size_t size = 8;
+
+ pr_info("out-of-bounds in memset4\n");
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ memset(ptr+5, 0, 4);
+ kfree(ptr);
+}
+
+
+static noinline void __init kmalloc_oob_memset_8(void)
+{
+ char *ptr;
+ size_t size = 8;
+
+ pr_info("out-of-bounds in memset8\n");
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ memset(ptr+1, 0, 8);
+ kfree(ptr);
+}
+
+static noinline void __init kmalloc_oob_memset_16(void)
+{
+ char *ptr;
+ size_t size = 16;
+
+ pr_info("out-of-bounds in memset16\n");
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ memset(ptr+1, 0, 16);
+ kfree(ptr);
+}
+
static noinline void __init kmalloc_oob_in_memset(void)
{
char *ptr;
kmalloc_oob_krealloc_less();
kmalloc_oob_16();
kmalloc_oob_in_memset();
+ kmalloc_oob_memset_2();
+ kmalloc_oob_memset_4();
+ kmalloc_oob_memset_8();
+ kmalloc_oob_memset_16();
kmalloc_uaf();
kmalloc_uaf_memset();
kmalloc_uaf2();
--- /dev/null
+/*
+ * Test cases for printf facility.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <linux/socket.h>
+#include <linux/in.h>
+
+#define BUF_SIZE 256
+#define FILL_CHAR '$'
+
+#define PTR1 ((void*)0x01234567)
+#define PTR2 ((void*)(long)(int)0xfedcba98)
+
+#if BITS_PER_LONG == 64
+#define PTR1_ZEROES "000000000"
+#define PTR1_SPACES " "
+#define PTR1_STR "1234567"
+#define PTR2_STR "fffffffffedcba98"
+#define PTR_WIDTH 16
+#else
+#define PTR1_ZEROES "0"
+#define PTR1_SPACES " "
+#define PTR1_STR "1234567"
+#define PTR2_STR "fedcba98"
+#define PTR_WIDTH 8
+#endif
+#define PTR_WIDTH_STR stringify(PTR_WIDTH)
+
+static unsigned total_tests __initdata;
+static unsigned failed_tests __initdata;
+static char *test_buffer __initdata;
+
+static int __printf(4, 0) __init
+do_test(int bufsize, const char *expect, int elen,
+ const char *fmt, va_list ap)
+{
+ va_list aq;
+ int ret, written;
+
+ total_tests++;
+
+ memset(test_buffer, FILL_CHAR, BUF_SIZE);
+ va_copy(aq, ap);
+ ret = vsnprintf(test_buffer, bufsize, fmt, aq);
+ va_end(aq);
+
+ if (ret != elen) {
+ pr_warn("vsnprintf(buf, %d, \"%s\", ...) returned %d, expected %d\n",
+ bufsize, fmt, ret, elen);
+ return 1;
+ }
+
+ if (!bufsize) {
+ if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE)) {
+ pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n",
+ fmt);
+ return 1;
+ }
+ return 0;
+ }
+
+ written = min(bufsize-1, elen);
+ if (test_buffer[written]) {
+ pr_warn("vsnprintf(buf, %d, \"%s\", ...) did not nul-terminate buffer\n",
+ bufsize, fmt);
+ return 1;
+ }
+
+ if (memcmp(test_buffer, expect, written)) {
+ pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n",
+ bufsize, fmt, test_buffer, written, expect);
+ return 1;
+ }
+ return 0;
+}
+
+static void __printf(3, 4) __init
+__test(const char *expect, int elen, const char *fmt, ...)
+{
+ va_list ap;
+ int rand;
+ char *p;
+
+ BUG_ON(elen >= BUF_SIZE);
+
+ va_start(ap, fmt);
+
+ /*
+ * Every fmt+args is subjected to four tests: Three where we
+ * tell vsnprintf varying buffer sizes (plenty, not quite
+ * enough and 0), and then we also test that kvasprintf would
+ * be able to print it as expected.
+ */
+ failed_tests += do_test(BUF_SIZE, expect, elen, fmt, ap);
+ rand = 1 + prandom_u32_max(elen+1);
+ /* Since elen < BUF_SIZE, we have 1 <= rand <= BUF_SIZE. */
+ failed_tests += do_test(rand, expect, elen, fmt, ap);
+ failed_tests += do_test(0, expect, elen, fmt, ap);
+
+ p = kvasprintf(GFP_KERNEL, fmt, ap);
+ if (p) {
+ if (memcmp(p, expect, elen+1)) {
+ pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n",
+ fmt, p, expect);
+ failed_tests++;
+ }
+ kfree(p);
+ }
+ va_end(ap);
+}
+
+#define test(expect, fmt, ...) \
+ __test(expect, strlen(expect), fmt, ##__VA_ARGS__)
+
+static void __init
+test_basic(void)
+{
+ /* Work around annoying "warning: zero-length gnu_printf format string". */
+ char nul = '\0';
+
+ test("", &nul);
+ test("100%", "100%%");
+ test("xxx%yyy", "xxx%cyyy", '%');
+ __test("xxx\0yyy", 7, "xxx%cyyy", '\0');
+}
+
+static void __init
+test_number(void)
+{
+ test("0x1234abcd ", "%#-12x", 0x1234abcd);
+ test(" 0x1234abcd", "%#12x", 0x1234abcd);
+ test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234);
+}
+
+static void __init
+test_string(void)
+{
+ test("", "%s%.0s", "", "123");
+ test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456");
+ test("1 | 2|3 | 4|5 ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5");
+ /*
+ * POSIX and C99 say that a missing precision should be
+ * treated as a precision of 0. However, the kernel's printf
+ * implementation treats this case as if the . wasn't
+ * present. Let's add a test case documenting the current
+ * behaviour; should anyone ever feel the need to follow the
+ * standards more closely, this can be revisited.
+ */
+ test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c");
+ test("a | | ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c");
+}
+
+static void __init
+plain(void)
+{
+ test(PTR1_ZEROES PTR1_STR " " PTR2_STR, "%p %p", PTR1, PTR2);
+ /*
+ * The field width is overloaded for some %p extensions to
+ * pass another piece of information. For plain pointers, the
+	 * behaviour is slightly odd: one can pass neither the 0
+	 * flag nor a precision to %p without gcc complaining, and if
+ * one explicitly gives a field width, the number is no longer
+ * zero-padded.
+ */
+ test("|" PTR1_STR PTR1_SPACES " | " PTR1_SPACES PTR1_STR "|",
+ "|%-*p|%*p|", PTR_WIDTH+2, PTR1, PTR_WIDTH+2, PTR1);
+ test("|" PTR2_STR " | " PTR2_STR "|",
+ "|%-*p|%*p|", PTR_WIDTH+2, PTR2, PTR_WIDTH+2, PTR2);
+
+ /*
+ * Unrecognized %p extensions are treated as plain %p, but the
+ * alphanumeric suffix is ignored (that is, does not occur in
+ * the output.)
+ */
+ test("|"PTR1_ZEROES PTR1_STR"|", "|%p0y|", PTR1);
+ test("|"PTR2_STR"|", "|%p0y|", PTR2);
+}
+
+static void __init
+symbol_ptr(void)
+{
+}
+
+static void __init
+kernel_ptr(void)
+{
+}
+
+static void __init
+struct_resource(void)
+{
+}
+
+static void __init
+addr(void)
+{
+}
+
+static void __init
+escaped_str(void)
+{
+}
+
+static void __init
+hex_string(void)
+{
+ const char buf[3] = {0xc0, 0xff, 0xee};
+
+ test("c0 ff ee|c0:ff:ee|c0-ff-ee|c0ffee",
+ "%3ph|%3phC|%3phD|%3phN", buf, buf, buf, buf);
+ test("c0 ff ee|c0:ff:ee|c0-ff-ee|c0ffee",
+ "%*ph|%*phC|%*phD|%*phN", 3, buf, 3, buf, 3, buf, 3, buf);
+}
+
+static void __init
+mac(void)
+{
+ const u8 addr[6] = {0x2d, 0x48, 0xd6, 0xfc, 0x7a, 0x05};
+
+ test("2d:48:d6:fc:7a:05", "%pM", addr);
+ test("05:7a:fc:d6:48:2d", "%pMR", addr);
+ test("2d-48-d6-fc-7a-05", "%pMF", addr);
+ test("2d48d6fc7a05", "%pm", addr);
+ test("057afcd6482d", "%pmR", addr);
+}
+
+static void __init
+ip4(void)
+{
+ struct sockaddr_in sa;
+
+ sa.sin_family = AF_INET;
+ sa.sin_port = cpu_to_be16(12345);
+ sa.sin_addr.s_addr = cpu_to_be32(0x7f000001);
+
+ test("127.000.000.001|127.0.0.1", "%pi4|%pI4", &sa.sin_addr, &sa.sin_addr);
+ test("127.000.000.001|127.0.0.1", "%piS|%pIS", &sa, &sa);
+ sa.sin_addr.s_addr = cpu_to_be32(0x01020304);
+ test("001.002.003.004:12345|1.2.3.4:12345", "%piSp|%pISp", &sa, &sa);
+}
+
+static void __init
+ip6(void)
+{
+}
+
+static void __init
+ip(void)
+{
+ ip4();
+ ip6();
+}
+
+static void __init
+uuid(void)
+{
+ const char uuid[16] = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+ 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf};
+
+ test("00010203-0405-0607-0809-0a0b0c0d0e0f", "%pUb", uuid);
+ test("00010203-0405-0607-0809-0A0B0C0D0E0F", "%pUB", uuid);
+ test("03020100-0504-0706-0809-0a0b0c0d0e0f", "%pUl", uuid);
+ test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid);
+}
+
+static void __init
+dentry(void)
+{
+}
+
+static void __init
+struct_va_format(void)
+{
+}
+
+static void __init
+struct_clk(void)
+{
+}
+
+static void __init
+bitmap(void)
+{
+ DECLARE_BITMAP(bits, 20);
+ const int primes[] = {2,3,5,7,11,13,17,19};
+ int i;
+
+ bitmap_zero(bits, 20);
+ test("00000|00000", "%20pb|%*pb", bits, 20, bits);
+ test("|", "%20pbl|%*pbl", bits, 20, bits);
+
+ for (i = 0; i < ARRAY_SIZE(primes); ++i)
+ set_bit(primes[i], bits);
+ test("a28ac|a28ac", "%20pb|%*pb", bits, 20, bits);
+ test("2-3,5,7,11,13,17,19|2-3,5,7,11,13,17,19", "%20pbl|%*pbl", bits, 20, bits);
+
+ bitmap_fill(bits, 20);
+ test("fffff|fffff", "%20pb|%*pb", bits, 20, bits);
+ test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits);
+}
+
+static void __init
+netdev_features(void)
+{
+}
+
+static void __init
+test_pointer(void)
+{
+ plain();
+ symbol_ptr();
+ kernel_ptr();
+ struct_resource();
+ addr();
+ escaped_str();
+ hex_string();
+ mac();
+ ip();
+ uuid();
+ dentry();
+ struct_va_format();
+ struct_clk();
+ bitmap();
+ netdev_features();
+}
+
+static int __init
+test_printf_init(void)
+{
+ test_buffer = kmalloc(BUF_SIZE, GFP_KERNEL);
+ if (!test_buffer)
+ return -ENOMEM;
+
+ test_basic();
+ test_number();
+ test_string();
+ test_pointer();
+
+ kfree(test_buffer);
+
+ if (failed_tests == 0)
+ pr_info("all %u tests passed\n", total_tests);
+ else
+ pr_warn("failed %u out of %u tests\n", failed_tests, total_tests);
+
+ return failed_tests ? -EINVAL : 0;
+}
+
+module_init(test_printf_init);
+
+MODULE_AUTHOR("Rasmus Villemoes <linux@rasmusvillemoes.dk>");
+MODULE_LICENSE("GPL");
* (legacy clock framework) of the clock
* - 'Cr' For a clock, it prints the current rate of the clock
*
+ * ** Please also update Documentation/printk-formats.txt when making changes **
+ *
* Note: The difference between 'S' and 'F' is that on ia64 and ppc64
* function pointers are really function descriptors, which contain a
* pointer to the real address.
char *pointer(const char *fmt, char *buf, char *end, void *ptr,
struct printf_spec spec)
{
- int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0);
+ const int default_width = 2 * sizeof(void *);
if (!ptr && *fmt != 'K') {
/*
case 'n':
/*
- * Since %n poses a greater security risk than utility, treat
- * it as an invalid format specifier. Warn about its use so
- * that new instances don't get added.
+ * Since %n poses a greater security risk than
+ * utility, treat it as any other invalid or
+ * unsupported format specifier.
*/
- WARN_ONCE(1, "Please remove ignored %%n in '%s'\n", fmt);
/* Fall-through */
default:
+ WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt);
spec->type = FORMAT_TYPE_INVALID;
return fmt - start;
}
* @fmt: The format string to use
* @args: Arguments for the format string
*
- * This function follows C99 vsnprintf, but has some extensions:
- * %pS output the name of a text symbol with offset
- * %ps output the name of a text symbol without offset
- * %pF output the name of a function pointer with its offset
- * %pf output the name of a function pointer without its offset
- * %pB output the name of a backtrace symbol with its offset
- * %pR output the address range in a struct resource with decoded flags
- * %pr output the address range in a struct resource with raw flags
- * %pb output the bitmap with field width as the number of bits
- * %pbl output the bitmap as range list with field width as the number of bits
- * %pM output a 6-byte MAC address with colons
- * %pMR output a 6-byte MAC address with colons in reversed order
- * %pMF output a 6-byte MAC address with dashes
- * %pm output a 6-byte MAC address without colons
- * %pmR output a 6-byte MAC address without colons in reversed order
- * %pI4 print an IPv4 address without leading zeros
- * %pi4 print an IPv4 address with leading zeros
- * %pI6 print an IPv6 address with colons
- * %pi6 print an IPv6 address without colons
- * %pI6c print an IPv6 address as specified by RFC 5952
- * %pIS depending on sa_family of 'struct sockaddr *' print IPv4/IPv6 address
- * %piS depending on sa_family of 'struct sockaddr *' print IPv4/IPv6 address
- * %pU[bBlL] print a UUID/GUID in big or little endian using lower or upper
- * case.
- * %*pE[achnops] print an escaped buffer
- * %*ph[CDN] a variable-length hex string with a separator (supports up to 64
- * bytes of the input)
- * %pC output the name (Common Clock Framework) or address (legacy clock
- * framework) of a clock
- * %pCn output the name (Common Clock Framework) or address (legacy clock
- * framework) of a clock
- * %pCr output the current rate of a clock
- * %n is ignored
+ * This function generally follows C99 vsnprintf, but has some
+ * extensions and a few limitations:
+ *
+ * %n is unsupported
+ * %p* is handled by pointer()
*
- * ** Please update Documentation/printk-formats.txt when making changes **
+ * See pointer() or Documentation/printk-formats.txt for a more
+ * extensive description.
+ *
+ * ** Please update the documentation in both places when making changes **
*
* The return value is the number of characters which would
* be generated for the given input, excluding the trailing
break;
case FORMAT_TYPE_INVALID:
- if (str < end)
- *str = '%';
- ++str;
- break;
+ /*
+ * Presumably the arguments passed gcc's type
+ * checking, but there is no safe or sane way
+ * for us to continue parsing the format and
+ * fetching from the va_list; the remaining
+ * specifiers and arguments would be out of
+ * sync.
+ */
+ goto out;
default:
switch (spec.type) {
}
}
+out:
if (size > 0) {
if (str < end)
*str = '\0';
switch (spec.type) {
case FORMAT_TYPE_NONE:
- case FORMAT_TYPE_INVALID:
case FORMAT_TYPE_PERCENT_CHAR:
break;
+ case FORMAT_TYPE_INVALID:
+ goto out;
case FORMAT_TYPE_WIDTH:
case FORMAT_TYPE_PRECISION:
}
}
+out:
return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf;
#undef save_arg
}
char *str, *end;
const char *args = (const char *)bin_buf;
- if (WARN_ON_ONCE((int) size < 0))
+ if (WARN_ON_ONCE(size > INT_MAX))
return 0;
str = buf;
break;
case FORMAT_TYPE_PERCENT_CHAR:
- case FORMAT_TYPE_INVALID:
if (str < end)
*str = '%';
++str;
break;
+ case FORMAT_TYPE_INVALID:
+ goto out;
+
default: {
unsigned long long num;
} /* switch(spec.type) */
} /* while(*fmt) */
+out:
if (size > 0) {
if (str < end)
*str = '\0';
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
-#
-# If we have space for more page flags then we can enable additional
-# optimizations and functionality.
-#
-# Regular Sparsemem takes page flag bits for the sectionid if it does not
-# use a virtual memmap. Disable extended page flags for 32 bit platforms
-# that require the use of a sectionid in the page flags.
-#
-config PAGEFLAGS_EXTENDED
- def_bool y
- depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
-
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
{
struct bdi_writeback *wb;
- might_sleep_if(gfp & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(gfp));
if (!memcg_css->parent)
return &bdi->wb;
struct balloon_dev_info *balloon = balloon_page_device(page);
int rc = -EAGAIN;
- /*
- * Block others from accessing the 'newpage' when we get around to
- * establishing additional references. We should be the only one
- * holding a reference to the 'newpage' at this point.
- */
- BUG_ON(!trylock_page(newpage));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
if (WARN_ON(!__is_movable_balloon_page(page))) {
dump_page(page, "not movable balloon page");
- unlock_page(newpage);
return rc;
}
if (balloon && balloon->migratepage)
rc = balloon->migratepage(balloon, newpage, page, mode);
- unlock_page(newpage);
return rc;
}
#endif /* CONFIG_BALLOON_COMPACTION */
*/
struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
{
- unsigned long mask, offset, pfn, start = 0;
+ unsigned long mask, offset;
+ unsigned long pfn = -1;
+ unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
struct page *page = NULL;
int ret;
start = bitmap_no + mask + 1;
}
- trace_cma_alloc(page ? pfn : -1UL, page, count, align);
+ trace_cma_alloc(pfn, page, count, align);
pr_debug("%s(): returned %p\n", __func__, page);
return page;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
-#ifdef CONFIG_TRACEPOINTS
-static const char *const compaction_status_string[] = {
- "deferred",
- "skipped",
- "continue",
- "partial",
- "complete",
- "no_suitable_page",
- "not_suitable_zone",
-};
-#endif
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
+/*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+static inline bool is_via_compact_memory(int order)
+{
+ return order == -1;
+}
+
static int __compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
{
unsigned long watermark;
if (cc->contended || fatal_signal_pending(current))
- return COMPACT_PARTIAL;
+ return COMPACT_CONTENDED;
/* Compaction run completes if the migrate and free scanner meet */
if (compact_scanners_met(cc)) {
return COMPACT_COMPLETE;
}
- /*
- * order == -1 is expected when compacting via
- * /proc/sys/vm/compact_memory
- */
- if (cc->order == -1)
+ if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
/* Compaction run is not finished if the watermark is not met */
int fragindex;
unsigned long watermark;
- /*
- * order == -1 is expected when compacting via
- * /proc/sys/vm/compact_memory
- */
- if (order == -1)
+ if (is_via_compact_memory(order))
return COMPACT_CONTINUE;
watermark = low_wmark_pages(zone);
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
- ret = COMPACT_PARTIAL;
+ ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
goto out;
* and we want compact_finished() to detect it
*/
if (err == -ENOMEM && !compact_scanners_met(cc)) {
- ret = COMPACT_PARTIAL;
+ ret = COMPACT_CONTENDED;
goto out;
}
}
trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync, ret);
+ if (ret == COMPACT_CONTENDED)
+ ret = COMPACT_PARTIAL;
+
return ret;
}
* this makes sure we compact the whole zone regardless of
* cached scanner positions.
*/
- if (cc->order == -1)
+ if (is_via_compact_memory(cc->order))
__reset_isolation_suitable(zone);
- if (cc->order == -1 || !compaction_deferred(zone, cc->order))
+ if (is_via_compact_memory(cc->order) ||
+ !compaction_deferred(zone, cc->order))
compact_zone(zone, cc);
if (cc->order > 0) {
{1UL << PG_private, "private" },
{1UL << PG_private_2, "private_2" },
{1UL << PG_writeback, "writeback" },
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
{1UL << PG_head, "head" },
- {1UL << PG_tail, "tail" },
-#else
- {1UL << PG_compound, "compound" },
-#endif
{1UL << PG_swapcache, "swapcache" },
{1UL << PG_mappedtodisk, "mappedtodisk" },
{1UL << PG_reclaim, "reclaim" },
#ifdef CONFIG_MEMORY_FAILURE
{1UL << PG_hwpoison, "hwpoison" },
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- {1UL << PG_compound_lock, "compound_lock" },
-#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
{1UL << PG_young, "young" },
{1UL << PG_idle, "idle" },
void dump_page_badflags(struct page *page, const char *reason,
unsigned long badflags)
{
- pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+ pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
+ if (PageCompound(page))
+ pr_cont(" compound_mapcount: %d", compound_mapcount(page));
+ pr_cont("\n");
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
if (reason)
{VM_GROWSDOWN, "growsdown" },
{VM_PFNMAP, "pfnmap" },
{VM_DENYWRITE, "denywrite" },
+ {VM_LOCKONFAULT, "lockonfault" },
{VM_LOCKED, "locked" },
{VM_IO, "io" },
{VM_SEQ_READ, "seqread" },
size_t offset;
void *retval;
- might_sleep_if(mem_flags & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(mem_flags));
spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry(page, &pool->page_list, page_list) {
/*
* Mappings have to be page-aligned
*/
- offset = phys_addr & ~PAGE_MASK;
+ offset = offset_in_page(phys_addr);
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr + 1) - phys_addr;
if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
return;
- offset = virt_addr & ~PAGE_MASK;
+ offset = offset_in_page(virt_addr);
nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
char *p;
while (size) {
- slop = src & ~PAGE_MASK;
+ slop = offset_in_page(src);
clen = size;
if (clen > MAX_MAP_CHUNK - slop)
clen = MAX_MAP_CHUNK - slop;
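[ The hunks above all open-code the same masking; offset_in_page() names
  it. Its definition in <linux/mm.h> amounts to: ]

	#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)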
static struct {
struct fault_attr attr;
- bool ignore_gfp_wait;
+ bool ignore_gfp_reclaim;
bool cache_filter;
} failslab = {
.attr = FAULT_ATTR_INITIALIZER,
- .ignore_gfp_wait = true,
+ .ignore_gfp_reclaim = true,
.cache_filter = false,
};
if (gfpflags & __GFP_NOFAIL)
return false;
- if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
+ if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
return false;
if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
return PTR_ERR(dir);
if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &failslab.ignore_gfp_wait))
+ &failslab.ignore_gfp_reclaim))
goto fail;
if (!debugfs_create_bool("cache-filter", mode, dir,
&failslab.cache_filter))
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
- BUG_ON(page_mapped(page));
+ VM_BUG_ON_PAGE(page_mapped(page), page);
/*
* At this point page must be either written or cleaned by truncate.
}
EXPORT_SYMBOL(filemap_flush);
-/**
- * filemap_fdatawait_range - wait for writeback to complete
- * @mapping: address space structure to wait for
- * @start_byte: offset in bytes where the range starts
- * @end_byte: offset in bytes where the range ends (inclusive)
- *
- * Walk the list of under-writeback pages of the given address space
- * in the given range and wait for all of them.
- */
-int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
- loff_t end_byte)
+static int __filemap_fdatawait_range(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
{
pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
struct pagevec pvec;
int nr_pages;
- int ret2, ret = 0;
+ int ret = 0;
if (end_byte < start_byte)
goto out;
cond_resched();
}
out:
+ return ret;
+}
+
+/**
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them. Check the error status
+ * of the address space and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ */
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+ loff_t end_byte)
+{
+ int ret, ret2;
+
+ ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
ret2 = filemap_check_errors(mapping);
if (!ret)
ret = ret2;
}
EXPORT_SYMBOL(filemap_fdatawait_range);
+/**
+ * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
+ * @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them. Unlike filemap_fdatawait(), this function
+ * does not clear the error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves. Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8).
+ */
+void filemap_fdatawait_keep_errors(struct address_space *mapping)
+{
+ loff_t i_size = i_size_read(mapping->host);
+
+ if (i_size == 0)
+ return;
+
+ __filemap_fdatawait_range(mapping, 0, i_size - 1);
+}
+
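[ A sketch of the intended kind of caller (hypothetical; the locking around
  the per-sb inode list is elided): a sync-style flusher waits for
  writeback everywhere but leaves each mapping's error flags intact for the
  file owners' fsync() to observe and report. ]

	static void wait_all_mappings(struct super_block *sb)
	{
		struct inode *inode;

		list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
			filemap_fdatawait_keep_errors(inode->i_mapping);
	}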
/**
* filemap_fdatawait - wait for all under-writeback pages to complete
* @mapping: address space structure to wait for
*
* Walk the list of under-writeback pages of the given address space
- * and wait for all of them.
+ * and wait for all of them. Check the error status of the address
+ * space and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
*/
int filemap_fdatawait(struct address_space *mapping)
{
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
mem_cgroup_end_page_stat(memcg);
- mem_cgroup_migrate(old, new, true);
+ mem_cgroup_replace_page(old, new);
radix_tree_preload_end();
if (freepage)
freepage(old);
if (!huge) {
error = mem_cgroup_try_charge(page, current->mm,
- gfp_mask, &memcg);
+ gfp_mask, &memcg, false);
if (error)
return error;
}
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error) {
if (!huge)
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
return error;
}
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
- mem_cgroup_commit_charge(page, memcg, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
err_insert:
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
return error;
}
void *shadow = NULL;
int ret;
- __set_page_locked(page);
+ __SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
*/
void unlock_page(struct page *page)
{
+ page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_atomic();
*/
void __lock_page(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
goto out;
}
error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error) {
page_cache_release(page);
if (error == -EEXIST) {
return -ENOMEM;
ret = add_to_page_cache_lru(page, mapping, offset,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
struct file *file,
pgoff_t offset)
{
- unsigned long ra_pages;
struct address_space *mapping = file->f_mapping;
/* If we don't want any read-ahead, don't bother */
/*
* mmap read-around
*/
- ra_pages = max_sane_readahead(ra->ra_pages);
- ra->start = max_t(long, 0, offset - ra_pages / 2);
- ra->size = ra_pages;
- ra->async_size = ra_pages / 4;
+ ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ ra->size = ra->ra_pages;
+ ra->async_size = ra->ra_pages / 4;
ra_submit(ra, mapping, file);
}
* page is known to the local caching routines.
*
* The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
+ * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
*
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
#include <linux/pagemap.h>
#include <linux/sched.h>
-/*
+/**
* get_vaddr_frames() - map virtual addresses to pfns
* @start: starting user address
* @nr_frames: number of pages / pfns from start to map
}
}
+ if (flags & FOLL_SPLIT && PageTransCompound(page)) {
+ int ret;
+ get_page(page);
+ pte_unmap_unlock(ptep, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return ERR_PTR(ret);
+ goto retry;
+ }
+
if (flags & FOLL_GET)
- get_page_foll(page);
+ get_page(page);
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /* Do not mlock pte-mapped THP */
+ if (PageTransCompound(page))
+ goto out;
+
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
}
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
- if (pmd_trans_huge(*pmd)) {
- if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(vma, address, pmd);
- return follow_page_pte(vma, address, pmd, flags);
- }
- ptl = pmd_lock(mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- page = follow_trans_huge_pmd(vma, address,
- pmd, flags);
- spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
- return page;
- }
- } else
+ if (likely(!pmd_trans_huge(*pmd)))
+ return follow_page_pte(vma, address, pmd, flags);
+
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(ptl);
+ return follow_page_pte(vma, address, pmd, flags);
+ }
+ if (flags & FOLL_SPLIT) {
+ int ret;
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ ret = 0;
+ split_huge_pmd(vma, pmd, address);
+ } else {
+ get_page(page);
spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return ret ? ERR_PTR(ret) :
+ follow_page_pte(vma, address, pmd, flags);
}
- return follow_page_pte(vma, address, pmd, flags);
+
+ page = follow_trans_huge_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ *page_mask = HPAGE_PMD_NR - 1;
+ return page;
}
static int get_gate_page(struct mm_struct *mm, unsigned long address,
unsigned int fault_flags = 0;
int ret;
+ /* mlock all present pages, but do not fault in new pages */
+ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+ return -ENOENT;
/* For mm_populate(), just skip the stack guard page. */
if ((*flags & FOLL_POPULATE) &&
(stack_guard_page_start(vma, address) ||
VM_BUG_ON_VMA(end > vma->vm_end, vma);
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
- gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+ if (vma->vm_flags & VM_LOCKONFAULT)
+ gup_flags &= ~FOLL_POPULATE;
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
* *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
* pages containing page tables.
*
- * *) THP splits will broadcast an IPI, this can be achieved by overriding
- * pmdp_splitting_flush.
- *
* *) ptes can be read atomically by the architecture.
*
* *) access_ok is sufficient to validate userspace address ranges.
* for an example see gup_get_pte in arch/x86/mm/gup.c
*/
pte_t pte = READ_ONCE(*ptep);
- struct page *page;
+ struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
+ head = compound_head(page);
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_page(page);
+ put_page(head);
goto pte_unmap;
}
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pmd_write(orig))
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
return 0;
}
- /*
- * Any tail pages need their mapcount reference taken before we
- * return. (This allows the THP code to bump their ref count when
- * they are split into base pages).
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pud_write(orig))
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
struct page **pages, int *nr)
{
int refs;
- struct page *head, *page, *tail;
+ struct page *head, *page;
if (write && !pgd_write(orig))
return 0;
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
+#include <linux/swapops.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+enum scan_result {
+ SCAN_FAIL,
+ SCAN_SUCCEED,
+ SCAN_PMD_NULL,
+ SCAN_EXCEED_NONE_PTE,
+ SCAN_PTE_NON_PRESENT,
+ SCAN_PAGE_RO,
+ SCAN_NO_REFERENCED_PAGE,
+ SCAN_PAGE_NULL,
+ SCAN_SCAN_ABORT,
+ SCAN_PAGE_COUNT,
+ SCAN_PAGE_LRU,
+ SCAN_PAGE_LOCK,
+ SCAN_PAGE_ANON,
+ SCAN_PAGE_COMPOUND,
+ SCAN_ANY_PROCESS,
+ SCAN_VMA_NULL,
+ SCAN_VMA_CHECK,
+ SCAN_ADDRESS_RANGE,
+ SCAN_SWAP_CACHE_PAGE,
+ SCAN_DEL_PAGE_LRU,
+ SCAN_ALLOC_HUGE_PAGE_FAIL,
+ SCAN_CGROUP_CHARGE_FAIL,
+ SCAN_EXCEED_SWAP_PTE
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/huge_memory.h>
+
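The scan_result codes above are exported through the new tracepoints in
trace/events/huge_memory.h, so khugepaged's verdict for each scanned page can
be watched from userspace. A minimal sketch, assuming tracefs is mounted under
/sys/kernel/debug/tracing and the events land in a "huge_memory" group:

    #include <stdio.h>

    int main(void)
    {
            /* Path assumes debugfs-mounted tracefs; adjust if mounted elsewhere. */
            FILE *f = fopen("/sys/kernel/debug/tracing/events/huge_memory/enable", "w");

            if (!f)
                    return 1;
            fputs("1", f);  /* enable all huge_memory events */
            return fclose(f);
    }

After this, the per-page results show up in the trace pipe alongside the raw
scan_result value.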
/*
 * By default transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications without a
 * guaranteed benefit.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_swap __read_mostly = HPAGE_PMD_NR/8;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
static void set_recommended_min_free_kbytes(void)
{
for_each_populated_zone(zone)
nr_zones++;
- /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+ /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
recommended_min = pageblock_nr_pages * nr_zones * 2;
/*
__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
khugepaged_max_ptes_none_store);
+static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+}
+
+static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_swap;
+
+ err = kstrtoul(buf, 10, &max_ptes_swap);
+ if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_swap = max_ptes_swap;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+ __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+ khugepaged_max_ptes_swap_store);
+
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr,
&full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr,
+ &khugepaged_max_ptes_swap_attr.attr,
NULL,
};
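Once wired into khugepaged_attr, the new knob sits alongside max_ptes_none in
sysfs. A minimal tuning sketch, assuming the conventional
/sys/kernel/mm/transparent_hugepage/khugepaged/ location:

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/khugepaged/"
                            "max_ptes_swap", "w");

            if (!f)
                    return 1;
            /* allow khugepaged to swap in up to 64 ptes per collapse */
            fprintf(f, "%u\n", 64u);
            return fclose(f);
    }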
err = register_shrinker(&huge_zero_page_shrinker);
if (err)
goto err_hzp_shrinker;
+ err = register_shrinker(&deferred_split_shrinker);
+ if (err)
+ goto err_split_shrinker;
/*
* By default disable transparent hugepages on smaller systems,
return 0;
err_khugepaged:
+ unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
khugepaged_slab_exit();
return entry;
}
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+ /*
+ * ->lru in the tail pages is occupied by compound_head.
+ * Let's use ->mapping + ->index in the second tail page as list_head.
+ */
+ return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+ /*
+	 * we use page->mapping and page->index in the second tail page
+	 * as list_head: assuming THP order >= 2
+ */
+ BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+ INIT_LIST_HEAD(page_deferred_list(page));
+ set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
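page_deferred_list() only works if a list_head fits into the space of
->mapping plus ->index and those members are adjacent in struct page. A
standalone sketch of that layout assumption, using a simplified stand-in for
struct page rather than the real definition:

    struct list_head { struct list_head *next, *prev; };

    /* Simplified stand-in for the relevant struct page members. */
    struct page_stub {
            void *mapping;
            unsigned long index;
    };

    _Static_assert(sizeof(struct list_head) <= sizeof(struct page_stub),
                   "list_head must fit into ->mapping + ->index");

    int main(void) { return 0; }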
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable)) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
return VM_FAULT_OOM;
}
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_none(*pmd))) {
spin_unlock(ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
} else {
int ret;
spin_unlock(ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
ret = handle_userfault(vma, address, flags,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- page_add_new_anon_rmap(page, vma, haddr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, haddr, true);
+ mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
{
- return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+ return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
}
/* Caller must hold page table lock. */
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
+ prep_transhuge_page(page);
return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
flags);
}
goto out_unlock;
}
- if (unlikely(pmd_trans_splitting(pmd))) {
- /* split huge page running from under us */
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
- pte_free(dst_mm, pgtable);
-
- wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
- goto out;
- }
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
get_page(src_page);
- page_dup_rmap(src_page);
+ page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
pmdp_set_wrprotect(src_mm, addr, src_pmd);
spin_unlock(ptl);
}
-/*
- * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
- * during copy_user_huge_page()'s copy_page_rep(): in the case when
- * the source page gets split and a tail freed before copy completes.
- * Called under pmd_lock of checked pmd, so safe from splitting itself.
- */
-static void get_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- atomic_add(HPAGE_PMD_NR, &page->_count);
- while (++page < endpage)
- get_huge_page_tail(page);
- } else {
- get_page(page);
- }
-}
-
-static void put_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- while (page < endpage)
- put_page(page++);
- } else {
- put_page(page);
- }
-}
-
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
- &memcg))) {
+ &memcg, false))) {
if (pages[i])
put_page(pages[i]);
while (--i >= 0) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg);
+ mem_cgroup_cancel_charge(pages[i], memcg,
+ false);
put_page(pages[i]);
}
kfree(pages);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vma, haddr);
- mem_cgroup_commit_charge(pages[i], memcg, false);
+ page_add_new_anon_rmap(pages[i], vma, haddr, false);
+ mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg);
+ mem_cgroup_cancel_charge(pages[i], memcg, false);
put_page(pages[i]);
}
kfree(pages);
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
- if (page_mapcount(page) == 1) {
+ /*
+	 * We can only reuse the page if nobody else maps the huge page or
+	 * its part. We can do it by checking page_mapcount() on each
+	 * sub-page, but that's expensive.
+	 * The cheaper way is to check that page_count() equals 1: every
+	 * mapcount takes a page reference, so this way we can guarantee
+	 * that the PMD is the only mapping.
+	 * This can give a false negative if somebody pinned the page, but
+	 * that's fine.
+ */
+ if (page_mapcount(page) == 1 && page_count(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
- get_user_huge_page(page);
+ get_page(page);
spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
} else
new_page = NULL;
- if (unlikely(!new_page)) {
+ if (likely(new_page)) {
+ prep_transhuge_page(new_page);
+ } else {
if (!page) {
- split_huge_page_pmd(vma, address, pmd);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
} else {
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
if (ret & VM_FAULT_OOM) {
- split_huge_page(page);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
}
- put_user_huge_page(page);
+ put_page(page);
}
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp,
+ &memcg, true))) {
put_page(new_page);
if (page) {
- split_huge_page(page);
- put_user_huge_page(page);
+ split_huge_pmd(vma, pmd, address);
+ put_page(page);
} else
- split_huge_page_pmd(vma, address, pmd);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
goto out;
spin_lock(ptl);
if (page)
- put_user_huge_page(page);
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(ptl);
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
} else {
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_huge_clear_flush_notify(vma, haddr, pmd);
- page_add_new_anon_rmap(new_page, vma, haddr);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, haddr, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
put_page(page);
}
ret |= VM_FAULT_WRITE;
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
- if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
- if (page->mapping && trylock_page(page)) {
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /*
+ * We don't mlock() pte-mapped THPs. This way we can avoid
+ * leaking mlocked pages into non-VM_LOCKED VMAs.
+ *
+ * In most cases the pmd is the only mapping of the page as we
+ * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+ * writable private mappings in populate_vma_page_range().
+ *
+	 * The only scenario in which we have the page shared here is if we
+	 * are mlocking a read-only mapping shared over fork(). We skip
+	 * mlocking such pages.
+ */
+ if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+ page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
mlock_vma_page(page);
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (flags & FOLL_GET)
- get_page_foll(page);
+ get_page(page);
out:
return page;
return 0;
}
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr)
+{
+ spinlock_t *ptl;
+ struct mm_struct *mm = tlb->mm;
+ int ret = 1;
+
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ struct page *page;
+ pmd_t orig_pmd;
+
+ if (is_huge_zero_pmd(*pmd))
+ goto out;
+
+ orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd);
+
+ /* No hugepage in swapcache */
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+
+ orig_pmd = pmd_mkold(orig_pmd);
+ orig_pmd = pmd_mkclean(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+out:
+ spin_unlock(ptl);
+ ret = 0;
+ }
+
+ return ret;
+}
+
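madvise_free_huge_pmd() is reached from the MADV_FREE path of madvise(2). A
minimal userspace sketch of the triggering call; the fallback MADV_FREE value
below is an assumption for headers that predate the series:

    #include <sys/mman.h>
    #include <stdlib.h>

    #ifndef MADV_FREE
    #define MADV_FREE 8     /* assumed value if not in installed headers */
    #endif

    int main(void)
    {
            size_t len = 2UL << 20;                 /* one PMD-sized region */
            void *buf = aligned_alloc(2UL << 20, len);

            if (!buf)
                    return 1;
            /* Tell the kernel the contents are disposable; reclaim is lazy. */
            return madvise(buf, len, MADV_FREE) ? 1 : 0;
    }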
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
pmd_t orig_pmd;
spinlock_t *ptl;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+ if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
return 0;
/*
* For architectures like ppc64 we look at deposited pgtable
put_huge_zero_page();
} else {
struct page *page = pmd_page(orig_pmd);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON_PAGE(!PageHead(page), page);
return 1;
}
-int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
- int ret = 0;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
(new_addr & ~HPAGE_PMD_MASK) ||
old_end - old_addr < HPAGE_PMD_SIZE ||
(new_vma->vm_flags & VM_NOHUGEPAGE))
- goto out;
+ return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
*/
if (WARN_ON(!pmd_none(*new_pmd))) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
- goto out;
+ return false;
}
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_sem prevents deadlock.
*/
- ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
- if (ret == 1) {
+ if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
+ return true;
}
-out:
- return ret;
+ return false;
}
/*
spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
pmd_t entry;
bool preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
}
/*
- * Returns 1 if a given pmd maps a stable (not under splitting) thp.
- * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+ * Returns true if a given pmd maps a thp, false otherwise.
*
- * Note that if it returns 1, this routine returns without unlocking page
- * table locks. So callers must unlock them.
+ * Note that if it returns true, this routine returns without unlocking page
+ * table lock. So callers must unlock it.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
*ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(*ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- return -1;
- } else {
- /* Thp mapped by 'pmd' is stable, so we can
- * handle it as it is. */
- return 1;
- }
- }
+ if (likely(pmd_trans_huge(*pmd)))
+ return true;
spin_unlock(*ptl);
- return 0;
+ return false;
}
/*
pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag,
spinlock_t **ptl)
{
pgd_t *pgd;
goto unlock;
if (pmd_page(*pmd) != page)
goto unlock;
- /*
- * split_vma() may create temporary aliased mappings. There is
- * no risk as long as all huge pmd are found and have their
- * splitting bit set before __split_huge_page_refcount
- * runs. Finding the same huge pmd more than once during the
- * same rmap walk is not a problem.
- */
- if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
- pmd_trans_splitting(*pmd))
- goto unlock;
- if (pmd_trans_huge(*pmd)) {
- VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
- !pmd_trans_splitting(*pmd));
+ if (pmd_trans_huge(*pmd))
return pmd;
- }
unlock:
spin_unlock(*ptl);
return NULL;
}
-static int __split_huge_page_splitting(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pmd_t *pmd;
- int ret = 0;
- /* For mmu_notifiers */
- const unsigned long mmun_start = address;
- const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
-
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
- if (pmd) {
- /*
- * We can't temporarily set the pmd to null in order
- * to split it, the pmd must remain marked huge at all
- * times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->rwsem to
- * serialize against split_huge_page*.
- */
- pmdp_splitting_flush(vma, address, pmd);
-
- ret = 1;
- spin_unlock(ptl);
- }
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-
- return ret;
-}
-
-static void __split_huge_page_refcount(struct page *page,
- struct list_head *list)
-{
- int i;
- struct zone *zone = page_zone(page);
- struct lruvec *lruvec;
- int tail_count = 0;
-
- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
-
- compound_lock(page);
- /* complete memcg works before add pages to LRU */
- mem_cgroup_split_huge_fixup(page);
-
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
- struct page *page_tail = page + i;
-
- /* tail_page->_mapcount cannot change */
- BUG_ON(page_mapcount(page_tail) < 0);
- tail_count += page_mapcount(page_tail);
- /* check for overflow */
- BUG_ON(tail_count < 0);
- BUG_ON(atomic_read(&page_tail->_count) != 0);
- /*
- * tail_page->_count is zero and not changing from
- * under us. But get_page_unless_zero() may be running
- * from under us on the tail_page. If we used
- * atomic_set() below instead of atomic_add(), we
- * would then run atomic_set() concurrently with
- * get_page_unless_zero(), and atomic_set() is
- * implemented in C not using locked ops. spin_unlock
- * on x86 sometime uses locked ops because of PPro
- * errata 66, 92, so unless somebody can guarantee
- * atomic_set() here would be safe on all archs (and
- * not only on x86), it's safer to use atomic_add().
- */
- atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
- &page_tail->_count);
-
- /* after clearing PageTail the gup refcount can be released */
- smp_mb__after_atomic();
-
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- page_tail->flags |= (page->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate) |
- (1L << PG_active) |
- (1L << PG_unevictable)));
- page_tail->flags |= (1L << PG_dirty);
-
- /* clear PageTail before overwriting first_page */
- smp_wmb();
-
- if (page_is_young(page))
- set_page_young(page_tail);
- if (page_is_idle(page))
- set_page_idle(page_tail);
-
- /*
- * __split_huge_page_splitting() already set the
- * splitting bit in all pmd that could map this
- * hugepage, that will ensure no CPU can alter the
- * mapcount on the head page. The mapcount is only
- * accounted in the head page and it has to be
- * transferred to all tail pages in the below code. So
- * for this code to be safe, the split the mapcount
- * can't change. But that doesn't mean userland can't
- * keep changing and reading the page contents while
- * we transfer the mapcount, so the pmd splitting
- * status is achieved setting a reserved bit in the
- * pmd, not by clearing the present bit.
- */
- page_tail->_mapcount = page->_mapcount;
-
- BUG_ON(page_tail->mapping);
- page_tail->mapping = page->mapping;
-
- page_tail->index = page->index + i;
- page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
-
- BUG_ON(!PageAnon(page_tail));
- BUG_ON(!PageUptodate(page_tail));
- BUG_ON(!PageDirty(page_tail));
- BUG_ON(!PageSwapBacked(page_tail));
-
- lru_add_page_tail(page, page_tail, lruvec, list);
- }
- atomic_sub(tail_count, &page->_count);
- BUG_ON(atomic_read(&page->_count) <= 0);
-
- __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-
- ClearPageCompound(page);
- compound_unlock(page);
- spin_unlock_irq(&zone->lru_lock);
-
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- struct page *page_tail = page + i;
- BUG_ON(page_count(page_tail) <= 0);
- /*
- * Tail pages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
- */
- put_page(page_tail);
- }
-
- /*
- * Only the head page (now become a regular page) is required
- * to be pinned by the caller.
- */
- BUG_ON(page_count(page) <= 0);
-}
-
-static int __split_huge_page_map(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pmd_t *pmd, _pmd;
- int ret = 0, i;
- pgtable_t pgtable;
- unsigned long haddr;
-
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
- if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
- pmd_populate(mm, &_pmd, pgtable);
- if (pmd_write(*pmd))
- BUG_ON(page_mapcount(page) != 1);
-
- haddr = address;
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- BUG_ON(PageCompound(page+i));
- /*
- * Note that NUMA hinting access restrictions are not
- * transferred to avoid any possibility of altering
- * permissions across VMAs.
- */
- entry = mk_pte(page + i, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (!pmd_write(*pmd))
- entry = pte_wrprotect(entry);
- if (!pmd_young(*pmd))
- entry = pte_mkold(entry);
- pte = pte_offset_map(&_pmd, haddr);
- BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
- }
-
- smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and
- * userland has the whole access to the hugepage
- * during the split (which happens in place). If we
- * overwrite the pmd with the not-huge version
- * pointing to the pte here (which of course we could
- * if all CPUs were bug free), userland could trigger
- * a small page size TLB miss on the small sized TLB
- * while the hugepage TLB entry is still established
- * in the huge TLB. Some CPU doesn't like that. See
- * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
- * Erratum 383 on page 93. Intel should be safe but is
- * also warns that it's only safe if the permission
- * and cache attributes of the two entries loaded in
- * the two TLB is identical (which should be the case
- * here). But it is generally safer to never allow
- * small and huge TLB entries for the same virtual
- * address to be loaded simultaneously. So instead of
- * doing "pmd_populate(); flush_pmd_tlb_range();" we first
- * mark the current pmd notpresent (atomically because
- * here the pmd_trans_huge and pmd_trans_splitting
- * must remain set at all times on the pmd until the
- * split is complete for this pmd), then we flush the
- * SMP TLB and finally we write the non-huge version
- * of the pmd entry with pmd_populate.
- */
- pmdp_invalidate(vma, address, pmd);
- pmd_populate(mm, pmd, pgtable);
- ret = 1;
- spin_unlock(ptl);
- }
-
- return ret;
-}
-
-/* must be called with anon_vma->root->rwsem held */
-static void __split_huge_page(struct page *page,
- struct anon_vma *anon_vma,
- struct list_head *list)
-{
- int mapcount, mapcount2;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct anon_vma_chain *avc;
-
- BUG_ON(!PageHead(page));
- BUG_ON(PageTail(page));
-
- mapcount = 0;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long addr = vma_address(page, vma);
- BUG_ON(is_vma_temporary_stack(vma));
- mapcount += __split_huge_page_splitting(page, vma, addr);
- }
- /*
- * It is critical that new vmas are added to the tail of the
- * anon_vma list. This guarantes that if copy_huge_pmd() runs
- * and establishes a child pmd before
- * __split_huge_page_splitting() freezes the parent pmd (so if
- * we fail to prevent copy_huge_pmd() from running until the
- * whole __split_huge_page() is complete), we will still see
- * the newly established pmd of the child later during the
- * walk, to be able to set it as pmd_trans_splitting too.
- */
- if (mapcount != page_mapcount(page)) {
- pr_err("mapcount %d page_mapcount %d\n",
- mapcount, page_mapcount(page));
- BUG();
- }
-
- __split_huge_page_refcount(page, list);
-
- mapcount2 = 0;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long addr = vma_address(page, vma);
- BUG_ON(is_vma_temporary_stack(vma));
- mapcount2 += __split_huge_page_map(page, vma, addr);
- }
- if (mapcount != mapcount2) {
- pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
- mapcount, mapcount2, page_mapcount(page));
- BUG();
- }
-}
-
-/*
- * Split a hugepage into normal pages. This doesn't change the position of head
- * page. If @list is null, tail pages will be added to LRU list, otherwise, to
- * @list. Both head page and tail pages will inherit mapping, flags, and so on
- * from the hugepage.
- * Return 0 if the hugepage is split successfully otherwise return 1.
- */
-int split_huge_page_to_list(struct page *page, struct list_head *list)
-{
- struct anon_vma *anon_vma;
- int ret = 1;
-
- BUG_ON(is_huge_zero_page(page));
- BUG_ON(!PageAnon(page));
-
- /*
- * The caller does not necessarily hold an mmap_sem that would prevent
- * the anon_vma disappearing so we first we take a reference to it
- * and then lock the anon_vma for write. This is similar to
- * page_lock_anon_vma_read except the write lock is taken to serialise
- * against parallel split or collapse operations.
- */
- anon_vma = page_get_anon_vma(page);
- if (!anon_vma)
- goto out;
- anon_vma_lock_write(anon_vma);
-
- ret = 0;
- if (!PageCompound(page))
- goto out_unlock;
-
- BUG_ON(!PageSwapBacked(page));
- __split_huge_page(page, anon_vma, list);
- count_vm_event(THP_SPLIT);
-
- BUG_ON(PageCompound(page));
-out_unlock:
- anon_vma_unlock_write(anon_vma);
- put_anon_vma(anon_vma);
-out:
- return ret;
-}
-
#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte)
{
- struct page *page;
+ struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0;
+ int none_or_zero = 0, result = 0;
bool referenced = false, writable = false;
+
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
goto out;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
goto out;
+ }
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
goto out;
+ }
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page))
+ if (!trylock_page(page)) {
+ result = SCAN_PAGE_LOCK;
goto out;
+ }
/*
* cannot use mapcount: can't collapse if there's a gup pin.
*/
if (page_count(page) != 1 + !!PageSwapCache(page)) {
unlock_page(page);
+ result = SCAN_PAGE_COUNT;
goto out;
}
if (pte_write(pteval)) {
} else {
if (PageSwapCache(page) && !reuse_swap_page(page)) {
unlock_page(page);
+ result = SCAN_SWAP_CACHE_PAGE;
goto out;
}
/*
*/
if (isolate_lru_page(page)) {
unlock_page(page);
+ result = SCAN_DEL_PAGE_LRU;
goto out;
}
/* 0 stands for page_is_file_cache(page) == false */
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (likely(referenced && writable))
- return 1;
+ if (likely(writable)) {
+ if (likely(referenced)) {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+ referenced, writable, result);
+ return 1;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
+
out:
release_pte_pages(pte, _pte);
+ trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+ referenced, writable, result);
return 0;
}
* superfluous.
*/
pte_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page);
+ page_remove_rmap(src_page, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
return NULL;
}
+ prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return *hpage;
}
static inline struct page *alloc_hugepage(int defrag)
{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
+ struct page *page;
+
+ page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+ if (page)
+ prep_transhuge_page(page);
+ return page;
}
static struct page *khugepaged_alloc_hugepage(bool *wait)
if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
(vma->vm_flags & VM_NOHUGEPAGE))
return false;
-
if (!vma->anon_vma || vma->vm_ops)
return false;
if (is_vma_temporary_stack(vma))
return true;
}
+/*
+ * Bring missing pages in from swap, to complete THP collapse.
+ * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ *
+ * Called and returns without pte mapped or spinlocks held,
+ * but with mmap_sem held to protect against vma changes.
+ */
+static void __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd)
+{
+ unsigned long _address;
+ pte_t *pte, pteval;
+ int swapped_in = 0, ret = 0;
+
+ pte = pte_offset_map(pmd, address);
+ for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ pte++, _address += PAGE_SIZE) {
+ pteval = *pte;
+ if (!is_swap_pte(pteval))
+ continue;
+ swapped_in++;
+ ret = do_swap_page(mm, vma, _address, pte, pmd,
+ FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+ pteval);
+ if (ret & VM_FAULT_ERROR) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
+ return;
+ }
+ /* pte is unmapped now, we need to map it */
+ pte = pte_offset_map(pmd, _address);
+ }
+ pte--;
+ pte_unmap(pte);
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+}
+
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated;
+ int isolated = 0, result = 0;
unsigned long hstart, hend;
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
/* release the mmap_sem read lock. */
new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
- if (!new_page)
- return;
+ if (!new_page) {
+ result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ goto out_nolock;
+ }
- if (unlikely(mem_cgroup_try_charge(new_page, mm,
- gfp, &memcg)))
- return;
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ result = SCAN_CGROUP_CHARGE_FAIL;
+ goto out_nolock;
+ }
/*
* Prevent all access to pagetables with the exception of
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(khugepaged_test_exit(mm))) {
+ result = SCAN_ANY_PROCESS;
goto out;
+ }
vma = find_vma(mm, address);
- if (!vma)
+ if (!vma) {
+ result = SCAN_VMA_NULL;
goto out;
+ }
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
+ result = SCAN_ADDRESS_RANGE;
goto out;
- if (!hugepage_vma_check(vma))
+ }
+ if (!hugepage_vma_check(vma)) {
+ result = SCAN_VMA_CHECK;
goto out;
+ }
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
goto out;
+ }
+
+ __collapse_huge_page_swapin(mm, vma, address, pmd);
anon_vma_lock_write(vma->anon_vma);
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
+ result = SCAN_FAIL;
goto out;
}
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, address, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
*hpage = NULL;
khugepaged_pages_collapsed++;
+ result = SCAN_SUCCEED;
out_up_write:
up_write(&mm->mmap_sem);
+out_nolock:
+ trace_mm_collapse_huge_page(mm, isolated, result);
return;
-
out:
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
goto out_up_write;
}
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none_or_zero = 0;
- struct page *page;
+ int ret = 0, none_or_zero = 0, result = 0;
+ struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
- int node = NUMA_NO_NODE;
+ int node = NUMA_NO_NODE, unmapped = 0;
bool writable = false, referenced = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
goto out;
+ }
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
+ if (is_swap_pte(pteval)) {
+ if (++unmapped <= khugepaged_max_ptes_swap) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out_unmap;
+ }
+ }
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
goto out_unmap;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
goto out_unmap;
+ }
if (pte_write(pteval))
writable = true;
page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
+ goto out_unmap;
+ }
+
+ /* TODO: teach khugepaged to collapse THP mapped with pte */
+ if (PageCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
goto out_unmap;
+ }
+
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
* hit record.
*/
node = page_to_nid(page);
- if (khugepaged_scan_abort(node))
+ if (khugepaged_scan_abort(node)) {
+ result = SCAN_SCAN_ABORT;
goto out_unmap;
+ }
khugepaged_node_load[node]++;
- VM_BUG_ON_PAGE(PageCompound(page), page);
- if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ if (!PageLRU(page)) {
+			result = SCAN_PAGE_LRU;
+ goto out_unmap;
+ }
+ if (PageLocked(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto out_unmap;
+ }
+ if (!PageAnon(page)) {
+ result = SCAN_PAGE_ANON;
goto out_unmap;
+ }
+
/*
* cannot use mapcount: can't collapse if there's a gup pin.
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page))
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ result = SCAN_PAGE_COUNT;
goto out_unmap;
+ }
if (pte_young(pteval) ||
page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (referenced && writable)
- ret = 1;
+ if (writable) {
+ if (referenced) {
+ result = SCAN_SUCCEED;
+ ret = 1;
+ } else {
+ result = SCAN_NO_REFERENCED_PAGE;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret) {
collapse_huge_page(mm, address, hpage, vma, node);
}
out:
+ trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+ none_or_zero, result, unmapped);
return ret;
}
pmd_t _pmd;
int i;
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
put_huge_zero_page();
}
-void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd)
+static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long haddr, bool freeze)
{
- spinlock_t *ptl;
- struct page *page = NULL;
struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PMD_MASK;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct page *page;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ bool young, write;
+ int i;
- BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+ VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+ VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
+ VM_BUG_ON(!pmd_trans_huge(*pmd));
+
+ count_vm_event(THP_SPLIT_PMD);
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
-again:
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_trans_huge(*pmd)))
- goto unlock;
if (vma_is_dax(vma)) {
pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
if (is_huge_zero_pmd(_pmd))
put_huge_zero_page();
+ return;
} else if (is_huge_zero_pmd(*pmd)) {
- __split_huge_zero_page_pmd(vma, haddr, pmd);
- } else {
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!page_count(page), page);
- get_page(page);
+ return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- unlock:
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- if (!page)
- return;
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+ write = pmd_write(*pmd);
+ young = pmd_young(*pmd);
- split_huge_page(page);
- put_page(page);
+ /* leave pmd empty until pte is filled */
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ /*
+ * Note that NUMA hinting access restrictions are not
+ * transferred to avoid any possibility of altering
+ * permissions across VMAs.
+ */
+ if (freeze) {
+ swp_entry_t swp_entry;
+ swp_entry = make_migration_entry(page + i, write);
+ entry = swp_entry_to_pte(swp_entry);
+ } else {
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (!write)
+ entry = pte_wrprotect(entry);
+ if (!young)
+ entry = pte_mkold(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ atomic_inc(&page[i]._mapcount);
+ pte_unmap(pte);
+ }
/*
- * We don't always have down_write of mmap_sem here: a racing
- * do_huge_pmd_wp_page() might have copied-on-write to another
- * huge page before our split_huge_page() got the anon_vma lock.
+ * Set PG_double_map before dropping compound_mapcount to avoid
+ * false-negative page_mapped().
*/
- if (unlikely(pmd_trans_huge(*pmd)))
- goto again;
+ if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_inc(&page[i]._mapcount);
+ }
+
+ if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+ /* Last compound_mapcount is gone. */
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ if (TestClearPageDoubleMap(page)) {
+ /* No need in mapcount reference anymore */
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_dec(&page[i]._mapcount);
+ }
+ }
+
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
}
-void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd)
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address)
{
- struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
- vma = find_vma(mm, address);
- BUG_ON(vma == NULL);
- split_huge_page_pmd(vma, address, pmd);
+ mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_trans_huge(*pmd)))
+ goto out;
+ page = pmd_page(*pmd);
+ __split_huge_pmd_locked(vma, pmd, haddr, false);
+ if (PageMlocked(page))
+ get_page(page);
+ else
+ page = NULL;
+out:
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ if (page) {
+ lock_page(page);
+ munlock_vma_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
}
-static void split_huge_page_address(struct mm_struct *mm,
+static void split_huge_pmd_address(struct vm_area_struct *vma,
unsigned long address)
{
pgd_t *pgd;
VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
- pgd = pgd_offset(mm, address);
+ pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return;
return;
pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
return;
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_page_pmd_mm(mm, address, pmd);
+ split_huge_pmd(vma, pmd, address);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (start & ~HPAGE_PMD_MASK &&
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_page_address(vma->vm_mm, start);
+ split_huge_pmd_address(vma, start);
/*
* If the new end address isn't hpage aligned and it could
if (end & ~HPAGE_PMD_MASK &&
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_page_address(vma->vm_mm, end);
+ split_huge_pmd_address(vma, end);
/*
* If we're also updating the vma->vm_next->vm_start, if the new
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_page_address(next->vm_mm, nstart);
+ split_huge_pmd_address(next, nstart);
+ }
+}
+
+static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ spinlock_t *ptl;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int i;
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (!pgd_present(*pgd))
+ return;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+ pmd = pmd_offset(pud, address);
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return;
+ }
+ if (pmd_trans_huge(*pmd)) {
+ if (page == pmd_page(*pmd))
+ __split_huge_pmd_locked(vma, pmd, address, true);
+ spin_unlock(ptl);
+ return;
+ }
+ spin_unlock(ptl);
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+ pte_t entry, swp_pte;
+ swp_entry_t swp_entry;
+
+ if (!pte_present(pte[i]))
+ continue;
+ if (page_to_pfn(page) != pte_pfn(pte[i]))
+ continue;
+ flush_cache_page(vma, address, page_to_pfn(page));
+ entry = ptep_clear_flush(vma, address, pte + i);
+ swp_entry = make_migration_entry(page, pte_write(entry));
+ swp_pte = swp_entry_to_pte(swp_entry);
+ if (pte_soft_dirty(entry))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+ }
+ pte_unmap_unlock(pte, ptl);
+}
+
+static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+ struct anon_vma_chain *avc;
+ pgoff_t pgoff = page_to_pgoff(page);
+
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+ pgoff + HPAGE_PMD_NR - 1) {
+ unsigned long haddr;
+
+ haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
+ mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ freeze_page_vma(avc->vma, page, haddr);
+ mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ }
+}
+
+static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ spinlock_t *ptl;
+ pmd_t *pmd;
+ pte_t *pte, entry;
+ swp_entry_t swp_entry;
+ int i;
+
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+ if (!page_mapped(page))
+ continue;
+ if (!is_swap_pte(pte[i]))
+ continue;
+
+ swp_entry = pte_to_swp_entry(pte[i]);
+ if (!is_migration_entry(swp_entry))
+ continue;
+ if (migration_entry_to_page(swp_entry) != page)
+ continue;
+
+ entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+ entry = pte_mkdirty(entry);
+ if (is_write_migration_entry(swp_entry))
+ entry = maybe_mkwrite(entry, vma);
+
+ flush_dcache_page(page);
+ set_pte_at(vma->vm_mm, address, pte + i, entry);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, address, pte + i);
+ }
+ pte_unmap_unlock(pte, ptl);
+}
+
+static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+ struct anon_vma_chain *avc;
+ pgoff_t pgoff = page_to_pgoff(page);
+
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+ pgoff, pgoff + HPAGE_PMD_NR - 1) {
+ unsigned long address = __vma_address(page, avc->vma);
+
+ mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+ address, address + HPAGE_PMD_SIZE);
+ unfreeze_page_vma(avc->vma, page, address);
+ mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+ address, address + HPAGE_PMD_SIZE);
}
}
+
+static int total_mapcount(struct page *page)
+{
+ int i, ret;
+
+ ret = compound_mapcount(page);
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ ret += atomic_read(&page[i]._mapcount) + 1;
+
+ if (PageDoubleMap(page))
+ ret -= HPAGE_PMD_NR;
+
+ return ret;
+}
+
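A standalone recomputation of the bookkeeping in total_mapcount(), assuming a
toy HPAGE_PMD_NR of 4: raw _mapcount values carry a -1 bias, and a DoubleMap
page holds one extra marker increment per subpage that has to be subtracted
back:

    #include <assert.h>
    #include <stdbool.h>

    #define NR_SUBPAGES 4   /* stand-in for HPAGE_PMD_NR */

    int main(void)
    {
            int compound_mapcount = 1;              /* one PMD mapping */
            bool double_map = true;                 /* DoubleMap marker set */
            int raw[NR_SUBPAGES] = { 0, 0, 0, 0 };  /* -1 bias + 1 marker each */
            int total = compound_mapcount;

            for (int i = 0; i < NR_SUBPAGES; i++)
                    total += raw[i] + 1;
            if (double_map)
                    total -= NR_SUBPAGES;           /* cancel the markers */
            assert(total == 1);     /* only the PMD mapping remains */
            return 0;
    }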
+static int __split_huge_page_tail(struct page *head, int tail,
+ struct lruvec *lruvec, struct list_head *list)
+{
+ int mapcount;
+ struct page *page_tail = head + tail;
+
+ mapcount = atomic_read(&page_tail->_mapcount) + 1;
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+
+ /*
+ * tail_page->_count is zero and not changing from under us. But
+ * get_page_unless_zero() may be running from under us on the
+ * tail_page. If we used atomic_set() below instead of atomic_add(), we
+ * would then run atomic_set() concurrently with
+ * get_page_unless_zero(), and atomic_set() is implemented in C not
+	 * using locked ops. spin_unlock on x86 sometimes uses locked ops
+ * because of PPro errata 66, 92, so unless somebody can guarantee
+ * atomic_set() here would be safe on all archs (and not only on x86),
+ * it's safer to use atomic_add().
+ */
+ atomic_add(mapcount + 1, &page_tail->_count);
+
+ /* after clearing PageTail the gup refcount can be released */
+ smp_mb__after_atomic();
+
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page_tail->flags |= (head->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_locked) |
+ (1L << PG_unevictable)));
+ page_tail->flags |= (1L << PG_dirty);
+
+ clear_compound_head(page_tail);
+
+ if (page_is_young(head))
+ set_page_young(page_tail);
+ if (page_is_idle(head))
+ set_page_idle(page_tail);
+
+ /* ->mapping in first tail page is compound_mapcount */
+ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+ page_tail);
+ page_tail->mapping = head->mapping;
+
+ page_tail->index = head->index + tail;
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+ lru_add_page_tail(head, page_tail, lruvec, list);
+
+ return mapcount;
+}
+
+static void __split_huge_page(struct page *page, struct list_head *list)
+{
+ struct page *head = compound_head(page);
+ struct zone *zone = page_zone(head);
+ struct lruvec *lruvec;
+ int i, tail_mapcount;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(head, zone);
+
+ spin_lock(&split_queue_lock);
+ if (!list_empty(page_deferred_list(head))) {
+ split_queue_len--;
+ list_del(page_deferred_list(head));
+ }
+ spin_unlock(&split_queue_lock);
+
+ /* complete memcg works before add pages to LRU */
+ mem_cgroup_split_huge_fixup(head);
+
+ tail_mapcount = 0;
+ for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+ tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+ atomic_sub(tail_mapcount, &head->_count);
+
+ ClearPageCompound(head);
+ spin_unlock_irq(&zone->lru_lock);
+
+ unfreeze_page(page_anon_vma(head), head);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ struct page *subpage = head + i;
+ if (subpage == page)
+ continue;
+ unlock_page(subpage);
+
+ /*
+ * Subpages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(subpage);
+ }
+}
+
+/*
+ * This function splits a huge page into normal pages. @page can point to
+ * any subpage of the huge page to split; the split doesn't change the
+ * position of @page.
+ *
+ * The caller must hold a pin on the @page, otherwise the split fails with
+ * -EBUSY. The huge page must be locked.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * The GUP pin and PG_locked are transferred to @page. The rest of the
+ * subpages can be freed if they are not mapped.
+ *
+ * Returns 0 if the hugepage is split successfully.
+ * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+ * us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+ struct page *head = compound_head(page);
+ struct anon_vma *anon_vma;
+ int count, mapcount, ret;
+
+ VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+ /*
+ * The caller does not necessarily hold an mmap_sem that would prevent
+	 * the anon_vma disappearing so we first take a reference to it
+ * and then lock the anon_vma for write. This is similar to
+ * page_lock_anon_vma_read except the write lock is taken to serialise
+ * against parallel split or collapse operations.
+ */
+ anon_vma = page_get_anon_vma(head);
+ if (!anon_vma) {
+ ret = -EBUSY;
+ goto out;
+ }
+ anon_vma_lock_write(anon_vma);
+
+ /*
+	 * Racy check whether we can split the page, before freeze_page()
+	 * splits the PMDs.
+ */
+ if (total_mapcount(head) != page_count(head) - 1) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ freeze_page(anon_vma, head);
+ VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+ count = page_count(head);
+ mapcount = total_mapcount(head);
+ if (mapcount == count - 1) {
+ __split_huge_page(page, list);
+ ret = 0;
+ } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+ pr_alert("total_mapcount: %u, page_count(): %u\n",
+ mapcount, count);
+ if (PageTail(page))
+ dump_page(head, NULL);
+ dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+ BUG();
+ } else {
+ unfreeze_page(anon_vma, head);
+ ret = -EBUSY;
+ }
+
+out_unlock:
+ anon_vma_unlock_write(anon_vma);
+ put_anon_vma(anon_vma);
+out:
+ count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+ return ret;
+}
+
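The total_mapcount(head) != page_count(head) - 1 test encodes the pin rule:
each mapping holds one page reference and the caller holds exactly one pin, so
any surplus reference must be a foreign pin. A toy check of that arithmetic
with a hypothetical concurrent GUP pin:

    #include <assert.h>

    int main(void)
    {
            int mapcount = 3;       /* three mappings, one reference each */
            int caller_pin = 1;     /* the pin split_huge_page_to_list() requires */
            int gup_pin = 1;        /* hypothetical concurrent pin */
            int count = mapcount + caller_pin + gup_pin;

            assert(mapcount != count - 1);  /* surplus pin: split returns -EBUSY */
            return 0;
    }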
+void free_transhuge_page(struct page *page)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ if (!list_empty(page_deferred_list(page))) {
+ split_queue_len--;
+ list_del(page_deferred_list(page));
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+ free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+ unsigned long flags;
+
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ if (list_empty(page_deferred_list(page))) {
+ list_add_tail(page_deferred_list(page), &split_queue);
+ split_queue_len++;
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ /*
+	 * Splitting a page from split_queue will free up at least one page,
+	 * at most HPAGE_PMD_NR - 1. We don't track the exact number, so
+	 * let's use HPAGE_PMD_NR / 2 as a ballpark.
+ */
+ return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
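Worked example of the ballpark above, assuming 4K base pages so that
HPAGE_PMD_NR is 512 and each queued THP is reported as 256 reclaimable
objects:

    #include <stdio.h>

    int main(void)
    {
            unsigned long hpage_pmd_nr = 512;   /* 2M THP / 4K base pages */
            unsigned long queue_len = 100;      /* hypothetical split_queue_len */

            /* What deferred_split_count() would report for this queue. */
            printf("%lu\n", queue_len * hpage_pmd_nr / 2);  /* prints 25600 */
            return 0;
    }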
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ unsigned long flags;
+ LIST_HEAD(list), *pos, *next;
+ struct page *page;
+ int split = 0;
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ list_splice_init(&split_queue, &list);
+
+ /* Take pin on all head pages to avoid freeing them under us */
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+ page = compound_head(page);
+ /* race with put_compound_page() */
+ if (!get_page_unless_zero(page)) {
+ list_del_init(page_deferred_list(page));
+ split_queue_len--;
+ }
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+ lock_page(page);
+ /* split_huge_page() removes page from list on success */
+ if (!split_huge_page(page))
+ split++;
+ unlock_page(page);
+ put_page(page);
+ }
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ list_splice_tail(&list, &split_queue);
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+
+ return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+ .count_objects = deferred_split_count,
+ .scan_objects = deferred_split_scan,
+ .seeks = DEFAULT_SEEKS,
+};
#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
static void destroy_compound_gigantic_page(struct page *page,
- unsigned long order)
+ unsigned int order)
{
int i;
int nr_pages = 1 << order;
struct page *p = page + 1;
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
- __ClearPageTail(p);
+ clear_compound_head(p);
set_page_refcounted(p);
- p->first_page = NULL;
}
set_compound_order(page, 0);
__ClearPageHead(page);
}
-static void free_gigantic_page(struct page *page, unsigned order)
+static void free_gigantic_page(struct page *page, unsigned int order)
{
free_contig_range(page_to_pfn(page), 1 << order);
}
return zone_spans_pfn(zone, last_pfn);
}
-static struct page *alloc_gigantic_page(int nid, unsigned order)
+static struct page *alloc_gigantic_page(int nid, unsigned int order)
{
unsigned long nr_pages = 1 << order;
unsigned long ret, pfn, flags;
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+static void prep_compound_gigantic_page(struct page *page, unsigned int order);
static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
{
static inline bool gigantic_page_supported(void) { return true; }
#else
static inline bool gigantic_page_supported(void) { return false; }
-static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
- unsigned long order) { }
+ unsigned int order) { }
static inline int alloc_fresh_gigantic_page(struct hstate *h,
nodemask_t *nodes_allowed) { return 0; }
#endif
1 << PG_writeback);
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
- set_compound_page_dtor(page, NULL);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
set_page_refcounted(page);
if (hstate_is_gigantic(h)) {
destroy_compound_gigantic_page(page, huge_page_order(h));
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
INIT_LIST_HEAD(&page->lru);
- set_compound_page_dtor(page, free_huge_page);
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
spin_lock(&hugetlb_lock);
set_hugetlb_cgroup(page, NULL);
h->nr_huge_pages++;
put_page(page); /* free it into the hugepage allocator */
}
-static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+static void prep_compound_gigantic_page(struct page *page, unsigned int order)
{
int i;
int nr_pages = 1 << order;
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
- __SetPageHead(page);
__ClearPageReserved(page);
+ __SetPageHead(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
/*
* For gigantic hugepages allocated through bootmem at
*/
__ClearPageReserved(p);
set_page_count(p, 0);
- p->first_page = page;
- /* Make sure p->first_page is always valid for PageTail() */
- smp_wmb();
- __SetPageTail(p);
+ set_compound_head(p, page);
}
}
return 0;
page = compound_head(page);
- return get_compound_page_dtor(page) == free_huge_page;
+ return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);
dissolve_free_huge_page(pfn_to_page(pfn));
}
-static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
+/*
+ * There are 3 ways this can get called:
+ * 1. With vma+addr: we use the VMA's memory policy
+ * 2. With !vma, but nid=NUMA_NO_NODE: we try to allocate a huge
+ *    page from any node, and let the buddy allocator itself figure
+ *    it out.
+ * 3. With !vma, but nid!=NUMA_NO_NODE: we allocate a huge page
+ *    strictly from 'nid'.
+ */
+static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr, int nid)
+{
+ int order = huge_page_order(h);
+ gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
+ unsigned int cpuset_mems_cookie;
+
+ /*
+ * We need a VMA to get a memory policy. If we do not
+ * have one, we use the 'nid' argument.
+ *
+ * The mempolicy stuff below has some non-inlined bits
+ * and calls ->vm_ops. That makes it hard to optimize at
+ * compile-time, even when NUMA is off and it does
+ * nothing. This helps the compiler optimize it out.
+ */
+ if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
+ /*
+		 * If a specific node is explicitly requested,
+		 * make sure to get memory from there.
+ */
+ if (nid != NUMA_NO_NODE)
+ gfp |= __GFP_THISNODE;
+ /*
+ * Make sure to call something that can handle
+ * nid=NUMA_NO_NODE
+ */
+ return alloc_pages_node(nid, gfp, order);
+ }
+
+ /*
+ * OK, so we have a VMA. Fetch the mempolicy and try to
+ * allocate a huge page with it. We will only reach this
+ * when CONFIG_NUMA=y.
+ */
+ do {
+ struct page *page;
+ struct mempolicy *mpol;
+ struct zonelist *zl;
+ nodemask_t *nodemask;
+
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
+ mpol_cond_put(mpol);
+ page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
+ if (page)
+ return page;
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+ return NULL;
+}
+
+/*
+ * There are two ways to allocate a huge page:
+ * 1. When you have a VMA and an address (like a fault)
+ * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
+ *
+ * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in
+ * this case, which signifies that the allocation should be done
+ * respecting the VMA's memory policy.
+ *
+ * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
+ * implies that memory policies will not be taken into account.
+ */
+static struct page *__alloc_buddy_huge_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr, int nid)
{
struct page *page;
unsigned int r_nid;
if (hstate_is_gigantic(h))
return NULL;
+ if (vma || addr) {
+ VM_WARN_ON_ONCE(!addr || addr == -1);
+ VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
+ }
/*
* Assume we will successfully allocate the surplus page to
* prevent racing processes from causing the surplus to exceed
}
spin_unlock(&hugetlb_lock);
- if (nid == NUMA_NO_NODE)
- page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
- __GFP_REPEAT|__GFP_NOWARN,
- huge_page_order(h));
- else
- page = __alloc_pages_node(nid,
- htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
- __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
+ page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
spin_lock(&hugetlb_lock);
if (page) {
INIT_LIST_HEAD(&page->lru);
r_nid = page_to_nid(page);
- set_compound_page_dtor(page, free_huge_page);
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
set_hugetlb_cgroup(page, NULL);
/*
* We incremented the global counters already
return page;
}
+/*
+ * Allocate a huge page from 'nid'. Note, 'nid' may be
+ * NUMA_NO_NODE, which means that it may be allocated
+ * anywhere.
+ */
+static
+struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
+{
+ unsigned long addr = -1;
+
+ return __alloc_buddy_huge_page(h, NULL, addr, nid);
+}
+
+/*
+ * Use the VMA's mpolicy to allocate a huge page from the buddy.
+ */
+static
+struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+}
+
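A hedged sketch of how the two wrappers split the call sites converted in the hunks below: fault paths, which hold a VMA, use the mpol variant, while VMA-less contexts such as the nr_hugepages sysctl or soft-offlining use the nid variant:

	/* fault path: honour the VMA's memory policy */
	page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);

	/* VMA-less path: optionally pin to a node, or pass NUMA_NO_NODE */
	page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);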
/*
* This allocation function is useful in the context where vma is irrelevant.
* E.g. soft-offlining uses this function because it only cares physical
spin_unlock(&hugetlb_lock);
if (!page)
- page = alloc_buddy_huge_page(h, nid);
+ page = __alloc_buddy_huge_page_no_mpol(h, nid);
return page;
}
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
if (!page) {
alloc_ok = false;
break;
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
- page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
return 1;
}
-static void __init prep_compound_huge_page(struct page *page, int order)
+static void __init prep_compound_huge_page(struct page *page,
+ unsigned int order)
{
if (unlikely(order > (MAX_ORDER - 1)))
prep_compound_gigantic_page(page, order);
struct kobject *hugepages_kobj;
struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
};
-struct node_hstate node_hstates[MAX_NUMNODES];
+static struct node_hstate node_hstates[MAX_NUMNODES];
/*
* A subset of global hstate attributes for node devices
module_init(hugetlb_init);
/* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_add_hstate(unsigned order)
+void __init hugetlb_add_hstate(unsigned int order)
{
struct hstate *h;
unsigned long i;
1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
+{
+ seq_printf(m, "HugetlbPages:\t%8lu kB\n",
+ atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
+}
+
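Worked example for the shift above (assuming 4 KiB base pages, so PAGE_SHIFT = 12): one 2 MiB hugetlb page contributes 512 base pages to mm->hugetlb_usage, and

	512 << (PAGE_SHIFT - 10) = 512 << 2 = 2048 kB

which is the value the HugetlbPages line reports.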
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
- page_dup_rmap(ptepage);
+ page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
+ hugetlb_count_add(pages_per_huge_page(h), dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
if (huge_pte_dirty(pte))
set_page_dirty(page);
- page_remove_rmap(page);
+ hugetlb_count_sub(pages_per_huge_page(h), mm);
+ page_remove_rmap(page, true);
force_flush = !__tlb_remove_page(tlb, page);
if (force_flush) {
address += sz;
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, vma, address);
} else
- page_dup_rmap(page);
+ page_dup_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
+ hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
struct page *pagecache_page = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
+ struct inode *inode = file_inode(vma->vm_file);
int need_wait_lock = 0;
address &= huge_page_mask(h);
mapping = vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, vma, address);
+ /*
+ * page faults could race with fallocate hole punch. If a page
+ * is faulted between unmap and deallocation, it will still remain
+ * in the punched hole. During hole punch operations, a hugetlb_falloc
+ * structure will be pointed to by i_private. If this fault is for
+ * a page in a hole being punched, wait for the operation to finish
+ * before proceeding.
+ *
+ * Even with this strategy, it is still possible for a page fault to
+ * race with hole punch. In this case, remove_inode_hugepages() will
+	 * unmap the page and then remove it. Checking i_private as below
+	 * should catch most of these races, as we want to minimize unmapping
+	 * a page multiple times.
+ */
+ if (unlikely(inode->i_private)) {
+ struct hugetlb_falloc *hugetlb_falloc;
+
+ spin_lock(&inode->i_lock);
+ hugetlb_falloc = inode->i_private;
+ if (hugetlb_falloc && hugetlb_falloc->waitq &&
+ idx >= hugetlb_falloc->start &&
+ idx <= hugetlb_falloc->end) {
+ wait_queue_head_t *hugetlb_falloc_waitq;
+ DEFINE_WAIT(hugetlb_fault_wait);
+
+ hugetlb_falloc_waitq = hugetlb_falloc->waitq;
+ prepare_to_wait(hugetlb_falloc_waitq,
+ &hugetlb_fault_wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
+
+ spin_lock(&inode->i_lock);
+ finish_wait(hugetlb_falloc_waitq, &hugetlb_fault_wait);
+ }
+ spin_unlock(&inode->i_lock);
+ }
+
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page_foll(pages[i]);
+ get_page(pages[i]);
}
if (vmas)
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+ unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
/*
* match the virtual addresses, permission and the alignment of the
/*
* Add cgroup control files only if the huge page consists
* of more than two normal pages. This is because we use
- * page[2].lru.next for storing cgroup details.
+ * page[2].private for storing cgroup details.
*/
if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
__hugetlb_cgroup_file_init(hstate_index(h));
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned int flags, pte_t orig_pte);
+
+/*
+ * The set of flags that only affect watermark checking and reclaim
+ * behaviour. This is used by the MM to obey the caller constraints
+ * about IO, FS and watermark checking while ignoring placement
+ * hints such as HIGHMEM usage.
+ */
+#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
+ __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
+ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
+
+/* The GFP flags allowed during early boot */
+#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
+
+/* Control allocation cpuset and node placement constraints */
+#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
+
+/* Do not use these with a slab allocator */
+#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
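A hedged sketch of how these masks are intended to be applied to a caller-supplied gfp_mask:

	/* keep the caller's reclaim/IO/FS constraints, drop placement hints */
	gfp_t reclaim_gfp = gfp_mask & GFP_RECLAIM_MASK;

	/* keep only cpuset and node placement constraints */
	gfp_t constraint_gfp = gfp_mask & GFP_CONSTRAINT_MASK;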
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
set_page_count(page, 1);
}
-static inline void __get_page_tail_foll(struct page *page,
- bool get_page_head)
-{
- /*
- * If we're getting a tail page, the elevated page->_count is
- * required only in the head page and we will elevate the head
- * page->_count and tail page->_mapcount.
- *
- * We elevate page_tail->_mapcount for tail pages to force
- * page_tail->_count to be zero at all times to avoid getting
- * false positives from get_page_unless_zero() with
- * speculative page access (like in
- * page_cache_get_speculative()) on tail pages.
- */
- VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page);
- if (get_page_head)
- atomic_inc(&page->first_page->_count);
- get_huge_page_tail(page);
-}
-
-/*
- * This is meant to be called as the FOLL_GET operation of
- * follow_page() and it must be called while holding the proper PT
- * lock while the pte (or pmd_trans_huge) is still mapping the page.
- */
-static inline void get_page_foll(struct page *page)
-{
- if (unlikely(PageTail(page)))
- /*
- * This is safe only because
- * __split_huge_page_refcount() can't run under
- * get_page_foll() because we hold the proper PT lock.
- */
- __get_page_tail_foll(page, true);
- else {
- /*
- * Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
- */
- VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
- atomic_inc(&page->_count);
- }
-}
-
extern unsigned long highest_memmap_pfn;
/*
int classzone_idx;
int migratetype;
enum zone_type high_zoneidx;
+ bool spread_dirty_pages;
};
/*
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order);
-extern void prep_compound_page(struct page *page, unsigned long order);
+extern void prep_compound_page(struct page *page, unsigned int order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
#endif
* page cannot be allocated or merged in parallel. Alternatively, it must
* handle invalid values gracefully, and use page_order_unsafe() below.
*/
-static inline unsigned long page_order(struct page *page)
+static inline unsigned int page_order(struct page *page)
{
/* PageBuddy() must be checked by the caller */
return page_private(page);
extern void clear_page_mlock(struct page *page);
/*
- * mlock_migrate_page - called only from migrate_page_copy() to
- * migrate the Mlocked page flag; update statistics.
+ * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
+ * (because that does not go through the full procedure of migration ptes):
+ * to migrate the Mlocked page flag and update statistics.
*/
static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
if (TestClearPageMlocked(page)) {
- unsigned long flags;
int nr_pages = hpage_nr_pages(page);
- local_irq_save(flags);
+ /* Holding pmd lock, no change in irq context: __mod is safe */
__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
SetPageMlocked(newpage);
__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
- local_irq_restore(flags);
}
}
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern unsigned long vma_address(struct page *page,
- struct vm_area_struct *vma);
-#endif
+/*
+ * At what user virtual address is page expected in @vma?
+ */
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff = page_to_pgoff(page);
+ return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ unsigned long address = __vma_address(page, vma);
+
+ /* page should be within @vma mapping range */
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+
+ return address;
+}
+
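Worked example for __vma_address() (assuming 4 KiB pages): for a vma with vm_start = 0x400000 and vm_pgoff = 96, a page whose page_to_pgoff() is 100 maps at

	0x400000 + ((100 - 96) << PAGE_SHIFT) = 0x404000

and vma_address() additionally asserts that the result lies in [vm_start, vm_end).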
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
- * Some of code borrowed from https://github.com/xairy/linux by
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <adech.fo@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
if (memory_is_poisoned_1(addr + 1))
return true;
+ /*
+	 * If a single shadow byte covers the 2-byte access, we don't
+ * need to do anything more. Otherwise, test the first
+ * shadow byte.
+ */
if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
return false;
if (memory_is_poisoned_1(addr + 3))
return true;
+ /*
+	 * If a single shadow byte covers the 4-byte access, we don't
+ * need to do anything more. Otherwise, test the first
+ * shadow byte.
+ */
if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
return false;
if (memory_is_poisoned_1(addr + 7))
return true;
- if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
+ /*
+	 * If a single shadow byte covers the 8-byte access, we don't
+ * need to do anything more. Otherwise, test the first
+ * shadow byte.
+ */
+ if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
return false;
return unlikely(*(u8 *)shadow_addr);
if (unlikely(shadow_first_bytes))
return true;
- if (likely(IS_ALIGNED(addr, 8)))
+ /*
+	 * If two shadow bytes cover the 16-byte access, we don't
+ * need to do anything more. Otherwise, test the last
+ * shadow byte.
+ */
+ if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
return false;
return memory_is_poisoned_1(addr + 15);
s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
if (unlikely(ret != (unsigned long)last_shadow ||
- ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+ ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
return true;
}
return false;
static __always_inline void check_memory_region(unsigned long addr,
size_t size, bool write)
{
- struct kasan_access_info info;
-
if (unlikely(size == 0))
return;
if (unlikely((void *)addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
- info.access_addr = (void *)addr;
- info.access_size = size;
- info.is_write = write;
- info.ip = _RET_IP_;
- kasan_report_user_access(&info);
+ kasan_report(addr, size, write, _RET_IP_);
return;
}
static int __init kasan_memhotplug_init(void)
{
- pr_err("WARNING: KASan doesn't support memory hot-add\n");
+ pr_err("WARNING: KASAN doesn't support memory hot-add\n");
pr_err("Memory hot-add will be disabled\n");
hotplug_memory_notifier(kasan_mem_notifier, 0);
#endif
};
-void kasan_report_error(struct kasan_access_info *info);
-void kasan_report_user_access(struct kasan_access_info *info);
-
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{
return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
<< KASAN_SHADOW_SCALE_SHIFT);
}
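kasan_shadow_to_mem() is the inverse of the memory-to-shadow mapping; with KASAN_SHADOW_SCALE_SHIFT = 3, each shadow byte covers 8 bytes of memory. For reference, the forward direction looks like this (mirroring the in-tree kasan_mem_to_shadow()):

	static inline void *kasan_mem_to_shadow(const void *addr)
	{
		return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
			+ KASAN_SHADOW_OFFSET;
	}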
-static inline bool kasan_enabled(void)
+static inline bool kasan_report_enabled(void)
{
return !current->kasan_depth;
}
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
- * Some of code borrowed from https://github.com/xairy/linux by
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <adech.fo@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
#include <linux/string.h>
#include <linux/types.h>
#include <linux/kasan.h>
+#include <linux/module.h>
#include <asm/sections.h>
static void print_error_description(struct kasan_access_info *info)
{
- const char *bug_type = "unknown crash";
- u8 shadow_val;
+ const char *bug_type = "unknown-crash";
+ u8 *shadow_addr;
info->first_bad_addr = find_first_bad_addr(info->access_addr,
info->access_size);
- shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+ shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
- switch (shadow_val) {
- case KASAN_FREE_PAGE:
- case KASAN_KMALLOC_FREE:
- bug_type = "use after free";
+ /*
+ * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
+ * at the next shadow byte to determine the type of the bad access.
+ */
+ if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
+ shadow_addr++;
+
+ switch (*shadow_addr) {
+ case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+ /*
+ * In theory it's still possible to see these shadow values
+ * due to a data race in the kernel code.
+ */
+ bug_type = "out-of-bounds";
break;
case KASAN_PAGE_REDZONE:
case KASAN_KMALLOC_REDZONE:
+ bug_type = "slab-out-of-bounds";
+ break;
case KASAN_GLOBAL_REDZONE:
- case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
- bug_type = "out of bounds access";
+ bug_type = "global-out-of-bounds";
break;
case KASAN_STACK_LEFT:
case KASAN_STACK_MID:
case KASAN_STACK_RIGHT:
case KASAN_STACK_PARTIAL:
- bug_type = "out of bounds on stack";
+ bug_type = "stack-out-of-bounds";
+ break;
+ case KASAN_FREE_PAGE:
+ case KASAN_KMALLOC_FREE:
+ bug_type = "use-after-free";
break;
}
- pr_err("BUG: KASan: %s in %pS at addr %p\n",
+ pr_err("BUG: KASAN: %s in %pS at addr %p\n",
bug_type, (void *)info->ip,
info->access_addr);
pr_err("%s of size %zu by task %s/%d\n",
static inline bool kernel_or_module_addr(const void *addr)
{
- return (addr >= (void *)_stext && addr < (void *)_end)
- || (addr >= (void *)MODULES_VADDR
- && addr < (void *)MODULES_END);
+ if (addr >= (void *)_stext && addr < (void *)_end)
+ return true;
+ if (is_module_address((unsigned long)addr))
+ return true;
+ return false;
}
static inline bool init_task_stack_addr(const void *addr)
for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
const void *kaddr = kasan_shadow_to_mem(shadow_row);
char buffer[4 + (BITS_PER_LONG/8)*2];
+ char shadow_buf[SHADOW_BYTES_PER_ROW];
snprintf(buffer, sizeof(buffer),
(i == 0) ? ">%p: " : " %p: ", kaddr);
-
- kasan_disable_current();
+ /*
+		 * We should not pass a shadow pointer to a generic
+		 * function, because generic functions may try to
+		 * access the kasan mapping for the passed address.
+ */
+ memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW);
print_hex_dump(KERN_ERR, buffer,
DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
- shadow_row, SHADOW_BYTES_PER_ROW, 0);
- kasan_enable_current();
+ shadow_buf, SHADOW_BYTES_PER_ROW, 0);
if (row_is_guilty(shadow_row, shadow))
pr_err("%*c\n",
static DEFINE_SPINLOCK(report_lock);
-void kasan_report_error(struct kasan_access_info *info)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&report_lock, flags);
- pr_err("================================="
- "=================================\n");
- print_error_description(info);
- print_address_description(info);
- print_shadow_for_address(info->first_bad_addr);
- pr_err("================================="
- "=================================\n");
- spin_unlock_irqrestore(&report_lock, flags);
-}
-
-void kasan_report_user_access(struct kasan_access_info *info)
+static void kasan_report_error(struct kasan_access_info *info)
{
unsigned long flags;
+ const char *bug_type;
+ /*
+	 * Make sure we don't end up in a loop.
+ */
+ kasan_disable_current();
spin_lock_irqsave(&report_lock, flags);
pr_err("================================="
"=================================\n");
- pr_err("BUG: KASan: user-memory-access on address %p\n",
- info->access_addr);
- pr_err("%s of size %zu by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_size, current->comm, task_pid_nr(current));
- dump_stack();
+ if (info->access_addr <
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
+ if ((unsigned long)info->access_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if ((unsigned long)info->access_addr < TASK_SIZE)
+ bug_type = "user-memory-access";
+ else
+ bug_type = "wild-memory-access";
+ pr_err("BUG: KASAN: %s on address %p\n",
+ bug_type, info->access_addr);
+ pr_err("%s of size %zu by task %s/%d\n",
+ info->is_write ? "Write" : "Read",
+ info->access_size, current->comm,
+ task_pid_nr(current));
+ dump_stack();
+ } else {
+ print_error_description(info);
+ print_address_description(info);
+ print_shadow_for_address(info->first_bad_addr);
+ }
pr_err("================================="
"=================================\n");
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, flags);
+ kasan_enable_current();
}
void kasan_report(unsigned long addr, size_t size,
{
struct kasan_access_info info;
- if (likely(!kasan_enabled()))
+ if (likely(!kasan_report_enabled()))
return;
info.access_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
+
kasan_report_error(&info);
}
static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
{
unsigned long flags;
- struct kmemleak_object *object = NULL;
+ struct kmemleak_object *object;
rcu_read_lock();
read_lock_irqsave(&kmemleak_lock, flags);
return atomic_read(&mm->mm_users) == 0;
}
+/*
+ * If the mm isn't the one associated with the current
+ * ksm_scan.mm_slot, ksm_exit() will not down_write();up_write(), and
+ * in turn a ksm_test_exit() check run inside a mm->mmap_sem critical
+ * section will not prevent exit_mmap() from running under us. So in
+ * those cases where we could work with an "mm" that isn't guaranteed
+ * to be associated with the current ksm_scan.mm_slot, ksm_get_mm() is
+ * needed instead of the ksm_test_exit() run inside the mmap_sem.
+ * Returns true if mm_users was incremented, or false if we failed to
+ * take the mm because it was freed from under us. If it returns true,
+ * the caller must call mmput() after it finishes using the mm.
+ */
+static __always_inline bool ksm_get_mm(struct mm_struct *mm)
+{
+ return likely(atomic_inc_not_zero(&mm->mm_users));
+}
+
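The pattern the following hunks apply at each call site, sketched here for clarity:

	if (!ksm_get_mm(mm))
		return;			/* mm was freed from under us */
	down_read(&mm->mmap_sem);
	/* ... work on the mm's vmas ... */
	up_read(&mm->mmap_sem);
	mmput(mm);			/* drop the reference ksm_get_mm() took */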
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
unsigned long addr)
{
struct vm_area_struct *vma;
- if (ksm_test_exit(mm))
- return NULL;
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
return NULL;
*/
put_anon_vma(rmap_item->anon_vma);
+	/*
+	 * The "mm" of the unstable tree rmap_item isn't necessarily
+	 * associated with the current ksm_scan.mm_slot; it could be
+	 * any random mm. So we need ksm_get_mm() here to prevent
+	 * exit_mmap() from running under us in mmput().
+	 */
+ if (!ksm_get_mm(mm))
+ return;
+
down_read(&mm->mmap_sem);
vma = find_mergeable_vma(mm, addr);
if (vma)
break_ksm(vma, addr);
up_read(&mm->mmap_sem);
-}
-
-static struct page *page_trans_compound_anon(struct page *page)
-{
- if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
- /*
- * head may actually be splitted and freed from under
- * us but it's ok here.
- */
- if (PageAnon(head))
- return head;
- }
- return NULL;
+ mmput(mm);
}
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
struct vm_area_struct *vma;
struct page *page;
+	/*
+	 * The "mm" of the unstable tree rmap_item isn't necessarily
+	 * associated with the current ksm_scan.mm_slot; it could be
+	 * any random mm. So we need ksm_get_mm() here to prevent
+	 * exit_mmap() from running under us in mmput().
+	 */
+ if (!ksm_get_mm(mm))
+ return NULL;
+
down_read(&mm->mmap_sem);
vma = find_mergeable_vma(mm, addr);
if (!vma)
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page) || page_trans_compound_anon(page)) {
+ if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
out: page = NULL;
}
up_read(&mm->mmap_sem);
+ mmput(mm);
return page;
}
unlock_page(page);
put_page(page);
- if (stable_node->hlist.first)
+ if (!hlist_empty(&stable_node->hlist))
ksm_pages_sharing--;
else
ksm_pages_shared--;
}
get_page(kpage);
- page_add_anon_rmap(kpage, vma, addr);
+ page_add_anon_rmap(kpage, vma, addr, false);
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
return err;
}
-static int page_trans_compound_anon_split(struct page *page)
-{
- int ret = 0;
- struct page *transhuge_head = page_trans_compound_anon(page);
- if (transhuge_head) {
- /* Get the reference on the head to split it. */
- if (get_page_unless_zero(transhuge_head)) {
- /*
- * Recheck we got the reference while the head
- * was still anonymous.
- */
- if (PageAnon(transhuge_head))
- ret = split_huge_page(transhuge_head);
- else
- /*
- * Retry later if split_huge_page run
- * from under us.
- */
- ret = 1;
- put_page(transhuge_head);
- } else
- /* Retry later if split_huge_page run from under us. */
- ret = 1;
- }
- return ret;
-}
-
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
if (page == kpage) /* ksm page forked */
return 0;
- if (!(vma->vm_flags & VM_MERGEABLE))
- goto out;
- if (PageTransCompound(page) && page_trans_compound_anon_split(page))
- goto out;
- BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
*/
if (!trylock_page(page))
goto out;
+
+ if (PageTransCompound(page)) {
+ err = split_huge_page(page);
+ if (err)
+ goto out_unlock;
+ }
+
/*
* If this anonymous page is mapped only here, its pte may need
* to be write-protected. If it's mapped elsewhere, all of its
*/
set_page_stable_node(page, NULL);
mark_page_accessed(page);
+		/*
+		 * A stable page could be shared by several processes. After
+		 * CoW or zapping happens for every process except the last
+		 * one, the page table entry in that last process may have no
+		 * dirty bit. In that case, MADV_FREE could wrongly discard
+		 * the page. To prevent it, mark the stable page dirty.
+		 */
+ if (!PageDirty(page))
+ SetPageDirty(page);
err = 0;
} else if (pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte);
}
}
+out_unlock:
unlock_page(page);
out:
return err;
struct vm_area_struct *vma;
int err = -EFAULT;
+	/*
+	 * The "mm" of the unstable tree rmap_item isn't necessarily
+	 * associated with the current ksm_scan.mm_slot; it could be
+	 * any random mm. So we need ksm_get_mm() here to prevent
+	 * exit_mmap() from running under us in mmput(). Otherwise
+	 * rmap_item->anon_vma could point to an anon_vma that has
+	 * already been freed (i.e. get_anon_vma() below would run too
+	 * late).
+	 */
+ if (!ksm_get_mm(mm))
+ return err;
+
down_read(&mm->mmap_sem);
- if (ksm_test_exit(mm))
- goto out;
- vma = find_vma(mm, rmap_item->address);
- if (!vma || vma->vm_start > rmap_item->address)
+ vma = find_mergeable_vma(mm, rmap_item->address);
+ if (!vma)
goto out;
err = try_to_merge_one_page(vma, page, kpage);
get_anon_vma(vma->anon_vma);
out:
up_read(&mm->mmap_sem);
+ mmput(mm);
return err;
}
stable_node = rb_entry(*new, struct stable_node, node);
tree_page = get_ksm_page(stable_node, false);
if (!tree_page)
- return NULL;
+ /*
+ * If we walked over a stale stable_node,
+ * get_ksm_page() will call rb_erase() and it
+ * may rebalance the tree from under us. So
+ * restart the search from scratch. Returning
+ * NULL would be safe too, but we'd generate
+ * false negative insertions just because some
+ * stable_node was stale which would waste CPU
+ * by doing the preparation work twice at the
+ * next KSM pass.
+ */
+ goto again;
ret = memcmp_pages(page, tree_page);
put_page(tree_page);
unsigned long kpfn;
struct rb_root *root;
struct rb_node **new;
- struct rb_node *parent = NULL;
+ struct rb_node *parent;
struct stable_node *stable_node;
kpfn = page_to_pfn(kpage);
nid = get_kpfn_nid(kpfn);
root = root_stable_tree + nid;
+again:
+ parent = NULL;
new = &root->rb_node;
while (*new) {
stable_node = rb_entry(*new, struct stable_node, node);
tree_page = get_ksm_page(stable_node, false);
if (!tree_page)
- return NULL;
+ /*
+ * If we walked over a stale stable_node,
+ * get_ksm_page() will call rb_erase() and it
+ * may rebalance the tree from under us. So
+ * restart the search from scratch. Returning
+ * NULL would be safe too, but we'd generate
+ * false negative insertions just because some
+ * stable_node was stale which would waste CPU
+ * by doing the preparation work twice at the
+ * next KSM pass.
+ */
+ goto again;
ret = memcmp_pages(kpage, tree_page);
put_page(tree_page);
cond_resched();
tree_rmap_item = rb_entry(*new, struct rmap_item, node);
tree_page = get_mergeable_page(tree_rmap_item);
- if (IS_ERR_OR_NULL(tree_page))
+ if (!tree_page)
return NULL;
/*
cond_resched();
continue;
}
- if (PageAnon(*page) ||
- page_trans_compound_anon(*page)) {
+ if (PageAnon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
+ cond_resched();
anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
+ cond_resched();
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
#ifdef CONFIG_MEMCG_KMEM
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
+ /*
+	 * This needs node 0 to always be present, even
+	 * on systems supporting sparse NUMA ids.
+ */
return !!lru->node[0].memcg_lrus;
}
return &nlru->lru;
}
+static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+ struct page *page;
+
+ if (!memcg_kmem_enabled())
+ return NULL;
+ page = virt_to_head_page(ptr);
+ return page->mem_cgroup;
+}
+
static inline struct list_lru_one *
list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
{
{
int i;
- for (i = 0; i < nr_node_ids; i++) {
- if (!memcg_aware)
- lru->node[i].memcg_lrus = NULL;
- else if (memcg_init_list_lru_node(&lru->node[i]))
+ if (!memcg_aware)
+ return 0;
+
+ for_each_node(i) {
+ if (memcg_init_list_lru_node(&lru->node[i]))
goto fail;
}
return 0;
fail:
- for (i = i - 1; i >= 0; i--)
+ for (i = i - 1; i >= 0; i--) {
+ if (!lru->node[i].memcg_lrus)
+ continue;
memcg_destroy_list_lru_node(&lru->node[i]);
+ }
return -ENOMEM;
}
if (!list_lru_memcg_aware(lru))
return;
- for (i = 0; i < nr_node_ids; i++)
+ for_each_node(i)
memcg_destroy_list_lru_node(&lru->node[i]);
}
if (!list_lru_memcg_aware(lru))
return 0;
- for (i = 0; i < nr_node_ids; i++) {
+ for_each_node(i) {
if (memcg_update_list_lru_node(&lru->node[i],
old_size, new_size))
goto fail;
}
return 0;
fail:
- for (i = i - 1; i >= 0; i--)
+ for (i = i - 1; i >= 0; i--) {
+ if (!lru->node[i].memcg_lrus)
+ continue;
+
memcg_cancel_update_list_lru_node(&lru->node[i],
old_size, new_size);
+ }
return -ENOMEM;
}
if (!list_lru_memcg_aware(lru))
return;
- for (i = 0; i < nr_node_ids; i++)
+ for_each_node(i)
memcg_cancel_update_list_lru_node(&lru->node[i],
old_size, new_size);
}
if (!list_lru_memcg_aware(lru))
return;
- for (i = 0; i < nr_node_ids; i++)
+ for_each_node(i)
memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
}
if (!lru->node)
goto out;
- for (i = 0; i < nr_node_ids; i++) {
+ for_each_node(i) {
spin_lock_init(&lru->node[i].lock);
if (key)
lockdep_set_class(&lru->node[i].lock, key);
*
* Safely read from address @src to the buffer at @dst. If a kernel fault
* happens, handle that and return -EFAULT.
+ *
+ * We ensure that the copy_from_user is executed in atomic context so that
+ * do_page_fault() doesn't attempt to take mmap_sem. This makes
+ * probe_kernel_read() suitable for use within regions where the caller
+ * already holds mmap_sem, or other locks which nest inside mmap_sem.
*/
long __weak probe_kernel_read(void *dst, const void *src, size_t size)
pagefault_enable();
set_fs(old_fs);
- return ret < 0 ? ret : src - unsafe_addr;
+ return ret ? -EFAULT : src - unsafe_addr;
}
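A hedged usage sketch for probe_kernel_read(), peeking at an address that may not be mapped:

	unsigned long val;

	if (probe_kernel_read(&val, ptr, sizeof(val)))
		pr_info("%p is not readable\n", ptr);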
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
return 0;
}
+static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+
+{
+ struct mmu_gather *tlb = walk->private;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ spinlock_t *ptl;
+ pte_t *pte, ptent;
+ struct page *page;
+ swp_entry_t entry;
+ unsigned long next;
+ int nr_swap = 0;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ split_huge_pmd(vma, pmd, addr);
+ else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr))
+ goto next;
+ /* fall through */
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+ /*
+		 * If the pte holds a swap entry, just clear the page table
+		 * entry: swapping the page back in would be more expensive
+		 * than (page allocation + zeroing).
+ */
+ if (!pte_present(ptent)) {
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry))
+ continue;
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ continue;
+ }
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ if (PageSwapCache(page) || PageDirty(page)) {
+ if (!trylock_page(page))
+ continue;
+ /*
+			 * If the page is shared with others, we can't clear
+			 * PG_dirty of the page.
+ */
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (PageSwapCache(page) && !try_to_free_swap(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ClearPageDirty(page);
+ unlock_page(page);
+ }
+
+ /*
+		 * Some architectures (e.g. PPC) don't update the TLB
+		 * with set_pte_at and tlb_remove_tlb_entry, so for
+		 * portability, re-install the pte as old|clean
+		 * after clearing it.
+ */
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ ptent = pte_mkclean(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ if (PageActive(page))
+ deactivate_page(page);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ if (nr_swap) {
+ if (current->mm == mm)
+ sync_mm_rss(mm);
+
+ add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte - 1, ptl);
+next:
+ cond_resched();
+ return 0;
+}
+
+static void madvise_free_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct mm_walk free_walk = {
+ .pmd_entry = madvise_free_pte_range,
+ .mm = vma->vm_mm,
+ .private = tlb,
+ };
+
+ BUG_ON(addr >= end);
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &free_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static int madvise_free_single_vma(struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ unsigned long start, end;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ return -EINVAL;
+
+ /* MADV_FREE works for only anon vma at the moment */
+ if (!vma_is_anonymous(vma))
+ return -EINVAL;
+
+ start = max(vma->vm_start, start_addr);
+ if (start >= vma->vm_end)
+ return -EINVAL;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start, end);
+ update_hiwater_rss(mm);
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ madvise_free_page_range(&tlb, vma, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+
+ return 0;
+}
+
+static long madvise_free(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ return madvise_free_single_vma(vma, start, end);
+}
+
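From userspace, MADV_FREE is a cheaper hint than MADV_DONTNEED for allocators: the kernel may reclaim the pages lazily, and a write to the range cancels the hint. A minimal sketch, assuming the libc headers expose the new constant:

	#include <sys/mman.h>

	/* buf's contents are disposable until the next write */
	madvise(buf, len, MADV_FREE);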
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_FREE:
+ /*
+		 * XXX: In this implementation, MADV_FREE works like
+		 * MADV_DONTNEED on a swapless system or when swap is full.
+ */
+ if (get_nr_swap_pages() > 0)
+ return madvise_free(vma, prev, start, end);
+ /* passthrough */
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
return 0;
}
-int __init_memblock memblock_remove_range(struct memblock_type *type,
+static int __init_memblock memblock_remove_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size)
{
int start_rgn, end_rgn;
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
+#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- int nr_pages)
+ bool compound, int nr_pages)
{
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
nr_pages);
+ }
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- if (!current->memcg_oom.may_oom)
+ if (!current->memcg_may_oom)
return;
/*
* We are in the middle of the charge context here, so we
* and when we know whether the fault was overall successful.
*/
css_get(&memcg->css);
- current->memcg_oom.memcg = memcg;
- current->memcg_oom.gfp_mask = mask;
- current->memcg_oom.order = order;
+ current->memcg_in_oom = memcg;
+ current->memcg_oom_gfp_mask = mask;
+ current->memcg_oom_order = order;
}
/**
*/
bool mem_cgroup_oom_synchronize(bool handle)
{
- struct mem_cgroup *memcg = current->memcg_oom.memcg;
+ struct mem_cgroup *memcg = current->memcg_in_oom;
struct oom_wait_info owait;
bool locked;
if (locked && !memcg->oom_kill_disable) {
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
- current->memcg_oom.order);
+ mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+ current->memcg_oom_order);
} else {
schedule();
mem_cgroup_unmark_under_oom(memcg);
memcg_oom_recover(memcg);
}
cleanup:
- current->memcg_oom.memcg = NULL;
+ current->memcg_in_oom = NULL;
css_put(&memcg->css);
return true;
}
return NOTIFY_OK;
}
+/*
+ * Scheduled by try_charge() to be executed from the userland return
+ * path, where it reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+ unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ struct mem_cgroup *memcg, *pos;
+
+ if (likely(!nr_pages))
+ return;
+
+ pos = memcg = get_mem_cgroup_from_mm(current->mm);
+
+ do {
+ if (page_counter_read(&pos->memory) <= pos->high)
+ continue;
+ mem_cgroup_events(pos, MEMCG_HIGH, 1);
+ try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+ } while ((pos = parent_mem_cgroup(pos)));
+
+ css_put(&memcg->css);
+ current->memcg_nr_pages_over_high = 0;
+}
+
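The handler is driven through the TIF_NOTIFY_RESUME machinery; a hedged sketch of the hook-up on the return-to-userland path (hence the tracehook include added above):

	/* in tracehook_notify_resume(), just before returning to userland */
	mem_cgroup_handle_over_high();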
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- int ret = 0;
if (mem_cgroup_is_root(memcg))
- goto done;
+ return 0;
retry:
if (consume_stock(memcg, nr_pages))
- goto done;
+ return 0;
if (!do_swap_account ||
!page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (unlikely(test_thread_flag(TIF_MEMDIE) ||
fatal_signal_pending(current) ||
current->flags & PF_EXITING))
- goto bypass;
+ goto force;
if (unlikely(task_in_memcg_oom(current)))
goto nomem;
- if (!(gfp_mask & __GFP_WAIT))
+ if (!gfpflags_allow_blocking(gfp_mask))
goto nomem;
mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
goto retry;
if (gfp_mask & __GFP_NOFAIL)
- goto bypass;
+ goto force;
if (fatal_signal_pending(current))
- goto bypass;
+ goto force;
mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
- mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
+ mem_cgroup_oom(mem_over_limit, gfp_mask,
+ get_order(nr_pages * PAGE_SIZE));
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
-bypass:
- return -EINTR;
+force:
+ /*
+ * The allocation either can't fail or will lead to more memory
+	 * being freed very soon. Allow memory usage to go over the limit
+ * temporarily by force charging it.
+ */
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_swap_account)
+ page_counter_charge(&memcg->memsw, nr_pages);
+ css_get_many(&memcg->css, nr_pages);
+
+ return 0;
done_restock:
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
- if (!(gfp_mask & __GFP_WAIT))
- goto done;
+
/*
- * If the hierarchy is above the normal consumption range,
- * make the charging task trim their excess contribution.
+	 * If the hierarchy is above the normal consumption range, schedule
+	 * reclaim on returning to userland. We could perform reclaim here
+	 * if __GFP_WAIT allowed it, but let's always punt for simplicity and
+	 * so that GFP_KERNEL can consistently be used during reclaim. @memcg
+	 * is not recorded as it most likely matches current's and won't
+	 * change in the meantime. As the high limit is checked again before
+	 * reclaim, the cost of a mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) <= memcg->high)
- continue;
- mem_cgroup_events(memcg, MEMCG_HIGH, 1);
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+ if (page_counter_read(&memcg->memory) > memcg->high) {
+ current->memcg_nr_pages_over_high += nr_pages;
+ set_notify_resume(current);
+ break;
+ }
} while ((memcg = parent_mem_cgroup(memcg)));
-done:
- return ret;
+
+ return 0;
}
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
}
#ifdef CONFIG_MEMCG_KMEM
-int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned long nr_pages)
-{
- struct page_counter *counter;
- int ret = 0;
-
- ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
- if (ret < 0)
- return ret;
-
- ret = try_charge(memcg, gfp, nr_pages);
- if (ret == -EINTR) {
- /*
- * try_charge() chose to bypass to root due to OOM kill or
- * fatal signal. Since our only options are to either fail
- * the allocation or charge it to this cgroup, do it as a
- * temporary condition. But we can't fail. From a kmem/slab
- * perspective, the cache has already been selected, by
- * mem_cgroup_kmem_get_cache(), so it is too late to change
- * our minds.
- *
- * This condition will only trigger if the task entered
- * memcg_charge_kmem in a sane state, but was OOM-killed
- * during try_charge() above. Tasks that were already dying
- * when the allocation triggers should have been already
- * directed to the root cgroup in memcontrol.h
- */
- page_counter_charge(&memcg->memory, nr_pages);
- if (do_swap_account)
- page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
- ret = 0;
- } else if (ret)
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- return ret;
-}
-
-void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
-{
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_swap_account)
- page_counter_uncharge(&memcg->memsw, nr_pages);
-
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- css_put_many(&memcg->css, nr_pages);
-}
-
static int memcg_alloc_cache_id(void)
{
int id, size;
css_put(&cachep->memcg_params.memcg->css);
}
-/*
- * We need to verify if the allocation against current->mm->owner's memcg is
- * possible for the given order. But the page is not allocated yet, so we'll
- * need a further commit step to do the final arrangements.
- *
- * It is possible for the task to switch cgroups in this mean time, so at
- * commit time, we can't rely on task conversion any longer. We'll then use
- * the handle argument to return to the caller which cgroup we should commit
- * against. We could also return the memcg directly and avoid the pointer
- * passing, but a boolean return value gives better semantics considering
- * the compiled-out case as well.
- *
- * Returning true means the allocation is possible.
- */
-bool
-__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+ struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg;
- int ret;
+ unsigned int nr_pages = 1 << order;
+ struct page_counter *counter;
+ int ret = 0;
- *_memcg = NULL;
+ if (!memcg_kmem_is_active(memcg))
+ return 0;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
+ if (ret)
+ return ret;
- if (!memcg_kmem_is_active(memcg)) {
- css_put(&memcg->css);
- return true;
+ ret = try_charge(memcg, gfp, nr_pages);
+ if (ret) {
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+ return ret;
}
- ret = memcg_charge_kmem(memcg, gfp, 1 << order);
- if (!ret)
- *_memcg = memcg;
+ page->mem_cgroup = memcg;
- css_put(&memcg->css);
- return (ret == 0);
+ return 0;
}
-void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
- int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
- VM_BUG_ON(mem_cgroup_is_root(memcg));
+ struct mem_cgroup *memcg;
+ int ret;
- /* The page allocation failed. Revert */
- if (!page) {
- memcg_uncharge_kmem(memcg, 1 << order);
- return;
- }
- page->mem_cgroup = memcg;
+ memcg = get_mem_cgroup_from_mm(current->mm);
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ css_put(&memcg->css);
+ return ret;
}
-void __memcg_kmem_uncharge_pages(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
+ unsigned int nr_pages = 1 << order;
if (!memcg)
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- memcg_uncharge_kmem(memcg, 1 << order);
- page->mem_cgroup = NULL;
-}
-
-struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
-{
- struct mem_cgroup *memcg = NULL;
- struct kmem_cache *cachep;
- struct page *page;
-
- page = virt_to_head_page(ptr);
- if (PageSlab(page)) {
- cachep = page->slab_cache;
- if (!is_root_cache(cachep))
- memcg = cachep->memcg_params.memcg;
- } else
- /* page allocated by alloc_kmem_pages */
- memcg = page->mem_cgroup;
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_swap_account)
+ page_counter_uncharge(&memcg->memsw, nr_pages);
- return memcg;
+ page->mem_cgroup = NULL;
+ css_put_many(&memcg->css, nr_pages);
}
#endif /* CONFIG_MEMCG_KMEM */
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock, 'splitting on pmd' and compound_lock.
- * charge/uncharge will be never happen and move_account() is done under
- * compound_lock(), so we don't have to take care of races.
+ * zone->lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
{
int ret;
- /* Try a single bulk charge without reclaim first */
- ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+ /* Try a single bulk charge without reclaim first, kswapd may wake */
+ ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
if (!ret) {
mc.precharge += count;
return ret;
}
- if (ret == -EINTR) {
- cancel_charge(root_mem_cgroup, count);
- return ret;
- }
/* Try charges one by one with reclaim */
while (count--) {
ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
- /*
- * In case of failure, any residual charges against
- * mc.to will be dropped by mem_cgroup_clear_mc()
- * later on. However, cancel any charges that are
- * bypassed to root right away or they'll be lost.
- */
- if (ret == -EINTR)
- cancel_charge(root_mem_cgroup, 1);
if (ret)
return ret;
mc.precharge++;
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
+ * The caller must make sure the page is not on LRU (isolate_page() is useful.)
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
*/
static int mem_cgroup_move_account(struct page *page,
- unsigned int nr_pages,
+ bool compound,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
unsigned long flags;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
bool anon;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- ret = -EBUSY;
- if (nr_pages > 1 && !PageTransHuge(page))
- goto out;
+ VM_BUG_ON(compound && !PageTransHuge(page));
/*
- * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
- * of its source page while we change it: page migration takes
- * both pages off the LRU, but page cache replacement doesn't.
+ * Prevent mem_cgroup_replace_page() from looking at
+ * page->mem_cgroup of its source page while we change it.
*/
+ ret = -EBUSY;
if (!trylock_page(page))
goto out;
ret = 0;
local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
+ mem_cgroup_charge_statistics(to, page, compound, nr_pages);
memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
+ mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
memcg_check_events(from, page);
local_irq_enable();
out_unlock:
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(ptl);
union mc_target target;
struct page *page;
- /*
- * We don't take compound_lock() here but no race with splitting thp
- * happens because:
- * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
- * under splitting, which means there's no concurrent thp split,
- * - if another thread runs into split_huge_page() just after we
- * entered this if-block, the thread must wait for page table lock
- * to be unlocked in __split_huge_page_splitting(), where the main
- * part of thp split is not executed yet.
- */
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (mc.precharge < HPAGE_PMD_NR) {
spin_unlock(ptl);
return 0;
if (target_type == MC_TARGET_PAGE) {
page = target.page;
if (!isolate_lru_page(page)) {
- if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+ if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
page = target.page;
if (isolate_lru_page(page))
goto put;
- if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
+ if (!mem_cgroup_move_account(page, false,
+ mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+ return page_counter_read(&mem_cgroup_from_css(css)->memory);
}
static int memory_low_show(struct seq_file *m, void *v)
static struct cftype memory_files[] = {
{
.name = "current",
+ .flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_current_read,
},
{
* with mem_cgroup_cancel_charge() in case page instantiation fails.
*/
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp)
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound)
{
struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret = 0;
if (mem_cgroup_disabled())
}
}
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
ret = try_charge(memcg, gfp_mask, nr_pages);
css_put(&memcg->css);
-
- if (ret == -EINTR) {
- memcg = root_mem_cgroup;
- ret = 0;
- }
out:
*memcgp = memcg;
return ret;
* Use mem_cgroup_cancel_charge() to cancel the transaction instead.
*/
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare)
+ bool lrucare, bool compound)
{
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
VM_BUG_ON_PAGE(!page->mapping, page);
VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
commit_charge(page, memcg, lrucare);
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
*
* Cancel a charge transaction started by mem_cgroup_try_charge().
*/
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+ bool compound)
{
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
if (mem_cgroup_disabled())
return;
if (!memcg)
return;
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
cancel_charge(memcg, nr_pages);
}
}
/**
- * mem_cgroup_migrate - migrate a charge to another page
+ * mem_cgroup_replace_page - migrate a charge to another page
* @oldpage: currently charged page
* @newpage: page to transfer the charge to
* @lrucare: either or both pages might be on the LRU already
*
* Both pages must be locked, @newpage->mapping must be set up.
*/
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
- bool lrucare)
+void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
int isolated;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
- VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
- VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
newpage);
if (newpage->mem_cgroup)
return;
- /*
- * Swapcache readahead pages can get migrated before being
- * charged, and migration from compaction can happen to an
- * uncharged page when the PFN walker finds a page that
- * reclaim just put back on the LRU but has not released yet.
- */
+ /* Swapcache readahead pages can get replaced before being charged */
memcg = oldpage->mem_cgroup;
if (!memcg)
return;
- if (lrucare)
- lock_page_lru(oldpage, &isolated);
-
+ lock_page_lru(oldpage, &isolated);
oldpage->mem_cgroup = NULL;
+ unlock_page_lru(oldpage, isolated);
- if (lrucare)
- unlock_page_lru(oldpage, isolated);
-
- commit_charge(newpage, memcg, lrucare);
+ commit_charge(newpage, memcg, true);
}
/*
 * only synchronisation we have for updating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, -1);
+ mem_cgroup_charge_statistics(memcg, page, false, -1);
memcg_check_events(memcg, page);
}
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
+#include <linux/ratelimit.h>
#include "internal.h"
#include "ras/ras_event.h"
#define lru (1UL << PG_lru)
#define swapbacked (1UL << PG_swapbacked)
#define head (1UL << PG_head)
-#define tail (1UL << PG_tail)
-#define compound (1UL << PG_compound)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
*/
{ slab, slab, MF_MSG_SLAB, me_kernel },
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
{ head, head, MF_MSG_HUGE, me_huge_page },
- { tail, tail, MF_MSG_HUGE, me_huge_page },
-#else
- { compound, compound, MF_MSG_HUGE, me_huge_page },
-#endif
{ sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
{ sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
}
if (!PageHuge(p) && PageTransHuge(hpage)) {
+ lock_page(hpage);
if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+ unlock_page(hpage);
if (!PageAnon(hpage))
pr_err("MCE: %#lx: non anonymous thp\n", pfn);
else
put_hwpoison_page(p);
return -EBUSY;
}
+ unlock_page(hpage);
VM_BUG_ON_PAGE(!page_count(p), p);
hpage = compound_head(p);
}
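With the new THP refcounting, split_huge_page() is called on a locked page at every call site touched by this series. The recurring pattern, sketched:

    lock_page(hpage);
    ret = split_huge_page(hpage);
    unlock_page(hpage);
    if (ret) {
            /* split failed: hpage is still a THP, handle or bail out */
    }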
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
- * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageLocked()
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
* The check (unnecessarily) ignores LRU pages being isolated and
 * walked by the page reclaim code, but that's not a big loss.
}
core_initcall(memory_failure_init);
+#define unpoison_pr_info(fmt, pfn, rs) \
+({ \
+ if (__ratelimit(rs)) \
+ pr_info(fmt, pfn); \
+})
+
/**
* unpoison_memory - Unpoison a previously poisoned page
* @pfn: Page number of the to be unpoisoned page
struct page *p;
int freeit = 0;
unsigned int nr_pages;
+ static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (!pfn_valid(pfn))
return -ENXIO;
page = compound_head(p);
if (!PageHWPoison(p)) {
- pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
+ unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
if (page_count(page) > 1) {
- pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
+ unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
if (page_mapped(page)) {
- pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
+ unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
if (page_mapping(page)) {
- pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
- pfn);
+ unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
* In such case, we yield to memory_failure() and make unpoison fail.
*/
if (!PageHuge(page) && PageTransHuge(page)) {
- pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
+ unpoison_pr_info("MCE: Memory failure is now running on %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
* to the end.
*/
if (PageHuge(page)) {
- pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
- pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n",
+ pfn, &unpoison_rs);
return 0;
}
* the free buddy page pool.
*/
if (TestClearPageHWPoison(page)) {
- pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
+ unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n",
+ pfn, &unpoison_rs);
num_poisoned_pages_sub(nr_pages);
freeit = 1;
if (PageHuge(page))
return -EBUSY;
}
if (!PageHuge(page) && PageTransHuge(hpage)) {
- if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+ lock_page(page);
+ ret = split_huge_page(hpage);
+ unlock_page(page);
+ if (unlikely(ret)) {
pr_info("soft offline: %#lx: failed to split THP\n",
pfn);
if (flags & MF_COUNT_INCREASED)
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
- int wait_split_huge_page;
if (!new)
return -ENOMEM;
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
ptl = pmd_lock(mm, pmd);
- wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
atomic_long_inc(&mm->nr_ptes);
pmd_populate(mm, pmd, new);
new = NULL;
- } else if (unlikely(pmd_trans_splitting(*pmd)))
- wait_split_huge_page = 1;
+ }
spin_unlock(ptl);
if (new)
pte_free(mm, new);
- if (wait_split_huge_page)
- wait_split_huge_page(vma->anon_vma, pmd);
return 0;
}
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
- } else
- VM_BUG_ON(pmd_trans_splitting(*pmd));
+ }
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
- page_dup_rmap(page);
+ page_dup_rmap(page, false);
if (PageAnon(page))
rss[MM_ANONPAGES]++;
else
mark_page_accessed(page);
rss[MM_FILEPAGES]--;
}
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(!__tlb_remove_page(tlb, page))) {
BUG();
}
#endif
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
cow_user_page(new_page, old_page, address, vma);
}
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_new;
__SetPageUptodate(new_page);
* thread doing COW.
*/
ptep_clear_flush_notify(vma, address, page_table);
- page_add_new_anon_rmap(new_page, vma, address);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, address, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, false);
}
/* Free the old page.. */
new_page = old_page;
page_copied = 1;
} else {
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
}
if (new_page)
*/
if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
- munlock_vma_page(old_page);
+ if (PageMlocked(old_page))
+ munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
goto out_page;
}
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
- exclusive = 1;
+ exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(orig_pte))
set_pte_at(mm, address, page_table, pte);
if (page == swapcache) {
do_page_add_anon_rmap(page, vma, address, exclusive);
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, address);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
pte_unmap_unlock(page_table, ptl);
out_page:
unlock_page(page);
if (!page)
goto oom;
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_page;
/*
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
return handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
}
inc_mm_counter_fast(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(mm, address, page_table, entry);
pte_unmap_unlock(page_table, ptl);
return 0;
release:
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
goto unlock;
oom_free_page:
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
+ page_add_new_anon_rmap(page, vma, address, false);
} else {
inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
page_add_file_rmap(page);
if (!new_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
page_cache_release(new_page);
return VM_FAULT_OOM;
}
goto uncharge_out;
}
do_set_pte(vma, address, new_page, pte, true, true);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
if (fault_page) {
}
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
page_cache_release(new_page);
return ret;
}
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = fault_page->mapping;
+ mapping = page_rmapping(fault_page);
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
return 0;
}
+ /* TODO: handle PTE-mapped THP */
+ if (PageCompound(page)) {
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+ }
+
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
* much anyway since they can be in shared cache state. This misses
if (pmd_trans_huge(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
- /*
- * If the pmd is splitting, return and retry the
- * the fault. Alternative: wait until the split
- * is done, and goto retry.
- */
- if (pmd_trans_splitting(orig_pmd))
- return 0;
-
if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
unsigned long start_pfn, unsigned long num_pages)
{
if (!zone_is_initialized(zone))
- return init_currently_empty_zone(zone, start_pfn, num_pages,
- MEMMAP_HOTPLUG);
+ return init_currently_empty_zone(zone, start_pfn, num_pages);
+
return 0;
}
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int nid;
+ int nid, ret;
pte_t *pte;
spinlock_t *ptl;
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_trans_unstable(pmd))
- return 0;
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(walk->mm, pmd);
+ if (pmd_trans_huge(*pmd)) {
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, addr);
+ } else {
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return 0;
+ }
+ }
+ }
+retry:
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
nid = page_to_nid(page);
if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
continue;
+ if (PageTail(page) && PageAnon(page)) {
+ get_page(page);
+ pte_unmap_unlock(pte, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ /* Failed to split -- skip. */
+ if (ret) {
+ pte = pte_offset_map_lock(walk->mm, pmd,
+ addr, &ptl);
+ continue;
+ }
+ goto retry;
+ }
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, qp->pagelist, flags);
gfp_t gfp_temp;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
- might_sleep_if(gfp_mask & __GFP_WAIT);
+ might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
- gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
+ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
repeat_alloc:
}
/*
- * We use gfp mask w/o __GFP_WAIT or IO for the first round. If
+ * We use gfp mask w/o direct reclaim or IO for the first round. If
* alloc failed with that and @pool was empty, retry immediately.
*/
if (gfp_temp != gfp_mask) {
goto repeat_alloc;
}
- /* We must not sleep if !__GFP_WAIT */
- if (!(gfp_mask & __GFP_WAIT)) {
+ /* We must not sleep if !__GFP_DIRECT_RECLAIM */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
spin_unlock_irqrestore(&pool->lock, flags);
return NULL;
}
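__GFP_WAIT has been split up, and mempool's opportunistic first pass now masks __GFP_DIRECT_RECLAIM instead. A condensed sketch of the two-round policy (the real mempool_alloc() loops via repeat_alloc):

    /* first round: no direct reclaim, no I/O -- cheap and non-blocking */
    gfp_t gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);

    element = pool->alloc(gfp_temp, pool->pool_data);
    if (!element && gfp_temp != gfp_mask)
            /* second round: the caller's full mask, may reclaim and block */
            element = pool->alloc(gfp_mask, pool->pool_data);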
/*
- * Memory Migration functionality - linux/mm/migration.c
+ * Memory Migration functionality - linux/mm/migrate.c
*
* Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
*
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
-#include <linux/memcontrol.h>
+#include <linux/backing-dev.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, addr);
else
- page_dup_rmap(new);
+ page_dup_rmap(new, false);
} else if (PageAnon(new))
- page_add_anon_rmap(new, vma, addr);
+ page_add_anon_rmap(new, vma, addr, false);
else
page_add_file_rmap(new);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_vma_page(new);
+
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, ptep);
unlock:
struct buffer_head *head, enum migrate_mode mode,
int extra_count)
{
+ struct zone *oldzone, *newzone;
+ int dirty;
int expected_count = 1 + extra_count;
void **pslot;
/* Anonymous page without mapping */
if (page_count(page) != expected_count)
return -EAGAIN;
+
+ /* No turning back from here */
+ set_page_memcg(newpage, page_memcg(page));
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+ if (PageSwapBacked(page))
+ SetPageSwapBacked(newpage);
+
return MIGRATEPAGE_SUCCESS;
}
+ oldzone = page_zone(page);
+ newzone = page_zone(newpage);
+
spin_lock_irq(&mapping->tree_lock);
pslot = radix_tree_lookup_slot(&mapping->page_tree,
}
/*
- * Now we know that no one else is looking at the page.
+ * Now we know that no one else is looking at the page:
+ * no turning back from here.
*/
+ set_page_memcg(newpage, page_memcg(page));
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+ if (PageSwapBacked(page))
+ SetPageSwapBacked(newpage);
+
get_page(newpage); /* add cache reference */
if (PageSwapCache(page)) {
SetPageSwapCache(newpage);
set_page_private(newpage, page_private(page));
}
+ /* Move dirty while page refs frozen and newpage not yet exposed */
+ dirty = PageDirty(page);
+ if (dirty) {
+ ClearPageDirty(page);
+ SetPageDirty(newpage);
+ }
+
radix_tree_replace_slot(pslot, newpage);
/*
*/
page_unfreeze_refs(page, expected_count - 1);
+ spin_unlock(&mapping->tree_lock);
+ /* Leave irq disabled to prevent preemption while updating stats */
+
/*
* If moved to a different zone then also account
* the page for that zone. Other VM counters will be
* via NR_FILE_PAGES and NR_ANON_PAGES if they
* are mapped to swap space.
*/
- __dec_zone_page_state(page, NR_FILE_PAGES);
- __inc_zone_page_state(newpage, NR_FILE_PAGES);
- if (!PageSwapCache(page) && PageSwapBacked(page)) {
- __dec_zone_page_state(page, NR_SHMEM);
- __inc_zone_page_state(newpage, NR_SHMEM);
+ if (newzone != oldzone) {
+ __dec_zone_state(oldzone, NR_FILE_PAGES);
+ __inc_zone_state(newzone, NR_FILE_PAGES);
+ if (PageSwapBacked(page) && !PageSwapCache(page)) {
+ __dec_zone_state(oldzone, NR_SHMEM);
+ __inc_zone_state(newzone, NR_SHMEM);
+ }
+ if (dirty && mapping_cap_account_dirty(mapping)) {
+ __dec_zone_state(oldzone, NR_FILE_DIRTY);
+ __inc_zone_state(newzone, NR_FILE_DIRTY);
+ }
}
- spin_unlock_irq(&mapping->tree_lock);
+ local_irq_enable();
return MIGRATEPAGE_SUCCESS;
}
int expected_count;
void **pslot;
- if (!mapping) {
- if (page_count(page) != 1)
- return -EAGAIN;
- return MIGRATEPAGE_SUCCESS;
- }
-
spin_lock_irq(&mapping->tree_lock);
pslot = radix_tree_lookup_slot(&mapping->page_tree,
return -EAGAIN;
}
+ set_page_memcg(newpage, page_memcg(page));
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
get_page(newpage);
radix_tree_replace_slot(pslot, newpage);
if (PageMappedToDisk(page))
SetPageMappedToDisk(newpage);
- if (PageDirty(page)) {
- clear_page_dirty_for_io(page);
- /*
- * Want to mark the page and the radix tree as dirty, and
- * redo the accounting that clear_page_dirty_for_io undid,
- * but we can't use set_page_dirty because that function
- * is actually a signal that all of the page has become dirty.
- * Whereas only part of our page may be dirty.
- */
- if (PageSwapBacked(page))
- SetPageDirty(newpage);
- else
- __set_page_dirty_nobuffers(newpage);
- }
+ /* Move dirty on pages not done by migrate_page_move_mapping() */
+ if (PageDirty(page))
+ SetPageDirty(newpage);
if (page_is_young(page))
set_page_young(newpage);
cpupid = page_cpupid_xchg_last(page, -1);
page_cpupid_xchg_last(newpage, cpupid);
- mlock_migrate_page(newpage, page);
ksm_migrate_page(newpage, page);
/*
* Please do not reorder this without considering how mm/ksm.c's
* MIGRATEPAGE_SUCCESS - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
- int page_was_mapped, enum migrate_mode mode)
+ enum migrate_mode mode)
{
struct address_space *mapping;
int rc;
- /*
- * Block others from accessing the page when we get around to
- * establishing additional references. We are the only one
- * holding a reference to the new page at this point.
- */
- if (!trylock_page(newpage))
- BUG();
-
- /* Prepare mapping for the new page.*/
- newpage->index = page->index;
- newpage->mapping = page->mapping;
- if (PageSwapBacked(page))
- SetPageSwapBacked(newpage);
-
- /*
- * Indirectly called below, migrate_page_copy() copies PG_dirty and thus
- * needs newpage's memcg set to transfer memcg dirty page accounting.
- * So perform memcg migration in two steps:
- * 1. set newpage->mem_cgroup (here)
- * 2. clear page->mem_cgroup (below)
- */
- set_page_memcg(newpage, page_memcg(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
mapping = page_mapping(page);
if (!mapping)
* space which also has its own migratepage callback. This
* is the most common path for page migration.
*/
- rc = mapping->a_ops->migratepage(mapping,
- newpage, page, mode);
+ rc = mapping->a_ops->migratepage(mapping, newpage, page, mode);
else
rc = fallback_migrate_page(mapping, newpage, page, mode);
- if (rc != MIGRATEPAGE_SUCCESS) {
- set_page_memcg(newpage, NULL);
- newpage->mapping = NULL;
- } else {
+ /*
+ * When successful, old pagecache page->mapping must be cleared before
+ * page is freed; but stats require that PageAnon be left as PageAnon.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
set_page_memcg(page, NULL);
- if (page_was_mapped)
- remove_migration_ptes(page, newpage);
- page->mapping = NULL;
+ if (!PageAnon(page))
+ page->mapping = NULL;
}
-
- unlock_page(newpage);
-
return rc;
}
goto out_unlock;
wait_on_page_writeback(page);
}
+
/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
 * we cannot notice that anon_vma is freed while we migrate a page.
 * of migration. File cache pages are no problem because of page_lock():
 * file caches may use write_page() or lock_page() during migration, so
 * we only need to care about anon pages here.
+ *
+ * Only page_get_anon_vma() understands the subtleties of
+ * getting a hold on an anon_vma from outside one of its mms.
+ * But if we cannot get anon_vma, then we won't need it anyway,
+ * because that implies that the anon page is no longer mapped
+ * (and cannot be remapped so long as we hold the page lock).
*/
- if (PageAnon(page) && !PageKsm(page)) {
- /*
- * Only page_lock_anon_vma_read() understands the subtleties of
- * getting a hold on an anon_vma from outside one of its mms.
- */
+ if (PageAnon(page) && !PageKsm(page))
anon_vma = page_get_anon_vma(page);
- if (anon_vma) {
- /*
- * Anon page
- */
- } else if (PageSwapCache(page)) {
- /*
- * We cannot be sure that the anon_vma of an unmapped
- * swapcache page is safe to use because we don't
- * know in advance if the VMA that this page belonged
- * to still exists. If the VMA and others sharing the
- * data have been freed, then the anon_vma could
- * already be invalid.
- *
- * To avoid this possibility, swapcache pages get
- * migrated but are not remapped when migration
- * completes
- */
- } else {
- goto out_unlock;
- }
- }
+
+ /*
+ * Block others from accessing the new page when we get around to
+ * establishing additional references. We are usually the only one
+ * holding a reference to newpage at this point. We used to have a BUG
+ * here if trylock_page(newpage) fails, but would like to allow for
+ * cases where there might be a race with the previous use of newpage.
+ * This is much like races on refcount of oldpage: just don't BUG().
+ */
+ if (unlikely(!trylock_page(newpage)))
+ goto out_unlock;
if (unlikely(isolated_balloon_page(page))) {
/*
 * the page migration right away (protected by page lock).
*/
rc = balloon_page_migrate(newpage, page, mode);
- goto out_unlock;
+ goto out_unlock_both;
}
/*
VM_BUG_ON_PAGE(PageAnon(page), page);
if (page_has_private(page)) {
try_to_free_buffers(page);
- goto out_unlock;
+ goto out_unlock_both;
}
- goto skip_unmap;
- }
-
- /* Establish migration ptes or remove ptes */
- if (page_mapped(page)) {
+ } else if (page_mapped(page)) {
+ /* Establish migration ptes */
+ VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
+ page);
try_to_unmap(page,
TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
page_was_mapped = 1;
}
-skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, page_was_mapped, mode);
+ rc = move_to_new_page(newpage, page, mode);
- if (rc && page_was_mapped)
- remove_migration_ptes(page, page);
+ if (page_was_mapped)
+ remove_migration_ptes(page,
+ rc == MIGRATEPAGE_SUCCESS ? newpage : page);
+out_unlock_both:
+ unlock_page(newpage);
+out_unlock:
/* Drop an anon_vma reference if we took one */
if (anon_vma)
put_anon_vma(anon_vma);
-
-out_unlock:
unlock_page(page);
out:
return rc;
int force, enum migrate_mode mode,
enum migrate_reason reason)
{
- int rc = 0;
+ int rc = MIGRATEPAGE_SUCCESS;
int *result = NULL;
- struct page *newpage = get_new_page(page, private, &result);
+ struct page *newpage;
+ newpage = get_new_page(page, private, &result);
if (!newpage)
return -ENOMEM;
goto out;
}
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page)))
+ if (unlikely(PageTransHuge(page))) {
+ lock_page(page);
+ rc = split_huge_page(page);
+ unlock_page(page);
+ if (rc)
goto out;
+ }
rc = __unmap_and_move(page, newpage, force, mode);
+ if (rc == MIGRATEPAGE_SUCCESS)
+ put_new_page = NULL;
out:
if (rc != -EAGAIN) {
* it. Otherwise, putback_lru_page() will drop the reference grabbed
* during isolation.
*/
- if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
- ClearPageSwapBacked(newpage);
+ if (put_new_page)
put_new_page(newpage, private);
- } else if (unlikely(__is_movable_balloon_page(newpage))) {
+ else if (unlikely(__is_movable_balloon_page(newpage))) {
/* drop our reference, page already in the balloon */
put_page(newpage);
} else
struct page *hpage, int force,
enum migrate_mode mode)
{
- int rc = 0;
+ int rc = -EAGAIN;
int *result = NULL;
int page_was_mapped = 0;
struct page *new_hpage;
if (!new_hpage)
return -ENOMEM;
- rc = -EAGAIN;
-
if (!trylock_page(hpage)) {
if (!force || mode != MIGRATE_SYNC)
goto out;
if (PageAnon(hpage))
anon_vma = page_get_anon_vma(hpage);
+ if (unlikely(!trylock_page(new_hpage)))
+ goto put_anon;
+
if (page_mapped(hpage)) {
try_to_unmap(hpage,
TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
}
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
+ rc = move_to_new_page(new_hpage, hpage, mode);
+
+ if (page_was_mapped)
+ remove_migration_ptes(hpage,
+ rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage);
- if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
- remove_migration_ptes(hpage, hpage);
+ unlock_page(new_hpage);
+put_anon:
if (anon_vma)
put_anon_vma(anon_vma);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (rc == MIGRATEPAGE_SUCCESS) {
hugetlb_cgroup_migrate(hpage, new_hpage);
+ put_new_page = NULL;
+ }
unlock_page(hpage);
out:
* it. Otherwise, put_page() will drop the reference grabbed during
* isolation.
*/
- if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+ if (put_new_page)
put_new_page(new_hpage, private);
else
putback_active_hugepage(new_hpage);
*
 * The function returns after 10 attempts or if no pages are movable any more
 * because the list has become empty or no retryable pages exist.
- * The caller should call putback_lru_pages() to return pages to the LRU
+ * The caller should call putback_movable_pages() to return pages to the LRU
* or free list only if ret != 0.
*
* Returns the number of pages that were not migrated, or an error code.
}
}
}
- rc = nr_failed + retry;
+ nr_failed += retry;
+ rc = nr_failed;
out:
if (nr_succeeded)
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
(GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE | __GFP_NOMEMALLOC |
__GFP_NORETRY | __GFP_NOWARN) &
- ~GFP_IOFS, 0);
+ ~(__GFP_IO | __GFP_FS), 0);
return newpage;
}
goto out_dropref;
new_page = alloc_pages_node(node,
- (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
+ (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
HPAGE_PMD_ORDER);
if (!new_page)
goto out_fail;
+ prep_transhuge_page(new_page);
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
SetPageActive(page);
if (TestClearPageUnevictable(new_page))
SetPageUnevictable(page);
- mlock_migrate_page(page, new_page);
unlock_page(new_page);
put_page(new_page); /* Free it */
* guarantee the copy is visible before the pagetable update.
*/
flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start);
+ page_add_anon_rmap(new_page, vma, mmun_start, true);
pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
flush_tlb_range(vma, mmun_start, mmun_end);
flush_tlb_range(vma, mmun_start, mmun_end);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
- page_remove_rmap(new_page);
+ page_remove_rmap(new_page, true);
goto fail_putback;
}
- mem_cgroup_migrate(page, new_page, false);
-
- page_remove_rmap(page);
+ mlock_migrate_page(new_page, page);
+ set_page_memcg(new_page, page_memcg(page));
+ set_page_memcg(page, NULL);
+ page_remove_rmap(page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
memset(vec, 1, nr);
spin_unlock(ptl);
goto out;
/* This also avoids any overflows on PAGE_CACHE_ALIGN */
pages = len >> PAGE_SHIFT;
- pages += (len & ~PAGE_MASK) != 0;
+ pages += (offset_in_page(len)) != 0;
if (!access_ok(VERIFY_WRITE, vec, pages))
return -EFAULT;
/* Serialize with page migration */
BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
if (!TestSetPageMlocked(page)) {
mod_zone_page_state(page_zone(page), NR_MLOCK,
hpage_nr_pages(page));
/* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
/*
* Serialize with any parallel __split_huge_page_refcount() which
* might otherwise copy PageMlocked to part of the tail pages before
void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- vma->vm_flags &= ~VM_LOCKED;
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
while (start < end) {
struct page *page = NULL;
&page_mask);
if (page && !IS_ERR(page)) {
- if (PageTransHuge(page)) {
+ if (PageTransTail(page)) {
+ VM_BUG_ON_PAGE(PageMlocked(page), page);
+ put_page(page); /* follow_page_mask() */
+ } else if (PageTransHuge(page)) {
lock_page(page);
/*
* Any THP page found by follow_page_mask() may
goto next;
}
}
- /* It's a bug to munlock in the middle of a THP page */
- VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
page_increm = 1 + page_mask;
start += page_increm * PAGE_SIZE;
next:
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
- goto out; /* don't set VM_LOCKED, don't count */
+ /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
+ goto out;
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
return ret;
}
-static int do_mlock(unsigned long start, size_t len, int on)
+static int apply_vma_lock_flags(unsigned long start, size_t len,
+ vm_flags_t flags)
{
unsigned long nstart, end, tmp;
struct vm_area_struct * vma, * prev;
int error;
- VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(offset_in_page(start));
VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
if (end < start)
prev = vma;
for (nstart = start ; ; ) {
- vm_flags_t newflags;
+ vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
- /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
-
- newflags = vma->vm_flags & ~VM_LOCKED;
- if (on)
- newflags |= VM_LOCKED;
+ newflags |= flags;
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
return error;
}
-SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
unsigned long locked;
unsigned long lock_limit;
lru_add_drain_all(); /* flush pagevec */
- len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+ len = PAGE_ALIGN(len + (offset_in_page(start)));
start &= PAGE_MASK;
lock_limit = rlimit(RLIMIT_MEMLOCK);
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
- error = do_mlock(start, len, 1);
+ error = apply_vma_lock_flags(start, len, flags);
up_write(&current->mm->mmap_sem);
if (error)
return 0;
}
+SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+{
+ return do_mlock(start, len, VM_LOCKED);
+}
+
+SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
+{
+ vm_flags_t vm_flags = VM_LOCKED;
+
+ if (flags & ~MLOCK_ONFAULT)
+ return -EINVAL;
+
+ if (flags & MLOCK_ONFAULT)
+ vm_flags |= VM_LOCKONFAULT;
+
+ return do_mlock(start, len, vm_flags);
+}
+
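A userspace sketch of the new mlock2() syscall with MLOCK_ONFAULT. It assumes headers new enough to define SYS_mlock2, and falls back to the uapi value of MLOCK_ONFAULT if the libc headers lack it:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef MLOCK_ONFAULT
    #define MLOCK_ONFAULT 0x01      /* from the uapi headers added by this series */
    #endif

    int main(void)
    {
            size_t len = 16 * 4096;
            char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (buf == MAP_FAILED)
                    return 1;

            /* Pages are locked as they are faulted in, rather than being
             * populated and locked all up front as plain mlock() would do. */
            if (syscall(SYS_mlock2, buf, len, MLOCK_ONFAULT)) {
                    perror("mlock2");
                    return 1;
            }

            memset(buf, 0, len);    /* faults the pages in; they stay resident */
            return 0;
    }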
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
int ret;
- len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+ len = PAGE_ALIGN(len + (offset_in_page(start)));
start &= PAGE_MASK;
down_write(&current->mm->mmap_sem);
- ret = do_mlock(start, len, 0);
+ ret = apply_vma_lock_flags(start, len, 0);
up_write(&current->mm->mmap_sem);
return ret;
}
-static int do_mlockall(int flags)
+/*
+ * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
+ * and translate them into the appropriate modifications to mm->def_flags
+ * and/or the flags for all current VMAs.
+ *
+ * There are a couple of subtleties with this. If mlockall() is called multiple
+ * times with different flags, the values do not necessarily stack. If mlockall
+ * is called once including the MCL_FUTURE flag and then a second time without
+ * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
+ */
+static int apply_mlockall_flags(int flags)
{
struct vm_area_struct * vma, * prev = NULL;
+ vm_flags_t to_add = 0;
- if (flags & MCL_FUTURE)
+ current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
+ if (flags & MCL_FUTURE) {
current->mm->def_flags |= VM_LOCKED;
- else
- current->mm->def_flags &= ~VM_LOCKED;
- if (flags == MCL_FUTURE)
- goto out;
+
+ if (flags & MCL_ONFAULT)
+ current->mm->def_flags |= VM_LOCKONFAULT;
+
+ if (!(flags & MCL_CURRENT))
+ goto out;
+ }
+
+ if (flags & MCL_CURRENT) {
+ to_add |= VM_LOCKED;
+ if (flags & MCL_ONFAULT)
+ to_add |= VM_LOCKONFAULT;
+ }
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
vm_flags_t newflags;
- newflags = vma->vm_flags & ~VM_LOCKED;
- if (flags & MCL_CURRENT)
- newflags |= VM_LOCKED;
+ newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ newflags |= to_add;
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
SYSCALL_DEFINE1(mlockall, int, flags)
{
unsigned long lock_limit;
- int ret = -EINVAL;
+ int ret;
- if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
- goto out;
+ if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
+ return -EINVAL;
- ret = -EPERM;
if (!can_do_mlock())
- goto out;
+ return -EPERM;
if (flags & MCL_CURRENT)
lru_add_drain_all(); /* flush pagevec */
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
- ret = do_mlockall(flags);
+ ret = apply_mlockall_flags(flags);
up_write(&current->mm->mmap_sem);
if (!ret && (flags & MCL_CURRENT))
mm_populate(0, TASK_SIZE);
-out:
+
return ret;
}
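Correspondingly, mlockall() gains MCL_ONFAULT. A userspace sketch; the MCL_ONFAULT value of 4 matches the uapi headers on most architectures but is an assumption here:

    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MCL_ONFAULT
    #define MCL_ONFAULT 4   /* assumed value; check your asm/mman.h */
    #endif

    int main(void)
    {
            /* Lock everything currently mapped, but allow the pages to be
             * populated lazily at fault time rather than all at once. */
            if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
                    perror("mlockall");
                    return 1;
            }
            return 0;
    }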
int ret;
down_write(&current->mm->mmap_sem);
- ret = do_mlockall(0);
+ ret = apply_mlockall_flags(0);
up_write(&current->mm->mmap_sem);
return ret;
}
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
/* Do simple checking here so the lower-level routines won't have
unsigned long, fd, unsigned long, pgoff)
{
struct file *file = NULL;
- unsigned long retval = -EBADF;
+ unsigned long retval;
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
if (!file)
- goto out;
+ return -EBADF;
if (is_file_hugepages(file))
len = ALIGN(len, huge_page_size(hstate_file(file)));
retval = -EINVAL;
out_fput:
if (file)
fput(file);
-out:
return retval;
}
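The repeated "addr & ~PAGE_MASK" alignment tests throughout this series are replaced by the offset_in_page() helper, which reads as intent rather than bit-fiddling. Its definition in <linux/mm.h> is essentially:

    #define offset_in_page(p)   ((unsigned long)(p) & ~PAGE_MASK)

A nonzero result means the address is not page-aligned.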
if (copy_from_user(&a, arg, sizeof(a)))
return -EFAULT;
- if (a.offset & ~PAGE_MASK)
+ if (offset_in_page(a.offset))
return -EINVAL;
return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
}
/* Clear old maps */
- error = -ENOMEM;
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
&rb_parent)) {
if (do_munmap(mm, addr, len))
vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
else
- vma->vm_flags &= ~VM_LOCKED;
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
}
if (file)
* can happen with large stack limits and large mmap()
* allocations.
*/
- if (addr & ~PAGE_MASK) {
+ if (offset_in_page(addr)) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
if (addr > TASK_SIZE - len)
return -ENOMEM;
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return -EINVAL;
addr = arch_rebalance_pgtables(addr, len);
return vma;
rb_node = mm->mm_rb.rb_node;
- vma = NULL;
while (rb_node) {
struct vm_area_struct *tmp;
if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
- /* Ok, everything looks good - let it rip */
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
return 0;
}
*/
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
+ struct mm_struct *mm = vma->vm_mm;
int error;
if (!(vma->vm_flags & VM_GROWSUP))
* So, we reuse mm->page_table_lock to guard
* against concurrent vma expansions.
*/
- spin_lock(&vma->vm_mm->page_table_lock);
+ spin_lock(&mm->page_table_lock);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags,
+ vma->vm_file, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
anon_vma_interval_tree_post_update_vma(vma);
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- vma->vm_mm->highest_vm_end = address;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ mm->highest_vm_end = address;
+ spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
}
vma_unlock_anon_vma(vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
- validate_mm(vma->vm_mm);
+ validate_mm(mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
+ struct mm_struct *mm = vma->vm_mm;
int error;
/*
* So, we reuse mm->page_table_lock to guard
* against concurrent vma expansions.
*/
- spin_lock(&vma->vm_mm->page_table_lock);
+ spin_lock(&mm->page_table_lock);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags,
+ vma->vm_file, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
anon_vma_interval_tree_post_update_vma(vma);
vma_gap_update(vma);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
}
vma_unlock_anon_vma(vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
- validate_mm(vma->vm_mm);
+ validate_mm(mm);
return error;
}
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
- if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
+ if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
len = PAGE_ALIGN(len);
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (error & ~PAGE_MASK)
+ if (offset_in_page(error))
return error;
error = mlock_future_check(mm, mm->def_flags, len);
static struct vm_area_struct *__install_special_mapping(
struct mm_struct *mm,
unsigned long addr, unsigned long len,
- unsigned long vm_flags, const struct vm_operations_struct *ops,
- void *priv)
+ unsigned long vm_flags, void *priv,
+ const struct vm_operations_struct *ops)
{
int ret;
struct vm_area_struct *vma;
unsigned long addr, unsigned long len,
unsigned long vm_flags, const struct vm_special_mapping *spec)
{
- return __install_special_mapping(mm, addr, len, vm_flags,
- &special_mapping_vmops, (void *)spec);
+ return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
+ &special_mapping_vmops);
}
int install_special_mapping(struct mm_struct *mm,
unsigned long vm_flags, struct page **pages)
{
struct vm_area_struct *vma = __install_special_mapping(
- mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
- (void *)pages);
+ mm, addr, len, vm_flags, (void *)pages,
+ &legacy_special_mapping_vmops);
return PTR_ERR_OR_ZERO(vma);
}
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
if (!new_pmd)
break;
if (pmd_trans_huge(*old_pmd)) {
- int err = 0;
if (extent == HPAGE_PMD_SIZE) {
+ bool moved;
VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
vma);
/* See comment in move_ptes() */
if (need_rmap_locks)
anon_vma_lock_write(vma->anon_vma);
- err = move_huge_pmd(vma, new_vma, old_addr,
+ moved = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
if (need_rmap_locks)
anon_vma_unlock_write(vma->anon_vma);
+ if (moved) {
+ need_flush = true;
+ continue;
+ }
}
- if (err > 0) {
- need_flush = true;
- continue;
- } else if (!err) {
- split_huge_page_pmd(vma, old_addr, old_pmd);
- }
+ split_huge_pmd(vma, old_pmd, old_addr);
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
unsigned long charged = 0;
unsigned long map_flags;
- if (new_addr & ~PAGE_MASK)
+ if (offset_in_page(new_addr))
goto out;
if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
- if (ret & ~PAGE_MASK)
+ if (offset_in_page(ret))
goto out1;
ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
- if (!(ret & ~PAGE_MASK))
+ if (!(offset_in_page(ret)))
goto out;
out1:
vm_unacct_memory(charged);
if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
return ret;
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return ret;
old_len = PAGE_ALIGN(old_len);
vma->vm_pgoff +
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
- if (new_addr & ~PAGE_MASK) {
+ if (offset_in_page(new_addr)) {
ret = new_addr;
goto out;
}
ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
}
out:
- if (ret & ~PAGE_MASK) {
+ if (offset_in_page(ret)) {
vm_unacct_memory(charged);
locked = 0;
}
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
goto out;
- if (start & ~PAGE_MASK)
+ if (offset_in_page(start))
goto out;
if ((flags & MS_ASYNC) && (flags & MS_SYNC))
goto out;
return;
last = rb_entry(lastp, struct vm_region, vm_rb);
- BUG_ON(unlikely(last->vm_end <= last->vm_start));
- BUG_ON(unlikely(last->vm_top < last->vm_end));
+ BUG_ON(last->vm_end <= last->vm_start);
+ BUG_ON(last->vm_top < last->vm_end);
while ((p = rb_next(lastp))) {
region = rb_entry(p, struct vm_region, vm_rb);
last = rb_entry(lastp, struct vm_region, vm_rb);
- BUG_ON(unlikely(region->vm_end <= region->vm_start));
- BUG_ON(unlikely(region->vm_top < region->vm_end));
- BUG_ON(unlikely(region->vm_start < last->vm_top));
+ BUG_ON(region->vm_end <= region->vm_start);
+ BUG_ON(region->vm_top < region->vm_end);
+ BUG_ON(region->vm_start < last->vm_top);
lastp = p;
}
if (copy_from_user(&a, arg, sizeof(a)))
return -EFAULT;
- if (a.offset & ~PAGE_MASK)
+ if (offset_in_page(a.offset))
return -EINVAL;
return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
goto erase_whole_vma;
if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
- if (start & ~PAGE_MASK)
+ if (offset_in_page(start))
return -EINVAL;
- if (end != vma->vm_end && end & ~PAGE_MASK)
+ if (end != vma->vm_end && offset_in_page(end))
return -EINVAL;
if (start != vma->vm_start && end != vma->vm_end) {
ret = split_vma(mm, vma, start, 1);
if (old_len == 0 || new_len == 0)
return (unsigned long) -EINVAL;
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return -EINVAL;
if (flags & MREMAP_FIXED && new_addr != addr)
return t;
}
+/*
+ * order == -1 means the oom kill was triggered by sysrq; any other order
+ * is used only for display purposes.
+ */
+static inline bool is_sysrq_oom(struct oom_control *oc)
+{
+ return oc->order == -1;
+}
+
/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask)
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (oc->order != -1)
+ if (!is_sysrq_oom(oc))
return OOM_SCAN_ABORT;
}
if (!task->mm)
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && oc->order != -1)
+ if (task_will_free_mem(task) && !is_sysrq_oom(oc))
return OOM_SCAN_ABORT;
return OOM_SCAN_OK;
static void dump_header(struct oom_control *oc, struct task_struct *p,
struct mem_cgroup *memcg)
{
- task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
"oom_score_adj=%hd\n",
current->comm, oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
- cpuset_print_task_mems_allowed(current);
- task_unlock(current);
+ cpuset_print_current_mems_allowed();
dump_stack();
if (memcg)
mem_cgroup_print_oom_info(memcg, p);
oom_killer_disabled = false;
}
+/*
+ * task->mm can be NULL if the task is the exited group leader. So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ struct mm_struct *t_mm = READ_ONCE(t->mm);
+ if (t_mm)
+ return t_mm == mm;
+ }
+ return false;
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
if (__ratelimit(&oom_rs))
dump_header(oc, p, memcg);
- task_lock(p);
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
- task_unlock(p);
/*
* If any of p's children has a different mm and is eligible for kill,
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
- if (child->mm == p->mm)
+ if (process_shares_mm(child, p->mm))
continue;
/*
* oom_badness() returns 0 if the thread is unkillable
victim = p;
}
- /* mm cannot safely be dereferenced after task_unlock(victim) */
+ /* Get a reference to safely compare mm after task_unlock(victim) */
mm = victim->mm;
+ atomic_inc(&mm->mm_count);
+ /*
+ * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
+ * the OOM victim from depleting the memory reserves from the user
+ * space under its control.
+ */
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
mark_oom_victim(victim);
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
* pending fatal signal.
*/
rcu_read_lock();
- for_each_process(p)
- if (p->mm == mm && !same_thread_group(p, victim) &&
- !(p->flags & PF_KTHREAD)) {
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
- continue;
+ for_each_process(p) {
+ if (!process_shares_mm(p, mm))
+ continue;
+ if (same_thread_group(p, victim))
+ continue;
+ if (unlikely(p->flags & PF_KTHREAD))
+ continue;
+ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ continue;
- task_lock(p); /* Protect ->comm from prctl() */
- pr_err("Kill process %d (%s) sharing same memory\n",
- task_pid_nr(p), p->comm);
- task_unlock(p);
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
- }
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+ }
rcu_read_unlock();
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+ mmdrop(mm);
put_task_struct(victim);
}
#undef K
return;
}
/* Do not panic for oom kills triggered by sysrq */
- if (oc->order == -1)
+ if (is_sysrq_oom(oc))
return;
dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p && oc->order != -1) {
+ if (!p && !is_sysrq_oom(oc)) {
dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
WARN_ON(!mutex_is_locked(&pm_mutex));
WARN_ON(saved_gfp_mask);
saved_gfp_mask = gfp_allowed_mask;
- gfp_allowed_mask &= ~GFP_IOFS;
+ gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}
bool pm_suspended_storage(void)
{
- if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+ if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
return false;
return true;
}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
-int pageblock_order __read_mostly;
+unsigned int pageblock_order __read_mostly;
#endif
static void __free_pages_ok(struct page *page, unsigned int order);
#endif
};
+compound_page_dtor * const compound_page_dtors[] = {
+ NULL,
+ free_compound_page,
+#ifdef CONFIG_HUGETLB_PAGE
+ free_huge_page,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ free_transhuge_page,
+#endif
+};
+
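Destructors are now dispatched through this table by index rather than by storing a function pointer in the page itself. A sketch of the accessors that pair with it (close to, but not verbatim, the helpers added in <linux/mm.h>):

    static inline void set_compound_page_dtor(struct page *page,
                    enum compound_dtor_id compound_dtor)
    {
            page[1].compound_dtor = compound_dtor;
    }

    static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
    {
            return compound_page_dtors[page[1].compound_dtor];
    }

Storing a small index instead of a raw function pointer also makes the field harder to abuse if a struct page is ever corrupted.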
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
- * The first PAGE_SIZE page is called the "head page".
+ * The first PAGE_SIZE page is called the "head page" and has PG_head set.
*
- * The remaining PAGE_SIZE pages are called "tail pages".
+ * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
+ * in bit 0 of page->compound_head. The remaining bits are a pointer to the
+ * head page.
*
- * All pages have PG_compound set. All tail pages have their ->first_page
- * pointing at the head page.
+ * The first tail page's ->compound_dtor holds the offset in array of compound
+ * page destructors. See compound_page_dtors.
*
- * The first tail page's ->lru.next holds the address of the compound page's
- * put_page() function. Its ->lru.prev holds the order of allocation.
+ * The first tail page's ->compound_order holds the order of allocation.
* This usage means that zero-order pages may not be compound.
*/
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
{
__free_pages_ok(page, compound_order(page));
}
-void prep_compound_page(struct page *page, unsigned long order)
+void prep_compound_page(struct page *page, unsigned int order)
{
int i;
int nr_pages = 1 << order;
- set_compound_page_dtor(page, free_compound_page);
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
set_compound_order(page, order);
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
- p->first_page = page;
- /* Make sure p->first_page is always valid for PageTail() */
- smp_wmb();
- __SetPageTail(p);
+ p->mapping = TAIL_MAPPING;
+ set_compound_head(p, page);
}
+ atomic_set(compound_mapcount_ptr(page), -1);
}
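The bit-0 encoding that prep_compound_page() now sets up is decoded by the compound_head()/PageTail() helpers. A sketch of the scheme (struct page pointers are at least word-aligned, so bit 0 is free to act as the tail marker):

    static inline void set_compound_head(struct page *page, struct page *head)
    {
            WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
    }

    static inline int PageTail(struct page *page)
    {
            return READ_ONCE(page->compound_head) & 1;
    }

    static inline struct page *compound_head(struct page *page)
    {
            unsigned long head = READ_ONCE(page->compound_head);

            if (head & 1)
                    return (struct page *)(head - 1);
            return page;
    }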
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned long combined_idx;
unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
- int max_order = MAX_ORDER;
+ unsigned int max_order = MAX_ORDER;
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
* pageblock. Without this, pageblock isolation
* could cause incorrect freepage accounting.
*/
- max_order = min(MAX_ORDER, pageblock_order + 1);
+ max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
} else {
__mod_zone_freepage_state(zone, 1 << order, migratetype);
}
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
- if (unlikely(page_mapcount(page)))
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
if (unlikely(has_isolate_pageblock(zone)))
mt = get_pageblock_migratetype(page);
- /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
} while (--to_free && --batch_free && !list_empty(list));
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
- if (!IS_ENABLED(CONFIG_DEBUG_VM))
- return 0;
+ int ret = 1;
+
+ /*
+ * We rely on page->lru.next never having bit 0 set, unless the page
+ * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+ */
+ BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+ if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+ ret = 0;
+ goto out;
+ }
+ switch (page - head_page) {
+ case 1:
+ /* the first tail page: ->mapping is compound_mapcount() */
+ if (unlikely(compound_mapcount(page))) {
+ bad_page(page, "nonzero compound_mapcount", 0);
+ goto out;
+ }
+ break;
+ case 2:
+ /*
+ * the second tail page: ->mapping is
+ * page_deferred_list().next -- ignore value.
+ */
+ break;
+ default:
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ goto out;
+ }
+ break;
+ }
if (unlikely(!PageTail(page))) {
bad_page(page, "PageTail not set", 0);
- return 1;
+ goto out;
}
- if (unlikely(page->first_page != head_page)) {
- bad_page(page, "first_page not consistent", 0);
- return 1;
+ if (unlikely(compound_head(page) != head_page)) {
+ bad_page(page, "compound_head not consistent", 0);
+ goto out;
}
- return 0;
+ ret = 0;
+out:
+ page->mapping = NULL;
+ clear_compound_head(page);
+ return ret;
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
struct page *page = pfn_to_page(start_pfn);
init_reserved_page(start_pfn);
+
+ /* Avoid false-positive PageTail() */
+ INIT_LIST_HEAD(&page->lru);
+
SetPageReserved(page);
}
}
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
- if (unlikely(page_mapcount(page)))
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
* the free lists for the desirable migrate type are depleted
*/
static int fallbacks[MIGRATE_TYPES][4] = {
- [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
- [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
+ [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
- [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
- [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
+ [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
#endif
};
int migratetype)
{
struct page *page;
- unsigned long order;
+ unsigned int order;
int pages_moved = 0;
#ifndef CONFIG_HOLES_IN_ZONE
static void steal_suitable_fallback(struct zone *zone, struct page *page,
int start_type)
{
- int current_order = page_order(page);
+ unsigned int current_order = page_order(page);
int pages;
/* Take ownership for orders >= pageblock_order */
*can_steal = false;
for (i = 0;; i++) {
fallback_mt = fallbacks[migratetype][i];
- if (fallback_mt == MIGRATE_RESERVE)
+ if (fallback_mt == MIGRATE_TYPES)
break;
if (list_empty(&area->free_list[fallback_mt]))
return -1;
}
+/*
+ * Reserve a pageblock for exclusive use of high-order atomic allocations if
+ * there are no empty page blocks that contain a page with a suitable order
+ */
+static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
+ unsigned int alloc_order)
+{
+ int mt;
+ unsigned long max_managed, flags;
+
+ /*
+ * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
+ * Check is race-prone but harmless.
+ */
+ max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+ if (zone->nr_reserved_highatomic >= max_managed)
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /* Recheck the nr_reserved_highatomic limit under the lock */
+ if (zone->nr_reserved_highatomic >= max_managed)
+ goto out_unlock;
+
+ /* Yoink! */
+ mt = get_pageblock_migratetype(page);
+ if (mt != MIGRATE_HIGHATOMIC &&
+ !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+ zone->nr_reserved_highatomic += pageblock_nr_pages;
+ set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
+ move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
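
The reservation cap above works out to one pageblock plus roughly 1% of the
zone's managed pages, so even a small zone keeps at least one block available
for highatomic use. A quick userspace check of that arithmetic (the pageblock
size and zone sizes are illustrative):

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* order-9 pageblocks with 4K pages */

/* Mirrors: max_managed = (zone->managed_pages / 100) + pageblock_nr_pages */
static unsigned long max_highatomic(unsigned long managed_pages)
{
	return managed_pages / 100 + PAGEBLOCK_NR_PAGES;
}

int main(void)
{
	unsigned long zones[] = { 4096UL, 262144UL, 33554432UL };

	for (int i = 0; i < 3; i++)
		printf("managed=%lu pages -> highatomic cap=%lu pages\n",
		       zones[i], max_highatomic(zones[i]));
	return 0;
}
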
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure, but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ */
+static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
+{
+ struct zonelist *zonelist = ac->zonelist;
+ unsigned long flags;
+ struct zoneref *z;
+ struct zone *zone;
+ struct page *page;
+ int order;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+ ac->nodemask) {
+ /* Preserve at least one pageblock */
+ if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ struct free_area *area = &(zone->free_area[order]);
+
+ if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+ continue;
+
+ page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next,
+ struct page, lru);
+
+ /*
+			 * It should never happen, but changes to locking could
+			 * inadvertently allow a per-cpu drain to add pages
+			 * to MIGRATE_HIGHATOMIC while unreserving, so be safe
+ * and watch for underflows.
+ */
+ zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
+ zone->nr_reserved_highatomic);
+
+ /*
+ * Convert to ac->migratetype and avoid the normal
+			 * pageblock stealing heuristics. At a minimum, the caller
+ * is doing the work and needs the pages. More
+ * importantly, if the block was always converted to
+ * MIGRATE_UNMOVABLE or another type then the number
+ * of pageblocks that cannot be completely freed
+ * may increase.
+ */
+ set_pageblock_migratetype(page, ac->migratetype);
+ move_freepages_block(zone, page, ac->migratetype);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+}
+
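The min() in the unreserve path exists purely as an underflow guard: if a
racing per-cpu drain left nr_reserved_highatomic below a full pageblock, the
count clamps to zero instead of wrapping. A tiny sketch of just that clamp
(constants are illustrative):

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL

/* Mirrors the underflow-safe decrement in unreserve_highatomic_pageblock(). */
static unsigned long unreserve_one_block(unsigned long nr_reserved)
{
	unsigned long dec = nr_reserved < PAGEBLOCK_NR_PAGES ?
			    nr_reserved : PAGEBLOCK_NR_PAGES;
	return nr_reserved - dec;
}

int main(void)
{
	printf("%lu\n", unreserve_one_block(1536));	/* -> 1024 */
	printf("%lu\n", unreserve_one_block(100));	/* -> 0, no wraparound */
	return 0;
}
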
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
* Call me with the zone->lock already held.
*/
static struct page *__rmqueue(struct zone *zone, unsigned int order,
- int migratetype)
+ int migratetype, gfp_t gfp_flags)
{
struct page *page;
-retry_reserve:
page = __rmqueue_smallest(zone, order, migratetype);
-
- if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
+ if (unlikely(!page)) {
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
if (!page)
page = __rmqueue_fallback(zone, order, migratetype);
-
- /*
- * Use MIGRATE_RESERVE rather than fail an allocation. goto
- * is used because __rmqueue_smallest is an inline function
- * and we want just one call site
- */
- if (!page) {
- migratetype = MIGRATE_RESERVE;
- goto retry_reserve;
- }
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- struct page *page = __rmqueue(zone, order, migratetype);
+ struct page *page = __rmqueue(zone, order, migratetype, 0);
if (unlikely(page == NULL))
break;
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype)
+ gfp_t gfp_flags, int alloc_flags, int migratetype)
{
unsigned long flags;
struct page *page;
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order, migratetype);
+
+ page = NULL;
+ if (alloc_flags & ALLOC_HARDER) {
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (page)
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
+ }
+ if (!page)
+ page = __rmqueue(zone, order, migratetype, gfp_flags);
spin_unlock(&zone->lock);
if (!page)
goto failed;
struct fault_attr attr;
bool ignore_gfp_highmem;
- bool ignore_gfp_wait;
+ bool ignore_gfp_reclaim;
u32 min_order;
} fail_page_alloc = {
.attr = FAULT_ATTR_INITIALIZER,
- .ignore_gfp_wait = true,
+ .ignore_gfp_reclaim = true,
.ignore_gfp_highmem = true,
.min_order = 1,
};
return false;
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
return false;
- if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+ if (fail_page_alloc.ignore_gfp_reclaim &&
+ (gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
return should_fail(&fail_page_alloc.attr, 1 << order);
return PTR_ERR(dir);
if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_wait))
+ &fail_page_alloc.ignore_gfp_reclaim))
goto fail;
if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
&fail_page_alloc.ignore_gfp_highmem))
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return true if free pages are above 'mark'. This takes into account the order
- * of the allocation.
+ * Return true if free base pages are above 'mark'. For high-order checks it
+ * will return true if the order-0 watermark is reached and there is at least
+ * one free page of a suitable size. Checking here avoids taking the zone lock
+ * later in the allocation paths when no suitable pages are free.
*/
static bool __zone_watermark_ok(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx, int alloc_flags,
long free_pages)
{
- /* free_pages may go negative - that's OK */
long min = mark;
int o;
- long free_cma = 0;
+ const int alloc_harder = (alloc_flags & ALLOC_HARDER);
+ /* free_pages may go negative - that's OK */
free_pages -= (1 << order) - 1;
+
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
- if (alloc_flags & ALLOC_HARDER)
+
+ /*
+ * If the caller does not have rights to ALLOC_HARDER then subtract
+ * the high-atomic reserves. This will over-estimate the size of the
+ * atomic reserve but it avoids a search.
+ */
+ if (likely(!alloc_harder))
+ free_pages -= z->nr_reserved_highatomic;
+ else
min -= min / 4;
+
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
- free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+ free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
- if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
+ /*
+ * Check watermarks for an order-0 allocation request. If these
+ * are not met, then a high-order request also cannot go ahead
+ * even if a suitable page happened to be free.
+ */
+ if (free_pages <= min + z->lowmem_reserve[classzone_idx])
return false;
- for (o = 0; o < order; o++) {
- /* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;
- /* Require fewer higher order pages to be free */
- min >>= 1;
+ /* If this is an order-0 request then the watermark is fine */
+ if (!order)
+ return true;
+
+ /* For a high-order request, check at least one suitable page is free */
+ for (o = order; o < MAX_ORDER; o++) {
+ struct free_area *area = &z->free_area[o];
+ int mt;
- if (free_pages <= min)
- return false;
+ if (!area->nr_free)
+ continue;
+
+ if (alloc_harder)
+ return true;
+
+ for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
+ if (!list_empty(&area->free_list[mt]))
+ return true;
+ }
+
+#ifdef CONFIG_CMA
+ if ((alloc_flags & ALLOC_CMA) &&
+ !list_empty(&area->free_list[MIGRATE_CMA])) {
+ return true;
+ }
+#endif
}
- return true;
+ return false;
}
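
The rework splits the check into two stages: the order-0 watermark is always
enforced, and high-order requests additionally need one free page of a
suitable size. A userspace sketch of that shape (watermark adjustments, the
highatomic reserves, and the per-migratetype scan are omitted for brevity):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

static bool watermark_ok(long free_pages, long min, unsigned int order,
			 const unsigned long nr_free[MAX_ORDER])
{
	/* free_pages may go negative - that's OK */
	free_pages -= (1 << order) - 1;
	if (free_pages <= min)
		return false;
	if (!order)
		return true;
	/* High-order: require at least one suitably sized free page. */
	for (unsigned int o = order; o < MAX_ORDER; o++)
		if (nr_free[o])
			return true;
	return false;
}

int main(void)
{
	unsigned long nr_free[MAX_ORDER] = { [0] = 1000, [3] = 2 };

	printf("order-0: %d\n", watermark_ok(1008, 128, 0, nr_free)); /* 1 */
	printf("order-3: %d\n", watermark_ok(1008, 128, 3, nr_free)); /* 1 */
	printf("order-5: %d\n", watermark_ok(1008, 128, 5, nr_free)); /* 0 */
	return 0;
}
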
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, int alloc_flags)
+ unsigned long mark, int classzone_idx)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
free_pages);
}
#ifdef CONFIG_NUMA
-/*
- * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
- * skip over zones that are not allowed by the cpuset, or that have
- * been recently (in last second) found to be nearly full. See further
- * comments in mmzone.h. Reduces cache footprint of zonelist scans
- * that have to skip over a lot of full or unallowed zones.
- *
- * If the zonelist cache is present in the passed zonelist, then
- * returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_states[N_MEMORY].)
- *
- * If the zonelist cache is not available for this zonelist, does
- * nothing and returns NULL.
- *
- * If the fullzones BITMAP in the zonelist cache is stale (more than
- * a second since last zap'd) then we zap it out (clear its bits.)
- *
- * We hold off even calling zlc_setup, until after we've checked the
- * first zone in the zonelist, on the theory that most allocations will
- * be satisfied from that first zone, so best to examine that zone as
- * quickly as we can.
- */
-static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
-{
- struct zonelist_cache *zlc; /* cached zonelist speedup info */
- nodemask_t *allowednodes; /* zonelist_cache approximation */
-
- zlc = zonelist->zlcache_ptr;
- if (!zlc)
- return NULL;
-
- if (time_after(jiffies, zlc->last_full_zap + HZ)) {
- bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
- zlc->last_full_zap = jiffies;
- }
-
- allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
- &cpuset_current_mems_allowed :
- &node_states[N_MEMORY];
- return allowednodes;
-}
-
-/*
- * Given 'z' scanning a zonelist, run a couple of quick checks to see
- * if it is worth looking at further for free memory:
- * 1) Check that the zone isn't thought to be full (doesn't have its
- * bit set in the zonelist_cache fullzones BITMAP).
- * 2) Check that the zones node (obtained from the zonelist_cache
- * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
- * Return true (non-zero) if zone is worth looking at further, or
- * else return false (zero) if it is not.
- *
- * This check -ignores- the distinction between various watermarks,
- * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
- * found to be full for any variation of these watermarks, it will
- * be considered full for up to one second by all requests, unless
- * we are so low on memory on all allowed nodes that we are forced
- * into the second scan of the zonelist.
- *
- * In the second scan we ignore this zonelist cache and exactly
- * apply the watermarks to all zones, even it is slower to do so.
- * We are low on memory in the second scan, and should leave no stone
- * unturned looking for a free page.
- */
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
- nodemask_t *allowednodes)
-{
- struct zonelist_cache *zlc; /* cached zonelist speedup info */
- int i; /* index of *z in zonelist zones */
- int n; /* node that zone *z is on */
-
- zlc = zonelist->zlcache_ptr;
- if (!zlc)
- return 1;
-
- i = z - zonelist->_zonerefs;
- n = zlc->z_to_n[i];
-
- /* This zone is worth trying if it is allowed but not full */
- return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
-}
-
-/*
- * Given 'z' scanning a zonelist, set the corresponding bit in
- * zlc->fullzones, so that subsequent attempts to allocate a page
- * from that zone don't waste time re-examining it.
- */
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
-{
- struct zonelist_cache *zlc; /* cached zonelist speedup info */
- int i; /* index of *z in zonelist zones */
-
- zlc = zonelist->zlcache_ptr;
- if (!zlc)
- return;
-
- i = z - zonelist->_zonerefs;
-
- set_bit(i, zlc->fullzones);
-}
-
-/*
- * clear all zones full, called after direct reclaim makes progress so that
- * a zone that was recently full is not skipped over for up to a second
- */
-static void zlc_clear_zones_full(struct zonelist *zonelist)
-{
- struct zonelist_cache *zlc; /* cached zonelist speedup info */
-
- zlc = zonelist->zlcache_ptr;
- if (!zlc)
- return;
-
- bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-}
-
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
return local_zone->node == zone->node;
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
RECLAIM_DISTANCE;
}
-
#else /* CONFIG_NUMA */
-
-static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
-{
- return NULL;
-}
-
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
- nodemask_t *allowednodes)
-{
- return 1;
-}
-
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
-{
-}
-
-static void zlc_clear_zones_full(struct zonelist *zonelist)
-{
-}
-
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
return true;
{
return true;
}
-
#endif /* CONFIG_NUMA */
static void reset_alloc_batches(struct zone *preferred_zone)
struct zoneref *z;
struct page *page = NULL;
struct zone *zone;
- nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
- int zlc_active = 0; /* set if using zonelist_cache */
- int did_zlc_setup = 0; /* just call zlc_setup() one time */
- bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
- (gfp_mask & __GFP_WRITE);
int nr_fair_skipped = 0;
bool zonelist_rescan;
ac->nodemask) {
unsigned long mark;
- if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
- !zlc_zone_worth_trying(zonelist, z, allowednodes))
- continue;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed(zone, gfp_mask))
*
* XXX: For now, allow allocations to potentially
* exceed the per-zone dirty limit in the slowpath
- * (ALLOC_WMARK_LOW unset) before going into reclaim,
+ * (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* zones are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
- if (consider_zone_dirty && !zone_dirty_ok(zone))
+ if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
continue;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (IS_ENABLED(CONFIG_NUMA) &&
- !did_zlc_setup && nr_online_nodes > 1) {
- /*
- * we do zlc_setup if there are multiple nodes
- * and before considering the first zone allowed
- * by the cpuset.
- */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
-
if (zone_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zone, zone))
- goto this_zone_full;
-
- /*
- * As we may have just activated ZLC, check if the first
- * eligible zone has failed zone_reclaim recently.
- */
- if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
- !zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
ret = zone_reclaim(zone, gfp_mask, order);
ac->classzone_idx, alloc_flags))
goto try_this_zone;
- /*
- * Failed to reclaim enough to meet watermark.
- * Only mark the zone full if checking the min
- * watermark or if we failed to reclaim just
- * 1<<order pages or else the page allocator
- * fastpath will prematurely mark zones full
- * when the watermark is between the low and
- * min watermarks.
- */
- if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
- ret == ZONE_RECLAIM_SOME)
- goto this_zone_full;
-
continue;
}
}
try_this_zone:
page = buffered_rmqueue(ac->preferred_zone, zone, order,
- gfp_mask, ac->migratetype);
+ gfp_mask, alloc_flags, ac->migratetype);
if (page) {
if (prep_new_page(page, order, gfp_mask, alloc_flags))
goto try_this_zone;
+
+ /*
+ * If this is a high-order atomic allocation then check
+ * if the pageblock should be reserved for the future
+ */
+ if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
+ reserve_highatomic_pageblock(page, zone, order);
+
return page;
}
-this_zone_full:
- if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
- zlc_mark_zone_full(zonelist, z);
}
/*
zonelist_rescan = true;
}
- if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
- /* Disable zlc cache for second zonelist scan */
- zlc_active = 0;
- zonelist_rescan = true;
- }
-
if (zonelist_rescan)
goto zonelist_scan;
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
-void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
if (test_thread_flag(TIF_MEMDIE) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
- if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+ if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
if (fmt) {
va_end(args);
}
- pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+ pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
current->comm, order, gfp_mask);
dump_stack();
if (unlikely(!(*did_some_progress)))
return NULL;
- /* After successful reclaim, reconsider all zones for allocation */
- if (IS_ENABLED(CONFIG_NUMA))
- zlc_clear_zones_full(ac->zonelist);
-
retry:
page = get_page_from_freelist(gfp_mask, order,
alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
/*
* If an allocation failed after direct reclaim, it could be because
- * pages are pinned on the per-cpu lists. Drain them and try again
+	 * pages are pinned on the per-cpu lists or in the high-order atomic reserves.
+	 * Shrink them and try again.
*/
if (!page && !drained) {
+ unreserve_highatomic_pageblock(ac);
drain_all_pages(NULL);
drained = true;
goto retry;
gfp_to_alloc_flags(gfp_t gfp_mask)
{
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
- const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
- if (atomic) {
+ if (gfp_mask & __GFP_ATOMIC) {
/*
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
}
+static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
+{
+ return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+}
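
is_thp_gfp_mask() keys on __GFP_KSWAPD_RECLAIM being clear on top of the
GFP_TRANSHUGE bits. A standalone illustration with made-up bit values (the
real masks are in gfp.h; in this sketch GFP_TRANSHUGE deliberately excludes
the kswapd bit, matching the predicate's intent):

#include <stdbool.h>
#include <stdio.h>

#define __GFP_DIRECT_RECLAIM	0x1u
#define __GFP_KSWAPD_RECLAIM	0x2u
#define __GFP_IO		0x4u
#define GFP_TRANSHUGE		(__GFP_DIRECT_RECLAIM | __GFP_IO)

static bool is_thp_gfp_mask(unsigned int gfp_mask)
{
	/* All THP bits set, and kswapd reclaim not requested. */
	return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) ==
	       GFP_TRANSHUGE;
}

int main(void)
{
	printf("%d\n", is_thp_gfp_mask(GFP_TRANSHUGE));				/* 1 */
	printf("%d\n", is_thp_gfp_mask(GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM));	/* 0 */
	printf("%d\n", is_thp_gfp_mask(__GFP_DIRECT_RECLAIM));			/* 0 */
	return 0;
}
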
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
- const gfp_t wait = gfp_mask & __GFP_WAIT;
+ bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
return NULL;
}
+ /*
+	 * We also sanity-check to catch callers that use the atomic reserves
+	 * while not actually being in atomic context.
+ */
+ if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
+ (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+ gfp_mask &= ~__GFP_ATOMIC;
+
/*
* If this allocation cannot block and it is for a specific node, then
* fail early. There's no need to wakeup kswapd or retry for a
* speculative node-specific allocation.
*/
- if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
+ if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
goto nopage;
retry:
- if (!(gfp_mask & __GFP_NO_KSWAPD))
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
/*
}
}
- /* Atomic allocations - we can't balance anything */
- if (!wait) {
+ /* Caller is not willing to reclaim, we can't balance anything */
+ if (!can_direct_reclaim) {
/*
* All existing users of the deprecated __GFP_NOFAIL are
* blockable, so warn of any new users that actually allow this
goto got_pg;
/* Checks for THP-specific high-order allocations */
- if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+ if (is_thp_gfp_mask(gfp_mask)) {
/*
* If compaction is deferred for high-order allocations, it is
* because sync compaction recently failed. If this is the case
* fault, so use asynchronous memory compaction for THP unless it is
* khugepaged trying to collapse.
*/
- if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
- (current->flags & PF_KTHREAD))
+ if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
migration_mode = MIGRATE_SYNC_LIGHT;
/* Try direct reclaim and then allocating */
lockdep_trace_alloc(gfp_mask);
- might_sleep_if(gfp_mask & __GFP_WAIT);
+ might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
if (should_fail_alloc_page(gfp_mask, order))
return NULL;
/* We set it here, as __alloc_pages_slowpath might have changed it */
ac.zonelist = zonelist;
+
+ /* Dirty zone balancing only done in the fast path */
+ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+
/* The preferred zone is used for statistics later */
preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
ac.nodemask ? : &cpuset_current_mems_allowed,
* complete.
*/
alloc_mask = memalloc_noio_flags(gfp_mask);
+ ac.spread_dirty_pages = false;
page = __alloc_pages_slowpath(alloc_mask, order, &ac);
}
struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *page;
- struct mem_cgroup *memcg = NULL;
- if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
- return NULL;
page = alloc_pages(gfp_mask, order);
- memcg_kmem_commit_charge(page, memcg, order);
+ if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+ __free_pages(page, order);
+ page = NULL;
+ }
return page;
}
struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
struct page *page;
- struct mem_cgroup *memcg = NULL;
- if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
- return NULL;
page = alloc_pages_node(nid, gfp_mask, order);
- memcg_kmem_commit_charge(page, memcg, order);
+ if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+ __free_pages(page, order);
+ page = NULL;
+ }
return page;
}
*/
void __free_kmem_pages(struct page *page, unsigned int order)
{
- memcg_kmem_uncharge_pages(page, order);
+ memcg_kmem_uncharge(page, order);
__free_pages(page, order);
}
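
Both kmem page helpers now follow the same charge-after-alloc shape: allocate
first, try to charge the memcg, and unwind the allocation if charging fails.
A generic userspace sketch of the pattern (charge() is a hypothetical
stand-in for memcg_kmem_charge(), not a kernel API):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical accounting hook; pretend charges above 4K fail. */
static int charge(void *p, size_t size)
{
	(void)p;
	return size > 4096 ? -1 : 0;
}

/* The object exists before it is accounted, so the only unwind
 * path on charge failure is freeing it again. */
static void *alloc_charged(size_t size)
{
	void *p = malloc(size);

	if (p && charge(p, size) != 0) {
		free(p);
		p = NULL;
	}
	return p;
}

int main(void)
{
	void *small = alloc_charged(64);
	void *large = alloc_charged(8192);

	printf("small: %s, large: %s\n",
	       small ? "ok" : "fail", large ? "ok" : "fail");
	free(small);
	free(large);
	return 0;
}
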
}
}
-static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+static void *make_alloc_exact(unsigned long addr, unsigned int order,
+ size_t size)
{
if (addr) {
unsigned long alloc_end = addr + (PAGE_SIZE << order);
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
- unsigned order = get_order(size);
+ unsigned int order = get_order(size);
struct page *p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
return NULL;
[MIGRATE_UNMOVABLE] = 'U',
[MIGRATE_RECLAIMABLE] = 'E',
[MIGRATE_MOVABLE] = 'M',
- [MIGRATE_RESERVE] = 'R',
#ifdef CONFIG_CMA
[MIGRATE_CMA] = 'C',
#endif
}
for_each_populated_zone(zone) {
- unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ unsigned int order;
+ unsigned long nr[MAX_ORDER], flags, total = 0;
unsigned char types[MAX_ORDER];
if (skip_free_areas_node(filter, zone_to_nid(zone)))
nodemask_t used_mask;
int local_node, prev_node;
struct zonelist *zonelist;
- int order = current_zonelist_order;
+ unsigned int order = current_zonelist_order;
/* initialize zonelists */
for (i = 0; i < MAX_ZONELISTS; i++) {
build_thisnode_zonelists(pgdat);
}
-/* Construct the zonelist performance cache - see further mmzone.h */
-static void build_zonelist_cache(pg_data_t *pgdat)
-{
- struct zonelist *zonelist;
- struct zonelist_cache *zlc;
- struct zoneref *z;
-
- zonelist = &pgdat->node_zonelists[0];
- zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
- bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
- for (z = zonelist->_zonerefs; z->zone; z++)
- zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
-}
-
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* Return node id of node used for "local" allocations.
zonelist->_zonerefs[j].zone_idx = 0;
}
-/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
-static void build_zonelist_cache(pg_data_t *pgdat)
-{
- pgdat->node_zonelists[0].zlcache_ptr = NULL;
-}
-
#endif /* CONFIG_NUMA */
/*
if (self && !node_online(self->node_id)) {
build_zonelists(self);
- build_zonelist_cache(self);
}
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
- build_zonelist_cache(pgdat);
}
/*
/*
* We now know the "local memory node" for each node--
* i.e., the node of the first zone in the generic zonelist.
- * Set up numa_mem percpu variable for on-line cpus. During
- * boot, only the boot cpu should be on-line; we'll init the
- * secondary cpus' numa_mem as they come on-line. During
- * node/memory hotplug, we'll fixup all on-line cpus.
+ * Set up numa_mem percpu variable for all possible cpus
+	 * if the associated node has been onlined.
*/
- if (cpu_online(cpu))
+ if (node_online(cpu_to_node(cpu)))
set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+ else
+ set_cpu_numa_mem(cpu, NUMA_NO_NODE);
#endif
}
return ffz(~size);
}
-/*
- * Check if a pageblock contains reserved pages
- */
-static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long pfn;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
- return 1;
- }
- return 0;
-}
-
-/*
- * Mark a number of pageblocks as MIGRATE_RESERVE. The number
- * of blocks reserved is based on min_wmark_pages(zone). The memory within
- * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
- * higher will lead to a bigger reserve which will get freed as contiguous
- * blocks as reclaim kicks in
- */
-static void setup_zone_migrate_reserve(struct zone *zone)
-{
- unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
- struct page *page;
- unsigned long block_migratetype;
- int reserve;
- int old_reserve;
-
- /*
- * Get the start pfn, end pfn and the number of blocks to reserve
- * We have to be careful to be aligned to pageblock_nr_pages to
- * make sure that we always check pfn_valid for the first page in
- * the block.
- */
- start_pfn = zone->zone_start_pfn;
- end_pfn = zone_end_pfn(zone);
- start_pfn = roundup(start_pfn, pageblock_nr_pages);
- reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
- pageblock_order;
-
- /*
- * Reserve blocks are generally in place to help high-order atomic
- * allocations that are short-lived. A min_free_kbytes value that
- * would result in more than 2 reserve blocks for atomic allocations
- * is assumed to be in place to help anti-fragmentation for the
- * future allocation of hugepages at runtime.
- */
- reserve = min(2, reserve);
- old_reserve = zone->nr_migrate_reserve_block;
-
- /* When memory hot-add, we almost always need to do nothing */
- if (reserve == old_reserve)
- return;
- zone->nr_migrate_reserve_block = reserve;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
- if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
- return;
-
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
-
- /* Watch out for overlapping nodes */
- if (page_to_nid(page) != zone_to_nid(zone))
- continue;
-
- block_migratetype = get_pageblock_migratetype(page);
-
- /* Only test what is necessary when the reserves are not met */
- if (reserve > 0) {
- /*
- * Blocks with reserved pages will never free, skip
- * them.
- */
- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
- if (pageblock_is_reserved(pfn, block_end_pfn))
- continue;
-
- /* If this block is reserved, account for it */
- if (block_migratetype == MIGRATE_RESERVE) {
- reserve--;
- continue;
- }
-
- /* Suitable for reserving if this block is movable */
- if (block_migratetype == MIGRATE_MOVABLE) {
- set_pageblock_migratetype(page,
- MIGRATE_RESERVE);
- move_freepages_block(zone, page,
- MIGRATE_RESERVE);
- reserve--;
- continue;
- }
- } else if (!old_reserve) {
- /*
- * At boot time we don't need to scan the whole zone
- * for turning off MIGRATE_RESERVE.
- */
- break;
- }
-
- /*
- * If the reserve is met and this is a previous reserved block,
- * take it back
- */
- if (block_migratetype == MIGRATE_RESERVE) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- move_freepages_block(zone, page, MIGRATE_MOVABLE);
- }
- }
-}
-
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
- * kernel allocations are made. Later some blocks near
- * the start are marked MIGRATE_RESERVE by
- * setup_zone_migrate_reserve()
+ * kernel allocations are made.
*
* bitmap is created for zone's valid pfn range. but memmap
* can be created for invalid pages (for alignment)
int __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
- unsigned long size,
- enum memmap_context context)
+ unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int ret;
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
- ret = init_currently_empty_zone(zone, zone_start_pfn,
- size, MEMMAP_EARLY);
+ ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
+ unsigned long __maybe_unused offset = 0;
+
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;
* for the buddy allocator to function correctly.
*/
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+ offset = pgdat->node_start_pfn - start;
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
if (!map)
map = memblock_virt_alloc_node_nopanic(size,
pgdat->node_id);
- pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
+ pgdat->node_mem_map = map + offset;
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
+ mem_map -= offset;
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
*/
required_movablecore =
roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ required_movablecore = min(totalpages, required_movablecore);
corepages = totalpages - required_movablecore;
required_kernelcore = max(required_kernelcore, corepages);
}
- /* If kernelcore was not specified, there is no ZONE_MOVABLE */
- if (!required_kernelcore)
+ /*
+	 * If kernelcore was not specified, or the requested kernelcore size
+	 * is at least totalpages, there is no ZONE_MOVABLE.
+ */
+ if (!required_kernelcore || required_kernelcore >= totalpages)
goto out;
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
high_wmark_pages(zone) - low_wmark_pages(zone) -
atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
unsigned migratetype)
{
unsigned long outer_start, outer_end;
- int ret = 0, order;
+ unsigned int order;
+ int ret = 0;
struct compact_control cc = {
.nr_migratepages = 0,
bool referenced = false;
if (unlikely(PageTransHuge(page))) {
- pmd = page_check_address_pmd(page, mm, addr,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+ pmd = page_check_address_pmd(page, mm, addr, &ptl);
if (pmd) {
referenced = pmdp_clear_young_notify(vma, addr, pmd);
spin_unlock(ptl);
if (!walk->pte_entry)
continue;
- split_huge_page_pmd_mm(walk->mm, addr, pmd);
+ split_huge_pmd(walk->vma, pmd, addr);
if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
PCPU_SETUP_BUG_ON(!ai->static_size);
- PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
PCPU_SETUP_BUG_ON(!base_addr);
- PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
- PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
alloc_size = roundup(min_unit_size, atom_size);
upa = alloc_size / min_unit_size;
- while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
upa--;
max_upa = upa;
for (upa = max_upa; upa; upa--) {
int allocs = 0, wasted = 0;
- if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
continue;
for (group = 0; group < nr_groups; group++) {
}
#endif
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- /* tlb flush only to serialize against gup-fast */
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-}
-#endif
-
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
page = list_to_page(pages);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
- GFP_KERNEL & mapping_gfp_mask(mapping))) {
+ mapping_gfp_constraint(mapping, GFP_KERNEL))) {
read_cache_pages_invalidate_page(mapping, page);
continue;
}
struct page *page = list_to_page(pages);
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping, page->index,
- GFP_KERNEL & mapping_gfp_mask(mapping))) {
+ mapping_gfp_constraint(mapping, GFP_KERNEL))) {
mapping->a_ops->readpage(filp, page);
}
page_cache_release(page);
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
- nr_to_read = max_sane_readahead(nr_to_read);
+ nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
while (nr_to_read) {
int err;
return 0;
}
-#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
-/*
- * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
- * sensible upper limit.
- */
-unsigned long max_sane_readahead(unsigned long nr)
-{
- return min(nr, MAX_READAHEAD);
-}
-
/*
* Set the initial window size, round to next power of 2 and square
* for small size, x 4 for medium, and x 2 for large
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
- unsigned long max = max_sane_readahead(ra->ra_pages);
+ unsigned long max = ra->ra_pages;
pgoff_t prev_offset;
/*
anon_vma_unlock_read(anon_vma);
}
-/*
- * At what user virtual address is page expected in @vma?
- */
-static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
-{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-}
-
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
-{
- unsigned long address = __vma_address(page, vma);
-
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-
- return address;
-}
-
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
static void percpu_flush_tlb_batch_pages(void *data)
{
* rmap might return false positives; we must filter
* these out using page_check_address_pmd().
*/
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+ pmd = page_check_address_pmd(page, mm, address, &ptl);
if (!pmd)
return SWAP_AGAIN;
return SWAP_FAIL; /* To break the loop */
}
- /* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
+
spin_unlock(ptl);
} else {
pte_t *pte;
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
+
pte_unmap_unlock(pte, ptl);
}
};
*vm_flags = 0;
+
if (!page_mapped(page))
return 0;
* over the call to page_add_new_anon_rmap.
*/
BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
- BUG_ON(page->index != linear_page_index(vma, address));
+ BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
#endif
}
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock, and the page must be locked in
* the anon_vma case: to serialize mapping,index checking after setting,
* (but PageKsm is never downgraded to PageAnon).
*/
void page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
- do_page_add_anon_rmap(page, vma, address, 0);
+ do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}
/*
* Everybody else should continue to use page_add_anon_rmap above.
*/
void do_page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int exclusive)
+ struct vm_area_struct *vma, unsigned long address, int flags)
{
- int first = atomic_inc_and_test(&page->_mapcount);
+ bool compound = flags & RMAP_COMPOUND;
+ bool first;
+
+ if (PageTransCompound(page)) {
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (compound) {
+ atomic_t *mapcount;
+
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ mapcount = compound_mapcount_ptr(page);
+ first = atomic_inc_and_test(mapcount);
+ } else {
+			/* An anon THP is always mapped first with a PMD */
+ first = 0;
+ VM_BUG_ON_PAGE(!page_mapcount(page), page);
+ atomic_inc(&page->_mapcount);
+ }
+ } else {
+ VM_BUG_ON_PAGE(compound, page);
+ first = atomic_inc_and_test(&page->_mapcount);
+ }
+
if (first) {
+ int nr = compound ? hpage_nr_pages(page) : 1;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption
* disabled.
*/
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
__inc_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
+ }
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
}
if (unlikely(PageKsm(page)))
return;
VM_BUG_ON_PAGE(!PageLocked(page), page);
+
/* address might be in next vma when migration races vma_adjust */
if (first)
- __page_set_anon_rmap(page, vma, address, exclusive);
+ __page_set_anon_rmap(page, vma, address,
+ flags & RMAP_EXCLUSIVE);
else
__page_check_anon_rmap(page, vma, address);
}
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
*
* Same as page_add_anon_rmap but must only be called on *new* pages.
* This means the inc-and-test can be bypassed.
* Page does not have to be locked.
*/
void page_add_new_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
+ int nr = compound ? hpage_nr_pages(page) : 1;
+
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
SetPageSwapBacked(page);
- atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(compound_mapcount_ptr(page), 0);
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
+ } else {
+		/* An anon THP is always mapped first with a PMD */
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
+ }
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
memcg = mem_cgroup_begin_page_stat(page);
- /* page still mapped by someone else? */
- if (!atomic_add_negative(-1, &page->_mapcount))
+ /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+ if (unlikely(PageHuge(page))) {
+ /* hugetlb pages are always mapped with pmds */
+ atomic_dec(compound_mapcount_ptr(page));
goto out;
+ }
- /* Hugepages are not counted in NR_FILE_MAPPED for now. */
- if (unlikely(PageHuge(page)))
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
/*
mem_cgroup_end_page_stat(memcg);
}
+static void page_remove_anon_compound_rmap(struct page *page)
+{
+ int i, nr;
+
+ if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+ return;
+
+ /* Hugepages are not counted in NR_ANON_PAGES for now. */
+ if (unlikely(PageHuge(page)))
+ return;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return;
+
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+ if (TestClearPageDoubleMap(page)) {
+ /*
+ * Subpages can be mapped with PTEs too. Check how many of
+		 * them are still mapped.
+ */
+ for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ if (atomic_add_negative(-1, &page[i]._mapcount))
+ nr++;
+ }
+ } else {
+ nr = HPAGE_PMD_NR;
+ }
+
+ if (nr) {
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+ deferred_split_huge_page(page);
+ }
+}
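
A toy model of the DoubleMap branch above: each subpage's PTE mapcount
carries a +1 bias for the PMD mapping being torn down, so after the decrement
only subpages that go negative (no PTE mappings left) are uncharged. The
array contents below are made up:

#include <stdio.h>

#define NR_SUBPAGES 8	/* HPAGE_PMD_NR is 512 on x86-64; shrunk here */

int main(void)
{
	/* _mapcount convention: -1 means unmapped. Subpages 2 and 4
	 * below are also PTE-mapped and must stay charged. */
	int mapcount[NR_SUBPAGES] = { 0, 0, 2, 0, 1, 0, 0, 0 };
	int nr = 0;

	for (int i = 0; i < NR_SUBPAGES; i++)
		if (--mapcount[i] < 0)	/* like atomic_add_negative(-1, ...) */
			nr++;

	printf("uncharge %d of %d subpages\n", nr, NR_SUBPAGES);	/* 6 of 8 */
	return 0;
}
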
+
/**
* page_remove_rmap - take down pte mapping from a page
- * @page: page to remove mapping from
+ * @page: page to remove mapping from
+ * @compound: uncharge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, bool compound)
{
if (!PageAnon(page)) {
+ VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
page_remove_file_rmap(page);
return;
}
+ if (compound)
+ return page_remove_anon_compound_rmap(page);
+
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
return;
- /* Hugepages are not counted in NR_ANON_PAGES for now. */
- if (unlikely(PageHuge(page)))
- return;
-
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- if (PageTransHuge(page))
- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- -hpage_nr_pages(page));
+ __dec_zone_page_state(page, NR_ANON_PAGES);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
+ if (PageTransCompound(page))
+ deferred_split_huge_page(compound_head(page));
+
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
int ret = SWAP_AGAIN;
enum ttu_flags flags = (enum ttu_flags)arg;
+ /* munlock has nothing to gain from examining un-locked vmas */
+ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
+ goto out;
+
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
* skipped over this mm) then we should reactivate it.
*/
if (!(flags & TTU_IGNORE_MLOCK)) {
- if (vma->vm_flags & VM_LOCKED)
- goto out_mlock;
-
+ if (vma->vm_flags & VM_LOCKED) {
+ /* Holding pte lock, we do *not* need mmap_sem here */
+ mlock_vma_page(page);
+ ret = SWAP_MLOCK;
+ goto out_unmap;
+ }
if (flags & TTU_MUNLOCK)
goto out_unmap;
}
update_hiwater_rss(mm);
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
- if (!PageHuge(page)) {
+ if (PageHuge(page)) {
+ hugetlb_count_sub(1 << compound_order(page), mm);
+ } else {
if (PageAnon(page))
dec_mm_counter(mm, MM_ANONPAGES);
else
dec_mm_counter(mm, MM_ANONPAGES);
else
dec_mm_counter(mm, MM_FILEPAGES);
+ } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_migration_entry(page, pte_write(pteval));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, address, pte, swp_pte);
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
pte_t swp_pte;
- if (PageSwapCache(page)) {
- /*
- * Store the swap location in the pte.
- * See handle_pte_fault() ...
- */
- if (swap_duplicate(entry) < 0) {
- set_pte_at(mm, address, pte, pteval);
- ret = SWAP_FAIL;
- goto out_unmap;
- }
- if (list_empty(&mm->mmlist)) {
- spin_lock(&mmlist_lock);
- if (list_empty(&mm->mmlist))
- list_add(&mm->mmlist, &init_mm.mmlist);
- spin_unlock(&mmlist_lock);
- }
+ if (!PageDirty(page) && (flags & TTU_FREE)) {
+			/* The page is freeable via MADV_FREE */
dec_mm_counter(mm, MM_ANONPAGES);
- inc_mm_counter(mm, MM_SWAPENTS);
- } else if (IS_ENABLED(CONFIG_MIGRATION)) {
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- BUG_ON(!(flags & TTU_MIGRATION));
- entry = make_migration_entry(page, pte_write(pteval));
+ goto discard;
}
+
+ /*
+ * Store the swap location in the pte.
+ * See handle_pte_fault() ...
+ */
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ if (swap_duplicate(entry) < 0) {
+ set_pte_at(mm, address, pte, pteval);
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+ if (!PageDirty(page))
+ SetPageDirty(page);
+ if (list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ dec_mm_counter(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_SWAPENTS);
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pte, swp_pte);
- } else if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & TTU_MIGRATION)) {
- /* Establish migration entry for a file page */
- swp_entry_t entry;
- entry = make_migration_entry(page, pte_write(pteval));
- set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
} else
dec_mm_counter(mm, MM_FILEPAGES);
- page_remove_rmap(page);
+discard:
+ page_remove_rmap(page, false);
page_cache_release(page);
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
+ if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK))
mmu_notifier_invalidate_page(mm, address);
out:
return ret;
-
-out_mlock:
- pte_unmap_unlock(pte, ptl);
-
-
- /*
- * We need mmap_sem locking, Otherwise VM_LOCKED check makes
- * unstable result and race. Plus, We can't wait here because
- * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
- * if trylock failed, the page remain in evictable lru and later
- * vmscan could retry to move the page to unevictable lru if the
- * page is actually mlocked.
- */
- if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
- if (vma->vm_flags & VM_LOCKED) {
- mlock_vma_page(page);
- ret = SWAP_MLOCK;
- }
- up_read(&vma->vm_mm->mmap_sem);
- }
- return ret;
}
bool is_vma_temporary_stack(struct vm_area_struct *vma)
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
+ cond_resched();
+
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
+ cond_resched();
+
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
BUG_ON(!PageLocked(page));
BUG_ON(!anon_vma);
/* address might be in next vma when migration races vma_adjust */
- first = atomic_inc_and_test(&page->_mapcount);
+ first = atomic_inc_and_test(compound_mapcount_ptr(page));
if (first)
__hugepage_set_anon_rmap(page, vma, address, 0);
}
struct vm_area_struct *vma, unsigned long address)
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- atomic_set(&page->_mapcount, 0);
+ atomic_set(compound_mapcount_ptr(page), 0);
__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
#include <asm/uaccess.h>
#include <asm/pgtable.h>
+#include "internal.h"
+
#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
+ false);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
if (error) {
if (error != -ENOMEM)
error = 0;
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
} else
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
out:
unlock_page(page);
page_cache_release(page);
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
*/
oldpage = newpage;
} else {
- mem_cgroup_migrate(oldpage, newpage, true);
+ mem_cgroup_replace_page(oldpage, newpage);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
goto failed;
}
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
swp_to_radix_entry(swap));
* "repeat": reading a hole and writing should succeed.
*/
if (error) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
delete_from_swap_cache(page);
}
}
if (error)
goto failed;
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
spin_lock(&info->lock);
info->swapped--;
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ false);
if (error)
goto decused;
error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
radix_tree_preload_end();
}
if (error) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
goto decused;
}
- mem_cgroup_commit_charge(page, memcg, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_anon(page);
spin_lock(&info->lock);
}
/*
- * Construct gfp mask to allocate from a specific node but do not invoke reclaim
- * or warn about failures.
+ * Construct gfp mask to allocate from a specific node but do not invoke
+ * direct reclaim or warn about failures. kswapd may still wake to reclaim
+ * in the background.
*/
static inline gfp_t gfp_exact_node(gfp_t flags)
{
- return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
}
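
The only behavioural change in gfp_exact_node() is which reclaim bit gets
stripped: direct reclaim is masked off, while the kswapd bit survives so
background reclaim can still be kicked. A sketch of the mask arithmetic with
stand-in bit values (the real flags are in gfp.h):

#include <stdio.h>

#define __GFP_DIRECT_RECLAIM	0x01u
#define __GFP_KSWAPD_RECLAIM	0x02u
#define __GFP_THISNODE		0x04u
#define __GFP_NOWARN		0x08u
#define GFP_KERNEL		(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)

static unsigned int gfp_exact_node(unsigned int flags)
{
	return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
}

int main(void)
{
	unsigned int f = gfp_exact_node(GFP_KERNEL);

	printf("kswapd=%u direct=%u thisnode=%u nowarn=%u\n",
	       !!(f & __GFP_KSWAPD_RECLAIM), !!(f & __GFP_DIRECT_RECLAIM),
	       !!(f & __GFP_THISNODE), !!(f & __GFP_NOWARN));
	return 0;
}
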
#endif
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
- if (memcg_charge_slab(cachep, flags, cachep->gfporder))
- return NULL;
-
page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page) {
- memcg_uncharge_slab(cachep, cachep->gfporder);
slab_out_of_memory(cachep, flags, nodeid);
return NULL;
}
+ if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) {
+ __free_pages(page, cachep->gfporder);
+ return NULL;
+ }
+
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (page_is_pfmemalloc(page))
pfmemalloc_active = true;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_pages(page, cachep->gfporder);
- memcg_uncharge_slab(cachep, cachep->gfporder);
+ __free_kmem_pages(page, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
freelist = page->freelist;
slab_destroy_debugcheck(cachep, page);
- if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
- struct rcu_head *head;
-
- /*
- * RCU free overloads the RCU head over the LRU.
- * slab_page has been overloeaded over the LRU,
- * however it is not used from now on so that
- * we can use it safely.
- */
- head = (void *)&page->rcu_head;
- call_rcu(head, kmem_rcu_free);
-
- } else {
+ if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
+ call_rcu(&page->rcu_head, kmem_rcu_free);
+ else
kmem_freepages(cachep, page);
- }
/*
* From now on, we don't use freelist
offset *= cachep->colour_off;
- if (local_flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(local_flags))
local_irq_enable();
/*
cache_init_objs(cachep, page);
- if (local_flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
check_irq_off();
spin_lock(&n->list_lock);
opps1:
kmem_freepages(cachep, page);
failed:
- if (local_flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
return 0;
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
gfp_t flags)
{
- might_sleep_if(flags & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(flags));
#if DEBUG
kmem_flagcheck(cachep, flags);
#endif
*/
struct page *page;
- if (local_flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(local_flags))
local_irq_enable();
kmem_flagcheck(cache, flags);
page = kmem_getpages(cache, local_flags, numa_mem_id());
- if (local_flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
if (page) {
/*
#endif /* CONFIG_NUMA */
static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller,
+ bool irq_off_needed)
{
unsigned long save_flags;
void *objp;
+	/* The compiler needs to remove the irq_off_needed branches */
+ BUILD_BUG_ON(!__builtin_constant_p(irq_off_needed));
+
flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
cachep = memcg_kmem_get_cache(cachep, flags);
cache_alloc_debugcheck_before(cachep, flags);
- local_irq_save(save_flags);
+ if (irq_off_needed)
+ local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
- local_irq_restore(save_flags);
+ if (irq_off_needed)
+ local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
flags);
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
- void *ret = slab_alloc(cachep, flags, _RET_IP_);
+ void *ret = slab_alloc(cachep, flags, _RET_IP_, true);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
}
EXPORT_SYMBOL(kmem_cache_alloc);
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
-{
- __kmem_cache_free_bulk(s, size, p);
-}
-EXPORT_SYMBOL(kmem_cache_free_bulk);
-
+/* Note that interrupts must be enabled when calling this function. */
bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+ void **p)
{
- return __kmem_cache_alloc_bulk(s, flags, size, p);
+ size_t i;
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *x = p[i] = slab_alloc(s, flags, _RET_IP_, false);
+
+ if (!x) {
+ __kmem_cache_free_bulk(s, i, p);
+ return false;
+ }
+ }
+ local_irq_enable();
+ return true;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
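
The bulk interface is all-or-nothing: when kmem_cache_alloc_bulk() returns
false, every object allocated so far has already been handed back via
__kmem_cache_free_bulk(). A userspace sketch of that contract (malloc/free
stand in for the slab calls):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* All-or-nothing bulk alloc: roll back everything on failure. */
static bool bulk_alloc(size_t size, size_t n, void **p)
{
	for (size_t i = 0; i < n; i++) {
		p[i] = malloc(size);
		if (!p[i]) {
			while (i--)
				free(p[i]);
			return false;
		}
	}
	return true;
}

static void bulk_free(size_t n, void **p)
{
	for (size_t i = 0; i < n; i++)
		free(p[i]);
}

int main(void)
{
	void *objs[8];

	if (bulk_alloc(32, 8, objs)) {
		/* ... use objs[0..7] ... */
		bulk_free(8, objs);
	}
	return 0;
}
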
{
void *ret;
- ret = slab_alloc(cachep, flags, _RET_IP_);
+ ret = slab_alloc(cachep, flags, _RET_IP_, true);
trace_kmalloc(_RET_IP_, ret,
size, cachep->size, flags);
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = slab_alloc(cachep, flags, caller);
+ ret = slab_alloc(cachep, flags, caller, true);
trace_kmalloc(caller, ret,
size, cachep->size, flags);
}
EXPORT_SYMBOL(__kmalloc_track_caller);
-/**
- * kmem_cache_free - Deallocate an object
- * @cachep: The cache the allocation was from.
- * @objp: The previously allocated object.
- *
- * Free an object which was previously allocated from this
- * cache.
- */
-void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+/* Caller is responsible for disabling local IRQs when irq_off_needed is false */
+static __always_inline void __kmem_cache_free(struct kmem_cache *cachep,
+ void *objp, bool irq_off_needed)
{
unsigned long flags;
+
+	/* The compiler needs to remove the irq_off_needed branches */
+ BUILD_BUG_ON(!__builtin_constant_p(irq_off_needed));
+
cachep = cache_from_obj(cachep, objp);
if (!cachep)
return;
- local_irq_save(flags);
+ if (irq_off_needed)
+ local_irq_save(flags);
debug_check_no_locks_freed(objp, cachep->object_size);
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(objp, cachep->object_size);
__cache_free(cachep, objp, _RET_IP_);
- local_irq_restore(flags);
+ if (irq_off_needed)
+ local_irq_restore(flags);
+}
+/**
+ * kmem_cache_free - Deallocate an object
+ * @cachep: The cache the allocation was from.
+ * @objp: The previously allocated object.
+ *
+ * Free an object which was previously allocated from this
+ * cache.
+ */
+void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+{
+ __kmem_cache_free(cachep, objp, true);
trace_kmem_cache_free(_RET_IP_, objp);
}
EXPORT_SYMBOL(kmem_cache_free);
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ size_t i;
+
+ local_irq_disable();
+ for (i = 0; i < size; i++)
+ __kmem_cache_free(s, p[i], false);
+ local_irq_enable();
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
/**
* kfree - free previously allocated memory
* @objp: pointer returned by kmalloc.
list_for_each_entry(iter, &(root)->memcg_params.list, \
memcg_params.list)
-#define for_each_memcg_cache_safe(iter, tmp, root) \
- list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
- memcg_params.list)
-
static inline bool is_root_cache(struct kmem_cache *s)
{
return s->memcg_params.is_root_cache;
return s->memcg_params.root_cache;
}
-static __always_inline int memcg_charge_slab(struct kmem_cache *s,
- gfp_t gfp, int order)
+static __always_inline int memcg_charge_slab(struct page *page,
+ gfp_t gfp, int order,
+ struct kmem_cache *s)
{
if (!memcg_kmem_enabled())
return 0;
if (is_root_cache(s))
return 0;
- return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order);
-}
-
-static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
-{
- if (!memcg_kmem_enabled())
- return;
- if (is_root_cache(s))
- return;
- memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
+ return __memcg_kmem_charge_memcg(page, gfp, order,
+ s->memcg_params.memcg);
}
extern void slab_init_memcg_params(struct kmem_cache *);
#define for_each_memcg_cache(iter, root) \
for ((void)(iter), (void)(root); 0; )
-#define for_each_memcg_cache_safe(iter, tmp, root) \
- for ((void)(iter), (void)(tmp), (void)(root); 0; )
static inline bool is_root_cache(struct kmem_cache *s)
{
return s;
}
-static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
+static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
+ struct kmem_cache *s)
{
return 0;
}
-static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
-{
-}
-
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
return ALIGN(align, sizeof(void *));
}
-static struct kmem_cache *
-do_kmem_cache_create(const char *name, size_t object_size, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *),
- struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+static struct kmem_cache *create_cache(const char *name,
+ size_t object_size, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *),
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
int err;
kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
- struct kmem_cache *s;
+ struct kmem_cache *s = NULL;
const char *cache_name;
int err;
err = kmem_cache_sanity_check(name, size);
if (err) {
- s = NULL; /* suppress uninit var warning */
goto out_unlock;
}
goto out_unlock;
}
- s = do_kmem_cache_create(cache_name, size, size,
- calculate_alignment(flags, align, size),
- flags, ctor, NULL, NULL);
+ s = create_cache(cache_name, size, size,
+ calculate_alignment(flags, align, size),
+ flags, ctor, NULL, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
}
EXPORT_SYMBOL(kmem_cache_create);
-static int do_kmem_cache_shutdown(struct kmem_cache *s,
+static int shutdown_cache(struct kmem_cache *s,
struct list_head *release, bool *need_rcu_barrier)
{
- if (__kmem_cache_shutdown(s) != 0) {
- printk(KERN_ERR "kmem_cache_destroy %s: "
- "Slab cache still has objects\n", s->name);
- dump_stack();
+ if (__kmem_cache_shutdown(s) != 0)
return -EBUSY;
- }
if (s->flags & SLAB_DESTROY_BY_RCU)
*need_rcu_barrier = true;
-#ifdef CONFIG_MEMCG_KMEM
- if (!is_root_cache(s))
- list_del(&s->memcg_params.list);
-#endif
list_move(&s->list, release);
return 0;
}
-static void do_kmem_cache_release(struct list_head *release,
- bool need_rcu_barrier)
+static void release_caches(struct list_head *release, bool need_rcu_barrier)
{
struct kmem_cache *s, *s2;
if (!cache_name)
goto out_unlock;
- s = do_kmem_cache_create(cache_name, root_cache->object_size,
- root_cache->size, root_cache->align,
- root_cache->flags, root_cache->ctor,
- memcg, root_cache);
+ s = create_cache(cache_name, root_cache->object_size,
+ root_cache->size, root_cache->align,
+ root_cache->flags, root_cache->ctor,
+ memcg, root_cache);
/*
* If we could not create a memcg cache, do not complain, because
* that's not critical at all as we can always proceed with the root
put_online_cpus();
}
+static int __shutdown_memcg_cache(struct kmem_cache *s,
+ struct list_head *release, bool *need_rcu_barrier)
+{
+ BUG_ON(is_root_cache(s));
+
+ if (shutdown_cache(s, release, need_rcu_barrier))
+ return -EBUSY;
+
+ list_del(&s->memcg_params.list);
+ return 0;
+}
+
void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
LIST_HEAD(release);
* The cgroup is about to be freed and therefore has no charges
* left. Hence, all its caches must be empty by now.
*/
- BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier));
+ BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier));
}
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
- do_kmem_cache_release(&release, need_rcu_barrier);
+ release_caches(&release, need_rcu_barrier);
+}
+
+static int shutdown_memcg_caches(struct kmem_cache *s,
+ struct list_head *release, bool *need_rcu_barrier)
+{
+ struct memcg_cache_array *arr;
+ struct kmem_cache *c, *c2;
+ LIST_HEAD(busy);
+ int i;
+
+ BUG_ON(!is_root_cache(s));
+
+ /*
+ * First, shutdown active caches, i.e. caches that belong to online
+ * memory cgroups.
+ */
+ arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+ for_each_memcg_cache_index(i) {
+ c = arr->entries[i];
+ if (!c)
+ continue;
+ if (__shutdown_memcg_cache(c, release, need_rcu_barrier))
+ /*
+ * The cache still has objects. Move it to a temporary
+ * list so as not to try to destroy it for a second
+ * time while iterating over inactive caches below.
+ */
+ list_move(&c->memcg_params.list, &busy);
+ else
+ /*
+ * The cache is empty and will be destroyed soon. Clear
+ * the pointer to it in the memcg_caches array so that
+ * it will never be accessed even if the root cache
+ * stays alive.
+ */
+ arr->entries[i] = NULL;
+ }
+
+ /*
+ * Second, shutdown all caches left from memory cgroups that are now
+ * offline.
+ */
+ list_for_each_entry_safe(c, c2, &s->memcg_params.list,
+ memcg_params.list)
+ __shutdown_memcg_cache(c, release, need_rcu_barrier);
+
+ list_splice(&busy, &s->memcg_params.list);
+
+ /*
+ * A cache being destroyed must be empty. In particular, this means
+ * that all per memcg caches attached to it must be empty too.
+ */
+ if (!list_empty(&s->memcg_params.list))
+ return -EBUSY;
+ return 0;
+}
+#else
+static inline int shutdown_memcg_caches(struct kmem_cache *s,
+ struct list_head *release, bool *need_rcu_barrier)
+{
+ return 0;
}
#endif /* CONFIG_MEMCG_KMEM */
void kmem_cache_destroy(struct kmem_cache *s)
{
- struct kmem_cache *c, *c2;
LIST_HEAD(release);
bool need_rcu_barrier = false;
- bool busy = false;
+ int err;
if (unlikely(!s))
return;
- BUG_ON(!is_root_cache(s));
-
get_online_cpus();
get_online_mems();
if (s->refcount)
goto out_unlock;
- for_each_memcg_cache_safe(c, c2, s) {
- if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
- busy = true;
- }
-
- if (!busy)
- do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
+ err = shutdown_memcg_caches(s, &release, &need_rcu_barrier);
+ if (!err)
+ err = shutdown_cache(s, &release, &need_rcu_barrier);
+ if (err) {
+ pr_err("kmem_cache_destroy %s: "
+ "Slab cache still has objects\n", s->name);
+ dump_stack();
+ }
out_unlock:
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
- do_kmem_cache_release(&release, need_rcu_barrier);
+ release_caches(&release, need_rcu_barrier);
}
EXPORT_SYMBOL(kmem_cache_destroy);
}
EXPORT_SYMBOL(kmem_cache_shrink);
-int slab_is_available(void)
+bool slab_is_available(void)
{
return slab_state >= UP;
}
*/
static __always_inline void slab_lock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
/*
* Debug settings:
*/
-#ifdef CONFIG_SLUB_DEBUG_ON
+#if defined(CONFIG_SLUB_DEBUG_ON)
static int slub_debug = DEBUG_DEFAULT_FLAGS;
+#elif defined(CONFIG_KASAN)
+static int slub_debug = SLAB_STORE_USER;
#else
static int slub_debug;
#endif
return 0;
}
+/* Supports checking bulk free of a constructed freelist */
static noinline struct kmem_cache_node *free_debug_processing(
- struct kmem_cache *s, struct page *page, void *object,
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
unsigned long addr, unsigned long *flags)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ void *object = head;
+ int cnt = 0;
spin_lock_irqsave(&n->list_lock, *flags);
slab_lock(page);
if (!check_slab(s, page))
goto fail;
+next_object:
+ cnt++;
+
if (!check_valid_pointer(s, page, object)) {
slab_err(s, page, "Invalid object pointer 0x%p", object);
goto fail;
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
trace(s, page, object, 0);
+ /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
init_object(s, object, SLUB_RED_INACTIVE);
+
+ /* Reached end of constructed freelist yet? */
+ if (object != tail) {
+ object = get_freepointer(s, object);
+ goto next_object;
+ }
out:
+ if (cnt != bulk_cnt)
+ slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
+ bulk_cnt, cnt);
+
slab_unlock(page);
/*
* Keep node_lock to preserve integrity
return flags;
}
-#else
+#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
struct page *page, void *object, unsigned long addr) { return 0; }
static inline struct kmem_cache_node *free_debug_processing(
- struct kmem_cache *s, struct page *page, void *object,
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
unsigned long addr, unsigned long *flags) { return NULL; }
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{
flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
- might_sleep_if(flags & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(flags));
if (should_failslab(s->object_size, flags, s->flags))
return NULL;
kasan_slab_free(s, x);
}
+static inline void slab_free_freelist_hook(struct kmem_cache *s,
+ void *head, void *tail)
+{
+/*
+ * The compiler cannot detect that this function can be removed if
+ * slab_free_hook() evaluates to nothing. Thus, catch all relevant
+ * config debug options here.
+ */
+#if defined(CONFIG_KMEMCHECK) || \
+ defined(CONFIG_LOCKDEP) || \
+ defined(CONFIG_DEBUG_KMEMLEAK) || \
+ defined(CONFIG_DEBUG_OBJECTS_FREE) || \
+ defined(CONFIG_KASAN)
+
+ void *object = head;
+ void *tail_obj = tail ? : head;
+
+ do {
+ slab_free_hook(s, object);
+ } while ((object != tail_obj) &&
+ (object = get_freepointer(s, object)));
+#endif
+}
+
static void setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
flags |= __GFP_NOTRACK;
- if (memcg_charge_slab(s, flags, order))
- return NULL;
-
if (node == NUMA_NO_NODE)
page = alloc_pages(flags, order);
else
page = __alloc_pages_node(node, flags, order);
- if (!page)
- memcg_uncharge_slab(s, order);
+ if (page && memcg_charge_slab(page, flags, order, s)) {
+ __free_pages(page, order);
+ page = NULL;
+ }
return page;
}
flags &= gfp_allowed_mask;
- if (flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(flags))
local_irq_enable();
flags |= s->allocflags;
* so we fall-back to the minimum order allocation.
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
- if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min))
- alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT;
+ if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
+ alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
page->frozen = 1;
out:
- if (flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(flags))
local_irq_disable();
if (!page)
return NULL;
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_pages(page, order);
- memcg_uncharge_slab(s, order);
+ __free_kmem_pages(page, order);
}
#define need_reserve_slab_rcu \
VM_BUG_ON(s->reserved != sizeof(*head));
head = page_address(page) + offset;
} else {
- /*
- * RCU free overloads the RCU head over the LRU
- */
- head = (void *)&page->lru;
+ head = &page->rcu_head;
}
call_rcu(head, rcu_free_slab);
* And if we were unable to get a new slab from the partial slab lists then
* we need to allocate a new slab. This is the slowest path since it involves
* a call to the page allocator and the setup of a new slab.
+ *
+ * Version of __slab_alloc to use when we know that interrupts are
+ * already disabled (which is the case for bulk allocation).
*/
-static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *freelist;
struct page *page;
- unsigned long flags;
-
- local_irq_save(flags);
-#ifdef CONFIG_PREEMPT
- /*
- * We may have been preempted and rescheduled on a different
- * cpu before disabling interrupts. Need to reload cpu area
- * pointer.
- */
- c = this_cpu_ptr(s->cpu_slab);
-#endif
page = c->page;
if (!page)
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
- local_irq_restore(flags);
return freelist;
new_slab:
if (unlikely(!freelist)) {
slab_out_of_memory(s, gfpflags, node);
- local_irq_restore(flags);
return NULL;
}
deactivate_slab(s, page, get_freepointer(s, freelist));
c->page = NULL;
c->freelist = NULL;
- local_irq_restore(flags);
return freelist;
}
+/*
+ * A wrapper for ___slab_alloc() that disables interrupts and
+ * compensates for a possible cpu change by refetching the per cpu area
+ * pointer.
+ */
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+ unsigned long addr, struct kmem_cache_cpu *c)
+{
+ void *p;
+ unsigned long flags;
+
+ local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+ /*
+ * We may have been preempted and rescheduled on a different
+ * cpu before disabling interrupts. Need to reload cpu area
+ * pointer.
+ */
+ c = this_cpu_ptr(s->cpu_slab);
+#endif
+
+ p = ___slab_alloc(s, gfpflags, node, addr, c);
+ local_irq_restore(flags);
+ return p;
+}
+
/*
* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
* have the fastpath folded into their functions. So no function call
* handling required then we can return immediately.
*/
static void __slab_free(struct kmem_cache *s, struct page *page,
- void *x, unsigned long addr)
+ void *head, void *tail, int cnt,
+ unsigned long addr)
{
void *prior;
- void **object = (void *)x;
int was_frozen;
struct page new;
unsigned long counters;
stat(s, FREE_SLOWPATH);
if (kmem_cache_debug(s) &&
- !(n = free_debug_processing(s, page, x, addr, &flags)))
+ !(n = free_debug_processing(s, page, head, tail, cnt,
+ addr, &flags)))
return;
do {
}
prior = page->freelist;
counters = page->counters;
- set_freepointer(s, object, prior);
+ set_freepointer(s, tail, prior);
new.counters = counters;
was_frozen = new.frozen;
- new.inuse--;
+ new.inuse -= cnt;
if ((!new.inuse || !prior) && !was_frozen) {
if (kmem_cache_has_cpu_partial(s) && !prior) {
} while (!cmpxchg_double_slab(s, page,
prior, counters,
- object, new.counters,
+ head, new.counters,
"__slab_free"));
if (likely(!n)) {
*
* If fastpath is not possible then fall back to __slab_free where we deal
* with all sorts of special processing.
+ *
+ * Bulk free of a freelist with several objects (all pointing to the
+ * same page) is possible by specifying a head and tail pointer, plus an
+ * object count (cnt). A bulk free is indicated by the tail pointer
+ * being set.
*/
-static __always_inline void slab_free(struct kmem_cache *s,
- struct page *page, void *x, unsigned long addr)
+static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int cnt,
+ unsigned long addr)
{
- void **object = (void *)x;
+ void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
- slab_free_hook(s, x);
+ slab_free_freelist_hook(s, head, tail);
redo:
/*
barrier();
if (likely(page == c->page)) {
- set_freepointer(s, object, c->freelist);
+ set_freepointer(s, tail_obj, c->freelist);
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
c->freelist, tid,
- object, next_tid(tid)))) {
+ head, next_tid(tid)))) {
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
stat(s, FREE_FASTPATH);
} else
- __slab_free(s, page, x, addr);
+ __slab_free(s, page, head, tail_obj, cnt, addr);
}
s = cache_from_obj(s, x);
if (!s)
return;
- slab_free(s, virt_to_head_page(x), x, _RET_IP_);
+ slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
trace_kmem_cache_free(_RET_IP_, x);
}
EXPORT_SYMBOL(kmem_cache_free);
-/* Note that interrupts must be enabled when calling this function. */
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
-{
- struct kmem_cache_cpu *c;
+struct detached_freelist {
struct page *page;
- int i;
+ void *tail;
+ void *freelist;
+ int cnt;
+};
- local_irq_disable();
- c = this_cpu_ptr(s->cpu_slab);
+/*
+ * This function progressively scans the array with free objects (with
+ * a limited look ahead) and extracts objects belonging to the same
+ * page. It builds a detached freelist directly within the given
+ * page/objects. This can happen without any need for
+ * synchronization, because the objects are owned by the running
+ * process. The freelist is built up as a singly linked list in the
+ * objects. The idea is that this detached freelist can then be bulk
+ * transferred to the real freelist(s), while requiring only a single
+ * synchronization primitive. Look ahead in the array is limited for
+ * performance reasons.
+ */
+static int build_detached_freelist(struct kmem_cache *s, size_t size,
+ void **p, struct detached_freelist *df)
+{
+ size_t first_skipped_index = 0;
+ int lookahead = 3;
+ void *object;
- for (i = 0; i < size; i++) {
- void *object = p[i];
+ /* Always re-init detached_freelist */
+ df->page = NULL;
- BUG_ON(!object);
- /* kmem cache debug support */
- s = cache_from_obj(s, object);
- if (unlikely(!s))
- goto exit;
- slab_free_hook(s, object);
+ do {
+ object = p[--size];
+ } while (!object && size);
+
+ if (!object)
+ return 0;
- page = virt_to_head_page(object);
+ /* Start new detached freelist */
+ set_freepointer(s, object, NULL);
+ df->page = virt_to_head_page(object);
+ df->tail = object;
+ df->freelist = object;
+ p[size] = NULL; /* mark object processed */
+ df->cnt = 1;
+
+ while (size) {
+ object = p[--size];
+ if (!object)
+ continue; /* Skip processed objects */
+
+ /* df->page is always set at this point */
+ if (df->page == virt_to_head_page(object)) {
+ /* Opportunistically build the freelist */
+ set_freepointer(s, object, df->freelist);
+ df->freelist = object;
+ df->cnt++;
+ p[size] = NULL; /* mark object processed */
- if (c->page == page) {
- /* Fastpath: local CPU free */
- set_freepointer(s, object, c->freelist);
- c->freelist = object;
- } else {
- c->tid = next_tid(c->tid);
- local_irq_enable();
- /* Slowpath: overhead locked cmpxchg_double_slab */
- __slab_free(s, page, object, _RET_IP_);
- local_irq_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ continue;
}
+
+ /* Limit look ahead search */
+ if (!--lookahead)
+ break;
+
+ if (!first_skipped_index)
+ first_skipped_index = size + 1;
}
-exit:
- c->tid = next_tid(c->tid);
- local_irq_enable();
+
+ return first_skipped_index;
+}
+
+
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ if (WARN_ON(!size))
+ return;
+
+ do {
+ struct detached_freelist df;
+
+ size = build_detached_freelist(s, size, p, &df);
+ if (unlikely(!df.page))
+ continue;
+
+ slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
+ } while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
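The grouping that build_detached_freelist() performs can be modeled in a
few lines; a toy sketch (hypothetical struct, userspace-style, not SLUB
code) where an embedded next pointer stands in for the slab freepointer
and page_id stands in for virt_to_head_page():

struct obj {
	struct obj *next;
	int page_id;
};

/* Chain all objects from p[] that share the first object's page_id. */
static int build_group(struct obj **p, int size, struct obj **head)
{
	int i, page_id = -1, skipped = 0;

	*head = NULL;
	for (i = 0; i < size; i++) {
		if (!p[i])
			continue;		/* already processed */
		if (!*head)
			page_id = p[i]->page_id;
		if (p[i]->page_id == page_id) {
			p[i]->next = *head;	/* link onto detached list */
			*head = p[i];
			p[i] = NULL;		/* mark processed */
		} else {
			skipped++;		/* left for a later pass */
		}
	}
	return skipped;
}

The real function additionally bounds how many mismatching objects it
will skip over (the lookahead of 3) before handing the group to
slab_free().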
void *object = c->freelist;
if (unlikely(!object)) {
- local_irq_enable();
/*
* Invoking the slow path likely has the side-effect
* of re-populating the per-CPU c->freelist
*/
- p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
+ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
_RET_IP_, c);
- if (unlikely(!p[i])) {
- __kmem_cache_free_bulk(s, i, p);
- return false;
- }
- local_irq_disable();
+ if (unlikely(!p[i]))
+ goto error;
+
c = this_cpu_ptr(s->cpu_slab);
continue; /* goto for-loop */
}
/* kmem_cache debug support */
s = slab_pre_alloc_hook(s, flags);
- if (unlikely(!s)) {
- __kmem_cache_free_bulk(s, i, p);
- c->tid = next_tid(c->tid);
- local_irq_enable();
- return false;
- }
+ if (unlikely(!s))
+ goto error;
c->freelist = get_freepointer(s, object);
p[i] = object;
}
return true;
+
+error:
+ __kmem_cache_free_bulk(s, i, p);
+ local_irq_enable();
+ return false;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
return get_order(size * MAX_OBJS_PER_PAGE) - 1;
- for (order = max(min_order,
- fls(min_objects * size - 1) - PAGE_SHIFT);
+ for (order = max(min_order, get_order(min_objects * size + reserved));
order <= max_order; order++) {
unsigned long slab_size = PAGE_SIZE << order;
- if (slab_size < min_objects * size + reserved)
- continue;
-
rem = (slab_size - reserved) % size;
if (rem <= slab_size / fract_leftover)
break;
-
}
return order;
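As a worked example of the new starting order (numbers assumed for
illustration): with size = 256, min_objects = 16 and reserved = 64,
min_objects * size + reserved = 4160 bytes, so get_order(4160) = 1 and
the loop starts directly at an 8KB slab. The old starting point,
fls(16 * 256 - 1) - PAGE_SHIFT = 0, ignored reserved and relied on the
in-loop slab_size check (removed above) to skip the too-small order-0
slab.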
* works by first attempting to generate a layout with
* the best configuration and backing off gradually.
*
- * First we reduce the acceptable waste in a slab. Then
+ * First we increase the acceptable waste in a slab. Then
* we reduce the minimum objects required in a slab.
*/
min_objects = slub_min_objects;
__free_kmem_pages(page, compound_order(page));
return;
}
- slab_free(page->slab_cache, page, object, _RET_IP_);
+ slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
(*dtor)(page);
}
-/**
- * Two special cases here: we could avoid taking compound_lock_irqsave
- * and could skip the tail refcounting(in _mapcount).
- *
- * 1. Hugetlbfs page:
- *
- * PageHeadHuge will remain true until the compound page
- * is released and enters the buddy allocator, and it could
- * not be split by __split_huge_page_refcount().
- *
- * So if we see PageHeadHuge set, and we have the tail page pin,
- * then we could safely put head page.
- *
- * 2. Slab THP page:
- *
- * PG_slab is cleared before the slab frees the head page, and
- * tail pin cannot be the last reference left on the head page,
- * because the slab code is free to reuse the compound page
- * after a kfree/kmem_cache_free without having to check if
- * there's any tail pin left. In turn all tail pins must be always
- * released while the head is still pinned by the slab code
- * and so we know PG_slab will be still set too.
- *
- * So if we see PageSlab set, and we have the tail page pin,
- * then we could safely put head page.
- */
-static __always_inline
-void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
-{
- /*
- * If @page is a THP tail, we must read the tail page
- * flags after the head page flags. The
- * __split_huge_page_refcount side enforces write memory barriers
- * between clearing PageTail and before the head page
- * can be freed and reallocated.
- */
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * __split_huge_page_refcount cannot race
- * here, see the comment above this function.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- if (put_page_testzero(page_head)) {
- /*
- * If this is the tail of a slab THP page,
- * the tail pin must not be the last reference
- * held on the page, because the PG_slab cannot
- * be cleared before all tail pins (which skips
- * the _mapcount tail refcounting) have been
- * released.
- *
- * If this is the tail of a hugetlbfs page,
- * the tail pin may be the last reference on
- * the page instead, because PageHeadHuge will
- * not go away until the compound page enters
- * the buddy allocator.
- */
- VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
- __put_compound_page(page_head);
- }
- } else
- /*
- * __split_huge_page_refcount run before us,
- * @page was a THP tail. The split @page_head
- * has been freed and reallocated as slab or
- * hugetlbfs page of smaller order (only
- * possible if reallocated as slab on x86).
- */
- if (put_page_testzero(page))
- __put_single_page(page);
-}
-
-static __always_inline
-void put_refcounted_compound_page(struct page *page_head, struct page *page)
-{
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- unsigned long flags;
-
- /*
- * @page_head wasn't a dangling pointer but it may not
- * be a head page anymore by the time we obtain the
- * lock. That is ok as long as it can't be freed from
- * under us.
- */
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
- compound_unlock_irqrestore(page_head, flags);
- if (put_page_testzero(page_head)) {
- /*
- * The @page_head may have been freed
- * and reallocated as a compound page
- * of smaller order and then freed
- * again. All we know is that it
- * cannot have become: a THP page, a
- * compound page of higher order, a
- * tail page. That is because we
- * still hold the refcount of the
- * split THP tail and page_head was
- * the THP head before the split.
- */
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
-out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
- }
- VM_BUG_ON_PAGE(page_head != page->first_page, page);
- /*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on the
- * compound_lock.
- */
- if (put_page_testzero(page_head))
- VM_BUG_ON_PAGE(1, page_head);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
- VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
- compound_unlock_irqrestore(page_head, flags);
-
- if (put_page_testzero(page_head)) {
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
- } else {
- /* @page_head is a dangling pointer */
- VM_BUG_ON_PAGE(PageTail(page), page);
- goto out_put_single;
- }
-}
-
-static void put_compound_page(struct page *page)
-{
- struct page *page_head;
-
- /*
- * We see the PageCompound set and PageTail not set, so @page maybe:
- * 1. hugetlbfs head page, or
- * 2. THP head page.
- */
- if (likely(!PageTail(page))) {
- if (put_page_testzero(page)) {
- /*
- * By the time all refcounts have been released
- * split_huge_page cannot run anymore from under us.
- */
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
- }
- return;
- }
-
- /*
- * We see the PageCompound set and PageTail set, so @page maybe:
- * 1. a tail hugetlbfs page, or
- * 2. a tail THP page, or
- * 3. a split THP page.
- *
- * Case 3 is possible, as we may race with
- * __split_huge_page_refcount tearing down a THP page.
- */
- page_head = compound_head_by_tail(page);
- if (!__compound_tail_refcounted(page_head))
- put_unrefcounted_compound_page(page_head, page);
- else
- put_refcounted_compound_page(page_head, page);
-}
-
-void put_page(struct page *page)
+void __put_page(struct page *page)
{
if (unlikely(PageCompound(page)))
- put_compound_page(page);
- else if (put_page_testzero(page))
+ __put_compound_page(page);
+ else
__put_single_page(page);
}
-EXPORT_SYMBOL(put_page);
-
-/*
- * This function is exported but must not be called by anything other
- * than get_page(). It implements the slow path of get_page().
- */
-bool __get_page_tail(struct page *page)
-{
- /*
- * This takes care of get_page() if run on a tail page
- * returned by one of the get_user_pages/follow_page variants.
- * get_user_pages/follow_page itself doesn't need the compound
- * lock because it runs __get_page_tail_foll() under the
- * proper PT lock that already serializes against
- * split_huge_page().
- */
- unsigned long flags;
- bool got;
- struct page *page_head = compound_head(page);
-
- /* Ref to put_compound_page() comment. */
- if (!__compound_tail_refcounted(page_head)) {
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * This is a hugetlbfs page or a slab
- * page. __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- __get_page_tail_foll(page, true);
- return true;
- } else {
- /*
- * __split_huge_page_refcount run
- * before us, "page" was a THP
- * tail. The split page_head has been
- * freed and reallocated as slab or
- * hugetlbfs page of smaller order
- * (only possible if reallocated as
- * slab on x86).
- */
- return false;
- }
- }
-
- got = false;
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- /*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
- */
- flags = compound_lock_irqsave(page_head);
- /* here __split_huge_page_refcount won't run anymore */
- if (likely(PageTail(page))) {
- __get_page_tail_foll(page, false);
- got = true;
- }
- compound_unlock_irqrestore(page_head, flags);
- if (unlikely(!got))
- put_page(page_head);
- }
- return got;
-}
-EXPORT_SYMBOL(__get_page_tail);
+EXPORT_SYMBOL(__put_page);
/**
* put_pages_list() - release a list of pages
*/
void mark_page_accessed(struct page *page)
{
+ page = compound_head(page);
if (!PageActive(page) && !PageUnevictable(page) &&
PageReferenced(page)) {
update_page_reclaim_stat(lruvec, file, 0);
}
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_event(PGDEACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
+
/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
activate_page_drain(cpu);
}
}
}
+/**
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
void lru_add_drain(void)
{
lru_add_drain_cpu(get_cpu());
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
schedule_work_on(cpu, work);
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
- if (unlikely(PageCompound(page))) {
- if (zone) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = NULL;
- }
- put_compound_page(page);
- continue;
- }
-
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
zone = NULL;
}
+ page = compound_head(page);
if (!put_page_testzero(page))
continue;
+ if (PageCompound(page)) {
+ if (zone) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone = NULL;
+ }
+ __put_compound_page(page);
+ continue;
+ }
+
if (PageLRU(page)) {
struct zone *pagezone = page_zone(page);
* deadlock in the swap out path.
*/
/*
- * Add it to the swap cache and mark it dirty
+ * Add it to the swap cache.
*/
err = add_to_swap_cache(page, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
- if (!err) { /* Success */
- SetPageDirty(page);
+ if (!err) {
return 1;
} else { /* -ENOMEM radix-tree allocation failure */
/*
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return 0;
+ /* The page is part of THP and cannot be reused */
+ if (PageTransCompound(page))
+ return 0;
count = page_mapcount(page);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
if (unlikely(!page))
return -ENOMEM;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
ret = -ENOMEM;
goto out_nolock;
}
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
ret = 0;
goto out;
}
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
- page_add_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge(page, memcg, true);
+ page_add_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
swap_free(entry);
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
goto out_release;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
goto out_release_uncharge_unlock;
inc_mm_counter(dst_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, dst_vma, dst_addr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
return ret;
out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
out_release:
page_cache_release(page);
goto out;
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
}
EXPORT_SYMBOL(kstrndup);
+/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree, containing
+ * a duplicate of the passed string with leading and/or trailing
+ * whitespace (as defined by isspace) removed.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+ char *buf;
+ char *begin = skip_spaces(s);
+ size_t len = strlen(begin);
+
+ while (len && isspace(begin[len - 1]))
+ len--;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, begin, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
+
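A sketch of the intended use (hypothetical caller; the input resembles a
sysfs write with a trailing newline):

char *name = kstrimdup("  lzo\n", GFP_KERNEL);

if (name) {
	/* name is now "lzo" with no surrounding whitespace */
	kfree(name);
}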
/**
* kmemdup - duplicate region of memory
*
{
if (unlikely(offset + PAGE_ALIGN(len) < offset))
return -EINVAL;
- if (unlikely(offset & ~PAGE_MASK))
+ if (unlikely(offset_in_page(offset)))
return -EINVAL;
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
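For reference, offset_in_page() is a one-line helper in linux/mm.h that
names the intent of the old open-coded mask (sketch of the definition):

#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)

A non-zero result means the value is not page-aligned, which is exactly
what these BUG_ON and -EINVAL checks test.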
struct address_space *page_mapping(struct page *page)
{
- unsigned long mapping;
+ struct address_space *mapping;
+
+ page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
return swap_address_space(entry);
}
- mapping = (unsigned long)page->mapping;
- if (mapping & PAGE_MAPPING_FLAGS)
+ mapping = page->mapping;
+ if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
- return page->mapping;
+ return mapping;
}
int overcommit_ratio_handler(struct ctl_table *table, int write,
* Also handle the case where a kernel thread has adopted this mm via use_mm().
* That kernel thread's vmacache is not applicable to this mm.
*/
-static bool vmacache_valid_mm(struct mm_struct *mm)
+static inline bool vmacache_valid_mm(struct mm_struct *mm)
{
return current->mm == mm && !(current->flags & PF_KTHREAD);
}
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+#include "internal.h"
+
struct vfree_deferred {
struct llist_head list;
struct work_struct wq;
struct vmap_area *first;
BUG_ON(!size);
- BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
va = kmalloc_node(sizeof(struct vmap_area),
void *vaddr = NULL;
unsigned int order;
- BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
if (WARN_ON(size == 0)) {
/*
unsigned int order;
struct vmap_block *vb;
- BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
goto fail;
}
area->pages[i] = page;
- if (gfp_mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
while (count) {
unsigned long offset, length;
- offset = (unsigned long)addr & ~PAGE_MASK;
+ offset = offset_in_page(addr);
length = PAGE_SIZE - offset;
if (length > count)
length = count;
while (count) {
unsigned long offset, length;
- offset = (unsigned long)addr & ~PAGE_MASK;
+ offset = offset_in_page(addr);
length = PAGE_SIZE - offset;
if (length > count)
length = count;
bool purged = false;
/* verify parameters and allocate data structures */
- BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
+ BUG_ON(offset_in_page(align) || !is_power_of_2(align));
for (last_area = 0, area = 0; area < nr_vms; area++) {
start = offsets[area];
end = start + sizes[area];
* TODO: Make the window size depend on machine size, as we do for vmstat
- * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
+ * thresholds. Currently we set it to 64 pages (256KB for 4KB pages).
*/
-static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 2;
/*
* These thresholds are used when we account memory pressure through
static unsigned long zone_reclaimable_pages(struct zone *zone)
{
- int nr;
+ unsigned long nr;
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_FILE);
int referenced_ptes, referenced_page;
unsigned long vm_flags;
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
&vm_flags);
referenced_page = TestClearPageReferenced(page);
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
+ bool freeable = false;
cond_resched();
goto keep_locked;
if (!add_to_swap(page, page_list))
goto activate_locked;
+ freeable = true;
may_enter_fs = 1;
-
/* Adding to swap updated mapping */
mapping = page_mapping(page);
}
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page,
- ttu_flags|TTU_BATCH_FLUSH)) {
+ switch (try_to_unmap(page, freeable ?
+ ttu_flags | TTU_BATCH_FLUSH | TTU_FREE :
+ ttu_flags | TTU_BATCH_FLUSH)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ __ClearPageLocked(page);
free_it:
+ if (freeable && !PageDirty(page))
+ count_vm_event(PGLAZYFREED);
+
nr_reclaimed++;
/*
* won't get blocked by normal direct-reclaimers, forming a circular
* deadlock.
*/
- if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+ if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
inactive >>= 3;
return isolated > inactive;
}
#ifdef CONFIG_SWAP
-static int inactive_anon_is_low_global(struct zone *zone)
+static bool inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
active = zone_page_state(zone, NR_ACTIVE_ANON);
inactive = zone_page_state(zone, NR_INACTIVE_ANON);
- if (inactive * zone->inactive_ratio < active)
- return 1;
-
- return 0;
+ return inactive * zone->inactive_ratio < active;
}
/**
* Returns true if the zone does not have enough inactive anon pages,
* meaning some active anon pages need to be deactivated.
*/
-static int inactive_anon_is_low(struct lruvec *lruvec)
+static bool inactive_anon_is_low(struct lruvec *lruvec)
{
/*
* If we don't have swap space, anonymous page deactivation
* is pointless.
*/
if (!total_swap_pages)
- return 0;
+ return false;
if (!mem_cgroup_disabled())
return mem_cgroup_inactive_anon_is_low(lruvec);
return inactive_anon_is_low_global(lruvec_zone(lruvec));
}
#else
-static inline int inactive_anon_is_low(struct lruvec *lruvec)
+static inline bool inactive_anon_is_low(struct lruvec *lruvec)
{
- return 0;
+ return false;
}
#endif
* This uses a different ratio than the anonymous pages, because
* the page cache uses a use-once replacement algorithm.
*/
-static int inactive_file_is_low(struct lruvec *lruvec)
+static bool inactive_file_is_low(struct lruvec *lruvec)
{
unsigned long inactive;
unsigned long active;
return active > inactive;
}
-static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
+static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
{
if (is_file_lru(lru))
return inactive_file_is_low(lruvec);
balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
- watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0);
/*
* If compaction is deferred, reclaim up to a point where
unsigned long balance_gap, int classzone_idx)
{
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
- balance_gap, classzone_idx, 0))
+ balance_gap, classzone_idx))
return false;
if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
}
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long zone_pagecache_reclaimable(struct zone *zone)
{
- long nr_pagecache_reclaimable;
- long delta = 0;
+ unsigned long nr_pagecache_reclaimable;
+ unsigned long delta = 0;
/*
* If RECLAIM_UNMAP is set, then all file pages are considered
/*
* Do not scan if the allocation should not be delayed.
*/
- if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
+ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
return ZONE_RECLAIM_NOSCAN;
/*
else
__inc_zone_state(z, NUMA_OTHER);
}
+
+/*
+ * Determine the per node value of a stat item.
+ */
+unsigned long node_page_state(int node, enum zone_stat_item item)
+{
+ struct zone *zones = NODE_DATA(node)->node_zones;
+
+ return
+#ifdef CONFIG_ZONE_DMA
+ zone_page_state(&zones[ZONE_DMA], item) +
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ zone_page_state(&zones[ZONE_DMA32], item) +
+#endif
+#ifdef CONFIG_HIGHMEM
+ zone_page_state(&zones[ZONE_HIGHMEM], item) +
+#endif
+ zone_page_state(&zones[ZONE_NORMAL], item) +
+ zone_page_state(&zones[ZONE_MOVABLE], item);
+}
+
#endif
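A hypothetical caller summing the file-backed LRU pages on node 0 with
the new helper:

unsigned long file_lru = node_page_state(0, NR_ACTIVE_FILE) +
			 node_page_state(0, NR_INACTIVE_FILE);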
#ifdef CONFIG_COMPACTION
"pgfault",
"pgmajfault",
+ "pglazyfreed",
TEXTS_FOR_ZONES("pgrefill")
TEXTS_FOR_ZONES("pgsteal_kswapd")
"thp_fault_fallback",
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
- "thp_split",
+ "thp_split_page",
+ "thp_split_page_failed",
+ "thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
"Unmovable",
"Reclaimable",
"Movable",
- "Reserve",
+ "HighAtomic",
#ifdef CONFIG_CMA
"CMA",
#endif
.evict = zbud_zpool_evict
};
-static void *zbud_zpool_create(char *name, gfp_t gfp,
+static void *zbud_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
#include <linux/zpool.h>
struct zpool {
- char *type;
-
struct zpool_driver *driver;
void *pool;
const struct zpool_ops *ops;
}
EXPORT_SYMBOL(zpool_unregister_driver);
-static struct zpool_driver *zpool_get_driver(char *type)
+/* This assumes @type is null-terminated. */
+static struct zpool_driver *zpool_get_driver(const char *type)
{
struct zpool_driver *driver;
* not be loaded, and calling @zpool_create_pool() with the pool type will
* fail.
*
+ * The @type string must be null-terminated.
+ *
* Returns: true if @type pool is available, false if not
*/
bool zpool_has_pool(char *type)
*
* Implementations must guarantee this to be thread-safe.
*
+ * The @type and @name strings must be null-terminated.
+ *
* Returns: New zpool on success, NULL on failure.
*/
-struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
+struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
const struct zpool_ops *ops)
{
struct zpool_driver *driver;
return NULL;
}
- zpool->type = driver->type;
zpool->driver = driver;
zpool->pool = driver->create(name, gfp, ops, zpool);
zpool->ops = ops;
*/
void zpool_destroy_pool(struct zpool *zpool)
{
- pr_debug("destroying pool type %s\n", zpool->type);
+ pr_debug("destroying pool type %s\n", zpool->driver->type);
spin_lock(&pools_lock);
list_del(&zpool->list);
*
* Returns: The type of zpool.
*/
-char *zpool_get_type(struct zpool *zpool)
+const char *zpool_get_type(struct zpool *zpool)
{
- return zpool->type;
+ return zpool->driver->type;
}
/**
* struct page(s) to form a zspage.
*
* Usage of struct page fields:
- * page->first_page: points to the first component (0-order) page
+ * page->private: points to the first component (0-order) page
* page->index (union with page->freelist): offset of the first object
* starting in this page. For the first page, this is
* always 0, so we use this field (aka freelist) to point
*
* For _first_ page only:
*
- * page->private (union with page->first_page): refers to the
- * component page after the first page
+ * page->private: refers to the component page after the first page
* If the page is first_page for huge object, it stores handle.
* Look at size_class->huge.
* page->freelist: points to the first free object in zspage.
* page->lru: links together first pages of various zspages.
* Basically forming list of zspages in a fullness group.
* page->mapping: class index and fullness group of the zspage
+ * page->inuse: the number of objects that are used in this zspage
*
* Usage of struct page flags:
* PG_private: identifies the first component page
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
-#include <linux/hardirq.h>
+#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/debugfs.h>
OBJ_USED,
CLASS_ALMOST_FULL,
CLASS_ALMOST_EMPTY,
- NR_ZS_STAT_TYPE,
};
+#ifdef CONFIG_ZSMALLOC_STAT
+#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1)
+#else
+#define NR_ZS_STAT_TYPE (OBJ_USED + 1)
+#endif
+
struct zs_size_stat {
unsigned long objs[NR_ZS_STAT_TYPE];
};
};
struct zs_pool {
- char *name;
+ const char *name;
struct size_class **size_class;
struct kmem_cache *handle_cachep;
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(char *name, gfp_t gfp,
+static void *zs_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
static inline void zs_stat_inc(struct size_class *class,
enum zs_stat_type type, unsigned long cnt)
{
- class->stats.objs[type] += cnt;
+ if (type < NR_ZS_STAT_TYPE)
+ class->stats.objs[type] += cnt;
}
static inline void zs_stat_dec(struct size_class *class,
enum zs_stat_type type, unsigned long cnt)
{
- class->stats.objs[type] -= cnt;
+ if (type < NR_ZS_STAT_TYPE)
+ class->stats.objs[type] -= cnt;
}
static inline unsigned long zs_stat_get(struct size_class *class,
enum zs_stat_type type)
{
- return class->stats.objs[type];
+ if (type < NR_ZS_STAT_TYPE)
+ return class->stats.objs[type];
+ return 0;
}
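With CONFIG_ZSMALLOC_STAT=n, NR_ZS_STAT_TYPE is OBJ_USED + 1, so objs[]
only has room for the allocation counters; a call such as
zs_stat_inc(class, CLASS_ALMOST_FULL, 1) then fails the
type < NR_ZS_STAT_TYPE check and becomes a no-op instead of writing past
the end of the array.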
#ifdef CONFIG_ZSMALLOC_STAT
.release = single_release,
};
-static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
{
struct dentry *entry;
{
}
-static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool)
{
return 0;
}
if (is_first_page(page))
return page;
else
- return page->first_page;
+ return (struct page *)page_private(page);
}
static struct page *get_next_page(struct page *page)
{
if (class->huge) {
VM_BUG_ON(!is_first_page(page));
- return *(unsigned long *)page_private(page);
+ return page_private(page);
} else
return *(unsigned long *)obj;
}
* Allocate individual pages and link them together as:
* 1. first page->private = first sub-page
* 2. all sub-pages are linked together using page->lru
- * 3. each sub-page is linked to the first page using page->first_page
+ * 3. each sub-page is linked to the first page using page->private
*
* For each size class, First/Head pages are linked together using
* page->lru. Also, we set PG_private to identify the first page
if (i == 1)
set_page_private(first_page, (unsigned long)page);
if (i >= 1)
- page->first_page = first_page;
+ set_page_private(page, (unsigned long)first_page);
if (i >= 2)
list_add(&page->lru, &prev_page->lru);
if (i == class->pages_per_zspage - 1) /* last page */
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
void *vaddr;
- int class_idx;
- enum fullness_group fullness;
BUG_ON(!obj);
obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
- get_zspage_mapping(first_page, &class_idx, &fullness);
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
vaddr = kmap_atomic(f_page);
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
shrinker);
- if (!pool->shrinker_enabled)
- return 0;
-
for (i = zs_size_classes - 1; i >= 0; i--) {
class = pool->size_class[i];
if (!class)
* On success, a pointer to the newly created pool is returned,
* otherwise NULL.
*/
-struct zs_pool *zs_create_pool(char *name, gfp_t flags)
+struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
{
int i;
struct zs_pool *pool;
/* Crypto compressor to use */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
-static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT;
-static struct kparam_string zswap_compressor_kparam = {
- .string = zswap_compressor,
- .maxlen = sizeof(zswap_compressor),
-};
+static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
const struct kernel_param *);
static struct kernel_param_ops zswap_compressor_param_ops = {
.set = zswap_compressor_param_set,
- .get = param_get_string,
+ .get = param_get_charp,
+ .free = param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
- &zswap_compressor_kparam, 0644);
+ &zswap_compressor, 0644);
/* Compressed storage zpool to use */
#define ZSWAP_ZPOOL_DEFAULT "zbud"
-static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT;
-static struct kparam_string zswap_zpool_kparam = {
- .string = zswap_zpool_type,
- .maxlen = sizeof(zswap_zpool_type),
-};
+static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static struct kernel_param_ops zswap_zpool_param_ops = {
- .set = zswap_zpool_param_set,
- .get = param_get_string,
+ .set = zswap_zpool_param_set,
+ .get = param_get_charp,
+ .free = param_free_charp,
};
-module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644);
+module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
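The charp conversion above follows the standard moduleparam pattern; a
minimal sketch with a hypothetical parameter named "example":

static char *example_param = "default";

static struct kernel_param_ops example_param_ops = {
	.set	= param_set_charp,	/* zswap substitutes its validators */
	.get	= param_get_charp,
	.free	= param_free_charp,	/* frees the kmalloc'd copy on unload */
};
module_param_cb(example, &example_param_ops, &example_param, 0644);

param_set_charp() duplicates the incoming string, which is why the
matching .free hook matters once the parameters are no longer fixed-size
buffers.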
/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
pgoff_t offset)
{
- struct zswap_entry *entry = NULL;
+ struct zswap_entry *entry;
entry = zswap_rb_search(root, offset);
if (entry)
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
struct zswap_pool *pool;
- gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool) {
return NULL;
}
-static struct zswap_pool *__zswap_pool_create_fallback(void)
+static __init struct zswap_pool *__zswap_pool_create_fallback(void)
{
if (!crypto_has_comp(zswap_compressor, 0, 0)) {
+ if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
+ pr_err("default compressor %s not available\n",
+ zswap_compressor);
+ return NULL;
+ }
pr_err("compressor %s not available, using default %s\n",
zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
- strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT,
- sizeof(zswap_compressor));
+ param_free_charp(&zswap_compressor);
+ zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
}
if (!zpool_has_pool(zswap_zpool_type)) {
+ if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
+ pr_err("default zpool %s not available\n",
+ zswap_zpool_type);
+ return NULL;
+ }
pr_err("zpool %s not available, using default %s\n",
zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
- strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT,
- sizeof(zswap_zpool_type));
+ param_free_charp(&zswap_zpool_type);
+ zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
}
return zswap_pool_create(zswap_zpool_type, zswap_compressor);
* param callbacks
**********************************/
+/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
char *type, char *compressor)
{
struct zswap_pool *pool, *put_pool = NULL;
- char str[kp->str->maxlen], *s;
+ char *s = strstrip((char *)val);
int ret;
- /*
- * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined
- * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or
- * 32 (arbitrary).
- */
- strlcpy(str, val, kp->str->maxlen);
- s = strim(str);
+ /* no change required */
+ if (!strcmp(s, *(char **)kp->arg))
+ return 0;
/* if this is load-time (pre-init) param setting,
* don't create a pool; that's done during init.
*/
if (!zswap_init_started)
- return param_set_copystring(s, kp);
-
- /* no change required */
- if (!strncmp(kp->str->string, s, kp->str->maxlen))
- return 0;
+ return param_set_charp(s, kp);
if (!type) {
- type = s;
- if (!zpool_has_pool(type)) {
- pr_err("zpool %s not available\n", type);
+ if (!zpool_has_pool(s)) {
+ pr_err("zpool %s not available\n", s);
return -ENOENT;
}
+ type = s;
} else if (!compressor) {
- compressor = s;
- if (!crypto_has_comp(compressor, 0, 0)) {
- pr_err("compressor %s not available\n", compressor);
+ if (!crypto_has_comp(s, 0, 0)) {
+ pr_err("compressor %s not available\n", s);
return -ENOENT;
}
+ compressor = s;
+ } else {
+ WARN_ON(1);
+ return -EINVAL;
}
spin_lock(&zswap_pools_lock);
}
if (pool)
- ret = param_set_copystring(s, kp);
+ ret = param_set_charp(s, kp);
else
ret = -EINVAL;
/* store */
len = dlen + sizeof(struct zswap_header);
ret = zpool_malloc(entry->pool->zpool, len,
- __GFP_NORETRY | __GFP_NOWARN, &handle);
+ __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
+ &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
len += NET_SKB_PAD;
if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
- (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
goto skb_fail;
len += NET_SKB_PAD + NET_IP_ALIGN;
if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
- (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
goto skb_fail;
return NULL;
gfp_head = gfp_mask;
- if (gfp_head & __GFP_WAIT)
+ if (gfp_head & __GFP_DIRECT_RECLAIM)
gfp_head |= __GFP_REPEAT;
*errcode = -ENOBUFS;
while (order) {
if (npages >= 1 << order) {
- page = alloc_pages((gfp_mask & ~__GFP_WAIT) |
+ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
__GFP_COMP |
__GFP_NOWARN |
__GFP_NORETRY,
pfrag->offset = 0;
if (SKB_FRAG_PAGE_ORDER) {
- pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
- __GFP_NOWARN | __GFP_NORETRY,
+ /* Avoid direct reclaim but allow kswapd to wake */
+ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP | __GFP_NOWARN |
+ __GFP_NORETRY,
SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
consume_skb(info.skb2);
if (info.delivered) {
- if (info.congested && (allocation & __GFP_WAIT))
+ if (info.congested && gfpflags_allow_blocking(allocation))
yield();
return 0;
}
const struct sk_buff *skb)
{
struct flow_stats *stats;
- int node = numa_node_id();
+ int node = numa_mem_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
stats = rcu_dereference(flow->stats[node]);
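
numa_mem_id() only differs from numa_node_id() on configurations with
memoryless nodes: it returns the nearest node that actually has memory,
so the per-node stats lookup indexes a node that can really hold the
statistics instead of a node with no memory at all.
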
gfp_t slab_mask = GFP_NOWAIT;
gfp_t page_mask = GFP_NOWAIT;
- if (gfp & __GFP_WAIT) {
+ if (gfp & __GFP_DIRECT_RECLAIM) {
slab_mask = GFP_KERNEL;
page_mask = GFP_HIGHUSER;
}
struct ib_recv_wr *failed_wr;
unsigned int posted = 0;
int ret = 0;
- bool can_wait = !!(gfp & __GFP_WAIT);
+ bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
u32 pos;
/* the goal here is to just make sure that someone, somewhere
if (bundle->num_conns >= 20) {
_debug("too many conns");
- if (!(gfp & __GFP_WAIT)) {
+ if (!gfpflags_allow_blocking(gfp)) {
_leave(" = -EAGAIN");
return -EAGAIN;
}
/* Set an association id for a given association */
int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp)
{
- bool preload = !!(gfp & __GFP_WAIT);
+ bool preload = gfpflags_allow_blocking(gfp);
int ret;
/* If the id is already assigned, keep it. */
"Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr);
}
+# Check if the commit log is in a possible stack dump
+ if ($in_commit_log && !$commit_log_possible_stack_dump &&
+ ($line =~ /^\s*(?:WARNING:|BUG:)/ ||
+ $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
+ # timestamp
+ $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
+ # stack dump address
+ $commit_log_possible_stack_dump = 1;
+ }
+
# Check for line lengths > 75 in commit log, warn once
if ($in_commit_log && !$commit_log_long_line &&
- length($line) > 75 &&
- !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ ||
- # file delta changes
- $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ ||
- # filename then :
- $line =~ /^\s*(?:Fixes:|Link:)/i ||
- # A Fixes: or Link: line
- $commit_log_possible_stack_dump)) {
+ length($line) > 75 &&
+ !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ ||
+ # file delta changes
+ $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ ||
+ # filename then :
+ $line =~ /^\s*(?:Fixes:|Link:)/i ||
+ # A Fixes: or Link: line
+ $commit_log_possible_stack_dump)) {
WARN("COMMIT_LOG_LONG_LINE",
"Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr);
$commit_log_long_line = 1;
}
-# Check if the commit log is in a possible stack dump
- if ($in_commit_log && !$commit_log_possible_stack_dump &&
- ($line =~ /^\s*(?:WARNING:|BUG:)/ ||
- $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
- # timestamp
- $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
- # stack dump address
- $commit_log_possible_stack_dump = 1;
- }
-
# Reset possible stack dump if a blank line is found
- if ($in_commit_log && $commit_log_possible_stack_dump &&
- $line =~ /^\s*$/) {
- $commit_log_possible_stack_dump = 0;
- }
+ if ($in_commit_log && $commit_log_possible_stack_dump &&
+ $line =~ /^\s*$/) {
+ $commit_log_possible_stack_dump = 0;
+ }
# Check for git id commit length and improperly formed commit descriptions
- if ($in_commit_log &&
+ if ($in_commit_log && !$commit_log_possible_stack_dump &&
($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
- ($line =~ /\b[0-9a-f]{12,40}\b/i &&
- $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) {
+ ($line =~ /\b[0-9a-f]{12,40}\b/i &&
+ $line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i &&
+ $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) {
my $init_char = "c";
my $orig_commit = "";
my $short = 1;
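
The net effect of the checkpatch changes: stack dump detection now runs
before the long-line check, so commit logs that quote an oops (lines such
as a "[ 1234.567890]" timestamp or a "[<ffffffff81234567>]" frame
address) no longer trip the 75-column warning, and the commit-ID check
skips dump lines so that a long hex address is not misreported as an
improperly formed commit description.
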
{
void *ptr;
int order = ima_maxorder;
- gfp_t gfp_mask = __GFP_WAIT | __GFP_NOWARN | __GFP_NORETRY;
+ gfp_t gfp_mask = __GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY;
if (order)
order = min(get_order(max_size), order);
TARGETS += ftrace
TARGETS += futex
TARGETS += kcmp
+TARGETS += lib
TARGETS += membarrier
TARGETS += memfd
TARGETS += memory-hotplug
--- /dev/null
+# Makefile for lib/ function selftests
+
+# No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
+all:
+
+TEST_PROGS := printf.sh
+
+include ../lib.mk
--- /dev/null
+#!/bin/sh
+# Runs printf infrastructure using test_printf kernel module
+
+if /sbin/modprobe -q test_printf; then
+ /sbin/modprobe -q -r test_printf
+ echo "printf: ok"
+else
+ echo "printf: [FAIL]"
+ exit 1
+fi
BINARIES += hugepage-mmap
BINARIES += hugepage-shm
BINARIES += map_hugetlb
+BINARIES += mlock2-tests
+BINARIES += on-fault-limit
BINARIES += thuge-gen
BINARIES += transhuge-stress
BINARIES += userfaultfd
--- /dev/null
+#include <sys/mman.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <syscall.h>
+#include <errno.h>
+#include <stdbool.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
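+/*
+ * mlock2() had no glibc wrapper when these tests were written, so it is
+ * invoked through syscall(2).  On kernels or headers without the syscall
+ * number this returns -1 with errno set to ENOSYS, which the callers
+ * below treat as "skip the test".
+ */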
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+ return syscall(__NR_mlock2, start, len, flags);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
+struct vm_boundaries {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{
+ FILE *file;
+ int ret = 1;
+ char line[1024] = {0};
+ char *end_addr;
+ char *stop;
+ unsigned long start;
+ unsigned long end;
+
+ if (!area)
+ return ret;
+
+ file = fopen("/proc/self/maps", "r");
+ if (!file) {
+ perror("fopen");
+ return ret;
+ }
+
+ memset(area, 0, sizeof(struct vm_boundaries));
+
+	while (fgets(line, 1024, file)) {
+ end_addr = strchr(line, '-');
+ if (!end_addr) {
+ printf("cannot parse /proc/self/maps\n");
+ goto out;
+ }
+ *end_addr = '\0';
+ end_addr++;
+ stop = strchr(end_addr, ' ');
+ if (!stop) {
+ printf("cannot parse /proc/self/maps\n");
+ goto out;
+ }
+		*stop = '\0';
+
+ sscanf(line, "%lx", &start);
+ sscanf(end_addr, "%lx", &end);
+
+ if (start <= addr && end > addr) {
+ area->start = start;
+ area->end = end;
+ ret = 0;
+ goto out;
+ }
+ }
+out:
+ fclose(file);
+ return ret;
+}
+
+static uint64_t get_pageflags(unsigned long addr)
+{
+ FILE *file;
+ uint64_t pfn;
+ unsigned long offset;
+
+ file = fopen("/proc/self/pagemap", "r");
+ if (!file) {
+ perror("fopen pagemap");
+ _exit(1);
+ }
+
+ offset = addr / getpagesize() * sizeof(pfn);
+
+ if (fseek(file, offset, SEEK_SET)) {
+ perror("fseek pagemap");
+ _exit(1);
+ }
+
+ if (fread(&pfn, sizeof(pfn), 1, file) != 1) {
+ perror("fread pagemap");
+ _exit(1);
+ }
+
+ fclose(file);
+ return pfn;
+}
+
+static uint64_t get_kpageflags(unsigned long pfn)
+{
+ uint64_t flags;
+ FILE *file;
+
+ file = fopen("/proc/kpageflags", "r");
+ if (!file) {
+ perror("fopen kpageflags");
+ _exit(1);
+ }
+
+ if (fseek(file, pfn * sizeof(flags), SEEK_SET)) {
+ perror("fseek kpageflags");
+ _exit(1);
+ }
+
+ if (fread(&flags, sizeof(flags), 1, file) != 1) {
+ perror("fread kpageflags");
+ _exit(1);
+ }
+
+ fclose(file);
+ return flags;
+}
+
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+ FILE *file;
+ char *line = NULL;
+ size_t size = 0;
+ unsigned long start, end;
+ char perms[5];
+ unsigned long offset;
+ char dev[32];
+ unsigned long inode;
+ char path[BUFSIZ];
+
+ file = fopen("/proc/self/smaps", "r");
+ if (!file) {
+ perror("fopen smaps");
+ _exit(1);
+ }
+
+ while (getline(&line, &size, file) > 0) {
+ if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+ &start, &end, perms, &offset, dev, &inode, path) < 6)
+ goto next;
+
+ if (start <= addr && addr < end)
+ goto out;
+
+next:
+ free(line);
+ line = NULL;
+ size = 0;
+ }
+
+ fclose(file);
+ file = NULL;
+
+out:
+ free(line);
+ return file;
+}
+
+#define VMFLAGS "VmFlags:"
+
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{
+ char *line = NULL;
+ char *flags;
+ size_t size = 0;
+ bool ret = false;
+ FILE *smaps;
+
+ smaps = seek_to_smaps_entry(addr);
+ if (!smaps) {
+ printf("Unable to parse /proc/self/smaps\n");
+ goto out;
+ }
+
+ while (getline(&line, &size, smaps) > 0) {
+ if (!strstr(line, VMFLAGS)) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ flags = line + strlen(VMFLAGS);
+ ret = (strstr(flags, vmflag) != NULL);
+ goto out;
+ }
+
+out:
+ free(line);
+ fclose(smaps);
+ return ret;
+}
+
+#define SIZE "Size:"
+#define RSS "Rss:"
+#define LOCKED "lo"
+
+static bool is_vma_lock_on_fault(unsigned long addr)
+{
+ bool ret = false;
+ bool locked;
+ FILE *smaps = NULL;
+ unsigned long vma_size, vma_rss;
+ char *line = NULL;
+ char *value;
+ size_t size = 0;
+
+ locked = is_vmflag_set(addr, LOCKED);
+ if (!locked)
+ goto out;
+
+ smaps = seek_to_smaps_entry(addr);
+ if (!smaps) {
+ printf("Unable to parse /proc/self/smaps\n");
+ goto out;
+ }
+
+ while (getline(&line, &size, smaps) > 0) {
+ if (!strstr(line, SIZE)) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ value = line + strlen(SIZE);
+ if (sscanf(value, "%lu kB", &vma_size) < 1) {
+ printf("Unable to parse smaps entry for Size\n");
+ goto out;
+ }
+ break;
+ }
+
+ while (getline(&line, &size, smaps) > 0) {
+ if (!strstr(line, RSS)) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ value = line + strlen(RSS);
+ if (sscanf(value, "%lu kB", &vma_rss) < 1) {
+ printf("Unable to parse smaps entry for Rss\n");
+ goto out;
+ }
+ break;
+ }
+
+ ret = locked && (vma_rss < vma_size);
+out:
+ free(line);
+ if (smaps)
+ fclose(smaps);
+ return ret;
+}
+
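+/*
+ * /proc/self/pagemap entry layout: bit 63 is the page-present flag and
+ * bits 0-54 hold the PFN (see Documentation/vm/pagemap.txt).  The PFN
+ * then indexes /proc/kpageflags, where bit 18 (KPF_UNEVICTABLE) marks
+ * pages on the unevictable LRU, i.e. mlocked pages.
+ */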
+#define PRESENT_BIT 0x8000000000000000
+#define PFN_MASK 0x007FFFFFFFFFFFFF
+#define UNEVICTABLE_BIT (1UL << 18)
+
+static int lock_check(char *map)
+{
+ unsigned long page_size = getpagesize();
+ uint64_t page1_flags, page2_flags;
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+
+ /* Both pages should be present */
+ if (((page1_flags & PRESENT_BIT) == 0) ||
+ ((page2_flags & PRESENT_BIT) == 0)) {
+ printf("Failed to make both pages present\n");
+ return 1;
+ }
+
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+ page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+ /* Both pages should be unevictable */
+ if (((page1_flags & UNEVICTABLE_BIT) == 0) ||
+ ((page2_flags & UNEVICTABLE_BIT) == 0)) {
+ printf("Failed to make both pages unevictable\n");
+ return 1;
+ }
+
+ if (!is_vmflag_set((unsigned long)map, LOCKED)) {
+ printf("VMA flag %s is missing on page 1\n", LOCKED);
+ return 1;
+ }
+
+ if (!is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
+ printf("VMA flag %s is missing on page 2\n", LOCKED);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int unlock_lock_check(char *map)
+{
+ unsigned long page_size = getpagesize();
+ uint64_t page1_flags, page2_flags;
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+ page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+ if ((page1_flags & UNEVICTABLE_BIT) || (page2_flags & UNEVICTABLE_BIT)) {
+ printf("A page is still marked unevictable after unlock\n");
+ return 1;
+ }
+
+ if (is_vmflag_set((unsigned long)map, LOCKED)) {
+ printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
+ return 1;
+ }
+
+ if (is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
+ printf("VMA flag %s is present on page 2 after unlock\n", LOCKED);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int test_mlock_lock()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (map == MAP_FAILED) {
+		perror("test_mlock_lock mmap");
+ goto out;
+ }
+
+ if (mlock2_(map, 2 * page_size, 0)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+ perror("mlock2(0)");
+ goto unmap;
+ }
+
+ if (lock_check(map))
+ goto unmap;
+
+ /* Now unlock and recheck attributes */
+ if (munlock(map, 2 * page_size)) {
+ perror("munlock()");
+ goto unmap;
+ }
+
+ ret = unlock_lock_check(map);
+
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+static int onfault_check(char *map)
+{
+ unsigned long page_size = getpagesize();
+ uint64_t page1_flags, page2_flags;
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+
+ /* Neither page should be present */
+ if ((page1_flags & PRESENT_BIT) || (page2_flags & PRESENT_BIT)) {
+ printf("Pages were made present by MLOCK_ONFAULT\n");
+ return 1;
+ }
+
+ *map = 'a';
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+
+ /* Only page 1 should be present */
+ if ((page1_flags & PRESENT_BIT) == 0) {
+ printf("Page 1 is not present after fault\n");
+ return 1;
+ } else if (page2_flags & PRESENT_BIT) {
+ printf("Page 2 was made present\n");
+ return 1;
+ }
+
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+
+ /* Page 1 should be unevictable */
+ if ((page1_flags & UNEVICTABLE_BIT) == 0) {
+ printf("Failed to make faulted page unevictable\n");
+ return 1;
+ }
+
+ if (!is_vma_lock_on_fault((unsigned long)map)) {
+ printf("VMA is not marked for lock on fault\n");
+ return 1;
+ }
+
+ if (!is_vma_lock_on_fault((unsigned long)map + page_size)) {
+ printf("VMA is not marked for lock on fault\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+static int unlock_onfault_check(char *map)
+{
+ unsigned long page_size = getpagesize();
+ uint64_t page1_flags;
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+
+ if (page1_flags & UNEVICTABLE_BIT) {
+ printf("Page 1 is still marked unevictable after unlock\n");
+ return 1;
+ }
+
+ if (is_vma_lock_on_fault((unsigned long)map) ||
+ is_vma_lock_on_fault((unsigned long)map + page_size)) {
+ printf("VMA is still lock on fault after unlock\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+static int test_mlock_onfault()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (map == MAP_FAILED) {
+		perror("test_mlock_onfault mmap");
+ goto out;
+ }
+
+ if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+ perror("mlock2(MLOCK_ONFAULT)");
+ goto unmap;
+ }
+
+ if (onfault_check(map))
+ goto unmap;
+
+ /* Now unlock and recheck attributes */
+ if (munlock(map, 2 * page_size)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+ perror("munlock()");
+ goto unmap;
+ }
+
+ ret = unlock_onfault_check(map);
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+static int test_lock_onfault_of_present()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+ uint64_t page1_flags, page2_flags;
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (map == MAP_FAILED) {
+		perror("test_lock_onfault_of_present mmap");
+ goto out;
+ }
+
+ *map = 'a';
+
+ if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+ perror("mlock2(MLOCK_ONFAULT)");
+ goto unmap;
+ }
+
+ page1_flags = get_pageflags((unsigned long)map);
+ page2_flags = get_pageflags((unsigned long)map + page_size);
+ page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+ page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+ /* Page 1 should be unevictable */
+ if ((page1_flags & UNEVICTABLE_BIT) == 0) {
+ printf("Failed to make present page unevictable\n");
+ goto unmap;
+ }
+
+ if (!is_vma_lock_on_fault((unsigned long)map) ||
+ !is_vma_lock_on_fault((unsigned long)map + page_size)) {
+ printf("VMA with present pages is not marked lock on fault\n");
+ goto unmap;
+ }
+ ret = 0;
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+static int test_munlockall()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+
+ if (map == MAP_FAILED) {
+ perror("test_munlockall mmap");
+ goto out;
+ }
+
+ if (mlockall(MCL_CURRENT)) {
+ perror("mlockall(MCL_CURRENT)");
+ goto out;
+ }
+
+ if (lock_check(map))
+ goto unmap;
+
+ if (munlockall()) {
+ perror("munlockall()");
+ goto unmap;
+ }
+
+ if (unlock_lock_check(map))
+ goto unmap;
+
+ munmap(map, 2 * page_size);
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+
+ if (map == MAP_FAILED) {
+ perror("test_munlockall second mmap");
+ goto out;
+ }
+
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
+ perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
+ goto unmap;
+ }
+
+ if (onfault_check(map))
+ goto unmap;
+
+ if (munlockall()) {
+ perror("munlockall()");
+ goto unmap;
+ }
+
+ if (unlock_onfault_check(map))
+ goto unmap;
+
+ if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+ perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
+ goto out;
+ }
+
+ if (lock_check(map))
+ goto unmap;
+
+ if (munlockall()) {
+ perror("munlockall()");
+ goto unmap;
+ }
+
+ ret = unlock_lock_check(map);
+
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ munlockall();
+ return ret;
+}
+
+static int test_vma_management(bool call_mlock)
+{
+ int ret = 1;
+ void *map;
+ unsigned long page_size = getpagesize();
+ struct vm_boundaries page1;
+ struct vm_boundaries page2;
+ struct vm_boundaries page3;
+
+ map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (map == MAP_FAILED) {
+ perror("mmap()");
+ return ret;
+ }
+
+ if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(0);
+ }
+		perror("mlock2(MLOCK_ONFAULT)");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /*
+	 * Before we unlock a portion, we need to know that all three pages
+	 * are in the same VMA.  If they are not we abort this test (note
+	 * that this is not a failure).
+ */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ printf("VMAs are not merged to start, aborting test\n");
+ ret = 0;
+ goto out;
+ }
+
+ if (munlock(map + page_size, page_size)) {
+ perror("munlock()");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /* All three VMAs should be different */
+ if (page1.start == page2.start || page2.start == page3.start) {
+ printf("failed to split VMA for munlock\n");
+ goto out;
+ }
+
+ /* Now unlock the first and third page and check the VMAs again */
+ if (munlock(map, page_size * 3)) {
+ perror("munlock()");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /* Now all three VMAs should be the same */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ printf("failed to merge VMAs after munlock\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ munmap(map, 3 * page_size);
+ return ret;
+}
+
+static int test_mlockall(int (test_function)(bool call_mlock))
+{
+ int ret = 1;
+
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
+ perror("mlockall");
+ return ret;
+ }
+
+ ret = test_function(false);
+ munlockall();
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ int ret = 0;
+ ret += test_mlock_lock();
+ ret += test_mlock_onfault();
+ ret += test_munlockall();
+ ret += test_lock_onfault_of_present();
+ ret += test_vma_management(true);
+ ret += test_mlockall(test_vma_management);
+ return ret;
+}
+
--- /dev/null
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
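+/*
+ * With mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE) in force, a
+ * MAP_POPULATE mapping of twice RLIMIT_MEMLOCK must fail for an
+ * unprivileged user (the run script executes this test as "nobody");
+ * if the mmap succeeded, lock-on-fault would be escaping the memlock
+ * limit.
+ */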
+static int test_limit(void)
+{
+ int ret = 1;
+ struct rlimit lims;
+ void *map;
+
+ if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
+ perror("getrlimit");
+ return ret;
+ }
+
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
+ perror("mlockall");
+ return ret;
+ }
+
+ map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, 0, 0);
+	if (map != MAP_FAILED) {
+		printf("mmap should have failed, but didn't\n");
+		munmap(map, 2 * lims.rlim_max);
+	} else
+		ret = 0;
+
+ munlockall();
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ int ret = 0;
+
+ ret += test_limit();
+ return ret;
+}
echo "[PASS]"
fi
+echo "--------------------"
+echo "running on-fault-limit"
+echo "--------------------"
+sudo -u nobody ./on-fault-limit
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running mlock2-tests"
+echo "--------------------"
+./mlock2-tests
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
exit $exitcode
--- /dev/null
+#!/bin/sh
+
+# Sergey Senozhatsky, 2015
+# sergey.senozhatsky.work@gmail.com
+#
+# This software is licensed under the terms of the GNU General Public
+# License version 2, as published by the Free Software Foundation, and
+# may be copied, distributed, and modified under those terms.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+
+# This program is intended to plot a `slabinfo -X' stats, collected,
+# for example, using the following command:
+# while [ 1 ]; do slabinfo -X >> stats; sleep 1; done
+#
+# Use `slabinfo-gnuplot.sh stats' to pre-process collected records
+# and generate graphs (totals, slabs sorted by size, slabs sorted
+# by loss).
+#
+# Graphs can be [individually] regenerated with different ranges and
+# sizes (-r %d,%d and -s %d,%d options).
+#
+# To visually compare N `totals' graphs, do
+# slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals
+#
+
+min_slab_name_size=11
+xmin=0
+xmax=0
+width=1500
+height=700
+mode=preprocess
+
+usage()
+{
+ echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]"
+ echo "FILEs must contain 'slabinfo -X' samples"
+ echo "-t - plot totals for FILE(s)"
+ echo "-l - plot slabs stats for FILE(s)"
+ echo "-s %d,%d - set image width and height"
+ echo "-r %d,%d - use data samples from a given range"
+}
+
+check_file_exist()
+{
+ if [ ! -f "$1" ]; then
+ echo "File '$1' does not exist"
+ exit 1
+ fi
+}
+
+do_slabs_plotting()
+{
+ local file=$1
+ local out_file
+ local range="every ::$xmin"
+ local xtic=""
+ local xtic_rotate="norotate"
+ local lines=2000000
+ local wc_lines
+
+ check_file_exist "$file"
+
+ out_file=`basename "$file"`
+ if [ $xmax -ne 0 ]; then
+ range="$range::$xmax"
+ lines=$((xmax-xmin))
+ fi
+
+ wc_lines=`cat "$file" | wc -l`
+ if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then
+ wc_lines=$lines
+ fi
+
+ if [ "$wc_lines" -lt "$lines" ]; then
+ lines=$wc_lines
+ fi
+
+ if [ $((width / lines)) -gt $min_slab_name_size ]; then
+ xtic=":xtic(1)"
+ xtic_rotate=90
+ fi
+
+gnuplot -p << EOF
+#!/usr/bin/env gnuplot
+
+set terminal png enhanced size $width,$height large
+set output '$out_file.png'
+set autoscale xy
+set xlabel 'samples'
+set ylabel 'bytes'
+set style histogram columnstacked title textcolor lt -1
+set style fill solid 0.15
+set xtics rotate $xtic_rotate
+set key left above Left title reverse
+
+plot "$file" $range u 2$xtic title 'SIZE' with boxes,\
+ '' $range u 3 title 'LOSS' with boxes
+EOF
+
+ if [ $? -eq 0 ]; then
+ echo "$out_file.png"
+ fi
+}
+
+do_totals_plotting()
+{
+ local gnuplot_cmd=""
+ local range="every ::$xmin"
+ local file=""
+
+ if [ $xmax -ne 0 ]; then
+ range="$range::$xmax"
+ fi
+
+ for i in "${t_files[@]}"; do
+ check_file_exist "$i"
+
+ file="$file"`basename "$i"`
+ gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\
+ '$i Memory usage' with lines,"
+ gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \
+ '$i Loss' with lines,"
+ done
+
+gnuplot -p << EOF
+#!/usr/bin/env gnuplot
+
+set terminal png enhanced size $width,$height large
+set autoscale xy
+set output '$file.png'
+set xlabel 'samples'
+set ylabel 'bytes'
+set key left above Left title reverse
+
+plot $gnuplot_cmd
+EOF
+
+ if [ $? -eq 0 ]; then
+ echo "$file.png"
+ fi
+}
+
+do_preprocess()
+{
+ local out
+ local lines
+ local in=$1
+
+ check_file_exist "$in"
+
+ # use only 'TOP' slab (biggest memory usage or loss)
+ let lines=3
+ out=`basename "$in"`"-slabs-by-loss"
+ `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\
+ egrep -iv '\-\-|Name|Slabs'\
+ | awk '{print $1" "$4+$2*$3" "$4}' > "$out"`
+ if [ $? -eq 0 ]; then
+ do_slabs_plotting "$out"
+ fi
+
+ let lines=3
+ out=`basename "$in"`"-slabs-by-size"
+ `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\
+ egrep -iv '\-\-|Name|Slabs'\
+ | awk '{print $1" "$4" "$4-$2*$3}' > "$out"`
+ if [ $? -eq 0 ]; then
+ do_slabs_plotting "$out"
+ fi
+
+ out=`basename "$in"`"-totals"
+ `cat "$in" | grep "Memory used" |\
+ awk '{print $3" "$7}' > "$out"`
+ if [ $? -eq 0 ]; then
+ t_files[0]=$out
+ do_totals_plotting
+ fi
+}
+
+parse_opts()
+{
+ local opt
+
+ while getopts "tlr::s::h" opt; do
+ case $opt in
+ t)
+ mode=totals
+ ;;
+ l)
+ mode=slabs
+ ;;
+ s)
+ array=(${OPTARG//,/ })
+ width=${array[0]}
+ height=${array[1]}
+ ;;
+ r)
+ array=(${OPTARG//,/ })
+ xmin=${array[0]}
+ xmax=${array[1]}
+ ;;
+ h)
+ usage
+ exit 0
+ ;;
+ \?)
+ echo "Invalid option: -$OPTARG" >&2
+ exit 1
+ ;;
+ :)
+ echo "-$OPTARG requires an argument." >&2
+ exit 1
+ ;;
+ esac
+ done
+
+ return $OPTIND
+}
+
+parse_args()
+{
+ local idx=0
+ local p
+
+ for p in "$@"; do
+ case $mode in
+ preprocess)
+ files[$idx]=$p
+ idx=$idx+1
+ ;;
+ totals)
+ t_files[$idx]=$p
+ idx=$idx+1
+ ;;
+ slabs)
+ files[$idx]=$p
+ idx=$idx+1
+ ;;
+ esac
+ done
+}
+
+parse_opts "$@"
+argstart=$?
+parse_args "${@:$argstart}"
+
+if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then
+ usage
+ exit 1
+fi
+
+case $mode in
+ preprocess)
+ for i in "${files[@]}"; do
+ do_preprocess "$i"
+ done
+ ;;
+ totals)
+ do_totals_plotting
+ ;;
+ slabs)
+ for i in "${files[@]}"; do
+ do_slabs_plotting "$i"
+ done
+ ;;
+ *)
+ echo "Unknown mode $mode" >&2
+ usage
+ exit 1
+ ;;
+esac
struct slabinfo *slab;
} aliasinfo[MAX_ALIASES];
-int slabs = 0;
-int actual_slabs = 0;
-int aliases = 0;
-int alias_targets = 0;
-int highest_node = 0;
+int slabs;
+int actual_slabs;
+int aliases;
+int alias_targets;
+int highest_node;
char buffer[4096];
-int show_empty = 0;
-int show_report = 0;
-int show_alias = 0;
-int show_slab = 0;
+int show_empty;
+int show_report;
+int show_alias;
+int show_slab;
int skip_zero = 1;
-int show_numa = 0;
-int show_track = 0;
-int show_first_alias = 0;
-int validate = 0;
-int shrink = 0;
-int show_inverted = 0;
-int show_single_ref = 0;
-int show_totals = 0;
-int sort_size = 0;
-int sort_active = 0;
-int set_debug = 0;
-int show_ops = 0;
-int show_activity = 0;
+int show_numa;
+int show_track;
+int show_first_alias;
+int validate;
+int shrink;
+int show_inverted;
+int show_single_ref;
+int show_totals;
+int sort_size;
+int sort_active;
+int set_debug;
+int show_ops;
+int show_activity;
+int output_lines = -1;
+int sort_loss;
+int extended_totals;
+int show_bytes;
/* Debug options */
-int sanity = 0;
-int redzone = 0;
-int poison = 0;
-int tracking = 0;
-int tracing = 0;
+int sanity;
+int redzone;
+int poison;
+int tracking;
+int tracing;
int page_size;
"-v|--validate Validate slabs\n"
"-z|--zero Include empty slabs\n"
"-1|--1ref Single reference\n"
+ "-N|--lines=K Show the first K slabs\n"
+ "-L|--Loss Sort by loss\n"
+ "-X|--Xtotals Show extended summary information\n"
+ "-B|--Bytes Show size in bytes\n"
"\nValid debug options (FZPUT may be combined)\n"
"a / A Switch on all debug options (=FZUP)\n"
"- Switch off all debug options\n"
char trailer = 0;
int n;
- if (value > 1000000000UL) {
- divisor = 100000000UL;
- trailer = 'G';
- } else if (value > 1000000UL) {
- divisor = 100000UL;
- trailer = 'M';
- } else if (value > 1000UL) {
- divisor = 100;
- trailer = 'K';
+ if (!show_bytes) {
+ if (value > 1000000000UL) {
+ divisor = 100000000UL;
+ trailer = 'G';
+ } else if (value > 1000000UL) {
+ divisor = 100000UL;
+ trailer = 'M';
+ } else if (value > 1000UL) {
+ divisor = 100;
+ trailer = 'K';
+ }
}
value /= divisor;
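
With -B in effect (and -X sets it implicitly), the scaling above is
skipped and every size is reported as an exact byte count; that is what
lets slabinfo-gnuplot.sh feed the columns of `slabinfo -X' output
straight into awk and gnuplot without having to parse a K/M/G suffix.
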
static void first_line(void)
{
if (show_activity)
- printf("Name Objects Alloc Free %%Fast Fallb O CmpX UL\n");
+ printf("Name Objects Alloc Free"
+ " %%Fast Fallb O CmpX UL\n");
else
- printf("Name Objects Objsize Space "
- "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
+ printf("Name Objects Objsize %s "
+ "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n",
+ sort_loss ? " Loss" : "Space");
}
/*
s->alloc_slowpath + s->free_slowpath;
}
+static unsigned long slab_waste(struct slabinfo *s)
+{
+ return slab_size(s) - s->objects * s->object_size;
+}
+
static void slab_numa(struct slabinfo *s, int mode)
{
int node;
if (strcmp(s->name, "*") == 0)
return;
- printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %lu\n",
+ printf("\nSlabcache: %-15s Aliases: %2d Order : %2d Objects: %lu\n",
s->name, s->aliases, s->order, s->objects);
if (s->hwcache_align)
printf("** Hardware cacheline aligned\n");
if (show_empty && s->slabs)
return;
- store_size(size_str, slab_size(s));
+ if (sort_loss == 0)
+ store_size(size_str, slab_size(s));
+ else
+ store_size(size_str, slab_waste(s));
snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
s->partial, s->cpu_slabs);
total_free ? (s->free_fastpath * 100 / total_free) : 0,
s->order_fallback, s->order, s->cmpxchg_double_fail,
s->cmpxchg_double_cpu_fail);
- }
- else
- printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
+ } else {
+ printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n",
s->name, s->objects, s->object_size, size_str, dist_str,
s->objs_per_slab, s->order,
s->slabs ? (s->partial * 100) / s->slabs : 100,
s->slabs ? (s->objects * s->object_size * 100) /
(s->slabs * (page_size << s->order)) : 100,
flags);
+ }
}
/*
printf("Slabcache Totals\n");
printf("----------------\n");
- printf("Slabcaches : %3d Aliases : %3d->%-3d Active: %3d\n",
+ printf("Slabcaches : %15d Aliases : %11d->%-3d Active: %3d\n",
slabs, aliases, alias_targets, used_slabs);
store_size(b1, total_size);store_size(b2, total_waste);
store_size(b3, total_waste * 100 / total_used);
- printf("Memory used: %6s # Loss : %6s MRatio:%6s%%\n", b1, b2, b3);
+ printf("Memory used: %15s # Loss : %15s MRatio:%6s%%\n", b1, b2, b3);
store_size(b1, total_objects);store_size(b2, total_partobj);
store_size(b3, total_partobj * 100 / total_objects);
- printf("# Objects : %6s # PartObj: %6s ORatio:%6s%%\n", b1, b2, b3);
+ printf("# Objects : %15s # PartObj: %15s ORatio:%6s%%\n", b1, b2, b3);
printf("\n");
- printf("Per Cache Average Min Max Total\n");
- printf("---------------------------------------------------------\n");
+ printf("Per Cache Average "
+ "Min Max Total\n");
+ printf("---------------------------------------"
+ "-------------------------------------\n");
store_size(b1, avg_objects);store_size(b2, min_objects);
store_size(b3, max_objects);store_size(b4, total_objects);
- printf("#Objects %10s %10s %10s %10s\n",
+ printf("#Objects %15s %15s %15s %15s\n",
b1, b2, b3, b4);
store_size(b1, avg_slabs);store_size(b2, min_slabs);
store_size(b3, max_slabs);store_size(b4, total_slabs);
- printf("#Slabs %10s %10s %10s %10s\n",
+ printf("#Slabs %15s %15s %15s %15s\n",
b1, b2, b3, b4);
store_size(b1, avg_partial);store_size(b2, min_partial);
store_size(b3, max_partial);store_size(b4, total_partial);
- printf("#PartSlab %10s %10s %10s %10s\n",
+ printf("#PartSlab %15s %15s %15s %15s\n",
b1, b2, b3, b4);
store_size(b1, avg_ppart);store_size(b2, min_ppart);
store_size(b3, max_ppart);
store_size(b4, total_partial * 100 / total_slabs);
- printf("%%PartSlab%10s%% %10s%% %10s%% %10s%%\n",
+ printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n",
b1, b2, b3, b4);
store_size(b1, avg_partobj);store_size(b2, min_partobj);
store_size(b3, max_partobj);
store_size(b4, total_partobj);
- printf("PartObjs %10s %10s %10s %10s\n",
+ printf("PartObjs %15s %15s %15s %15s\n",
b1, b2, b3, b4);
store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj);
store_size(b3, max_ppartobj);
store_size(b4, total_partobj * 100 / total_objects);
- printf("%% PartObj%10s%% %10s%% %10s%% %10s%%\n",
+ printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n",
b1, b2, b3, b4);
store_size(b1, avg_size);store_size(b2, min_size);
store_size(b3, max_size);store_size(b4, total_size);
- printf("Memory %10s %10s %10s %10s\n",
+ printf("Memory %15s %15s %15s %15s\n",
b1, b2, b3, b4);
store_size(b1, avg_used);store_size(b2, min_used);
store_size(b3, max_used);store_size(b4, total_used);
- printf("Used %10s %10s %10s %10s\n",
+ printf("Used %15s %15s %15s %15s\n",
b1, b2, b3, b4);
store_size(b1, avg_waste);store_size(b2, min_waste);
store_size(b3, max_waste);store_size(b4, total_waste);
- printf("Loss %10s %10s %10s %10s\n",
+ printf("Loss %15s %15s %15s %15s\n",
b1, b2, b3, b4);
printf("\n");
- printf("Per Object Average Min Max\n");
- printf("---------------------------------------------\n");
+ printf("Per Object Average "
+ "Min Max\n");
+ printf("---------------------------------------"
+ "--------------------\n");
store_size(b1, avg_memobj);store_size(b2, min_memobj);
store_size(b3, max_memobj);
- printf("Memory %10s %10s %10s\n",
+ printf("Memory %15s %15s %15s\n",
b1, b2, b3);
store_size(b1, avg_objsize);store_size(b2, min_objsize);
store_size(b3, max_objsize);
- printf("User %10s %10s %10s\n",
+ printf("User %15s %15s %15s\n",
b1, b2, b3);
store_size(b1, avg_objwaste);store_size(b2, min_objwaste);
store_size(b3, max_objwaste);
- printf("Loss %10s %10s %10s\n",
+ printf("Loss %15s %15s %15s\n",
b1, b2, b3);
}
result = slab_size(s1) < slab_size(s2);
else if (sort_active)
result = slab_activity(s1) < slab_activity(s2);
+ else if (sort_loss)
+ result = slab_waste(s1) < slab_waste(s2);
else
result = strcasecmp(s1->name, s2->name);
active = a->slab->name;
}
else
- printf("%-20s -> %s\n", a->name, a->slab->name);
+ printf("%-15s -> %s\n", a->name, a->slab->name);
}
if (active)
printf("\n");
static void output_slabs(void)
{
struct slabinfo *slab;
+ int lines = output_lines;
- for (slab = slabinfo; slab < slabinfo + slabs; slab++) {
+ for (slab = slabinfo; (slab < slabinfo + slabs) &&
+ lines != 0; slab++) {
if (slab->alias)
continue;
+ if (lines != -1)
+ lines--;
if (show_numa)
slab_numa(slab, 0);
}
}
+static void xtotals(void)
+{
+ totals();
+
+ link_slabs();
+ rename_slabs();
+
+ printf("\nSlabs sorted by size\n");
+ printf("--------------------\n");
+ sort_loss = 0;
+ sort_size = 1;
+ sort_slabs();
+ output_slabs();
+
+ printf("\nSlabs sorted by loss\n");
+ printf("--------------------\n");
+ line = 0;
+ sort_loss = 1;
+ sort_size = 0;
+ sort_slabs();
+ output_slabs();
+ printf("\n");
+}
+
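xtotals() is what makes `slabinfo -X' output machine-friendly: after the
usual totals it prints the two headed lists, "Slabs sorted by size" and
"Slabs sorted by loss" (limited by -N, which -X defaults to one line),
and those are exactly the section headers slabinfo-gnuplot.sh greps for
when pre-processing a stats file.
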
struct option opts[] = {
- { "aliases", 0, NULL, 'a' },
- { "activity", 0, NULL, 'A' },
- { "debug", 2, NULL, 'd' },
- { "display-activity", 0, NULL, 'D' },
- { "empty", 0, NULL, 'e' },
- { "first-alias", 0, NULL, 'f' },
- { "help", 0, NULL, 'h' },
- { "inverted", 0, NULL, 'i'},
- { "numa", 0, NULL, 'n' },
- { "ops", 0, NULL, 'o' },
- { "report", 0, NULL, 'r' },
- { "shrink", 0, NULL, 's' },
- { "slabs", 0, NULL, 'l' },
- { "track", 0, NULL, 't'},
- { "validate", 0, NULL, 'v' },
- { "zero", 0, NULL, 'z' },
- { "1ref", 0, NULL, '1'},
+ { "aliases", no_argument, NULL, 'a' },
+ { "activity", no_argument, NULL, 'A' },
+ { "debug", optional_argument, NULL, 'd' },
+ { "display-activity", no_argument, NULL, 'D' },
+ { "empty", no_argument, NULL, 'e' },
+ { "first-alias", no_argument, NULL, 'f' },
+ { "help", no_argument, NULL, 'h' },
+ { "inverted", no_argument, NULL, 'i'},
+ { "slabs", no_argument, NULL, 'l' },
+ { "numa", no_argument, NULL, 'n' },
+ { "ops", no_argument, NULL, 'o' },
+ { "shrink", no_argument, NULL, 's' },
+ { "report", no_argument, NULL, 'r' },
+ { "Size", no_argument, NULL, 'S'},
+ { "tracking", no_argument, NULL, 't'},
+ { "Totals", no_argument, NULL, 'T'},
+ { "validate", no_argument, NULL, 'v' },
+ { "zero", no_argument, NULL, 'z' },
+ { "1ref", no_argument, NULL, '1'},
+ { "lines", required_argument, NULL, 'N'},
+ { "Loss", no_argument, NULL, 'L'},
+ { "Xtotals", no_argument, NULL, 'X'},
+ { "Bytes", no_argument, NULL, 'B'},
{ NULL, 0, NULL, 0 }
};
page_size = getpagesize();
- while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS",
+ while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXB",
opts, NULL)) != -1)
switch (c) {
case '1':
case 'S':
sort_size = 1;
break;
-
+ case 'N':
+ if (optarg) {
+ output_lines = atoi(optarg);
+ if (output_lines < 1)
+ output_lines = 1;
+ }
+ break;
+ case 'L':
+ sort_loss = 1;
+ break;
+ case 'X':
+ if (output_lines == -1)
+ output_lines = 1;
+ extended_totals = 1;
+ show_bytes = 1;
+ break;
+ case 'B':
+ show_bytes = 1;
+ break;
default:
fatal("%s: Invalid option '%c'\n", argv[0], optopt);
fatal("%s: Invalid pattern '%s' code %d\n",
argv[0], pattern_source, err);
read_slab_dir();
- if (show_alias)
+ if (show_alias) {
alias();
- else
- if (show_totals)
+ } else if (extended_totals) {
+ xtotals();
+ } else if (show_totals) {
totals();
- else {
+ } else {
link_slabs();
rename_slabs();
sort_slabs();