+/*
+ * DAX radix tree locking
+ */
+struct exceptional_entry_key {
+ struct address_space *mapping;
+ unsigned long index;
+};
+
+struct wait_exceptional_entry_queue {
+ wait_queue_t wait;
+ struct exceptional_entry_key key;
+};
+
+static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+ int sync, void *keyp)
+{
+ struct exceptional_entry_key *key = keyp;
+ struct wait_exceptional_entry_queue *ewait =
+ container_of(wait, struct wait_exceptional_entry_queue, wait);
+
+ if (key->mapping != ewait->key.mapping ||
+ key->index != ewait->key.index)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, NULL);
+}
+
+/*
+ * Check whether the given slot is locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline int slot_locked(struct address_space *mapping, void **slot)
+{
+ unsigned long entry = (unsigned long)
+ radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ return entry & RADIX_DAX_ENTRY_LOCK;
+}
+
+/*
+ * Mark the given slot is locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline void *lock_slot(struct address_space *mapping, void **slot)
+{
+ unsigned long entry = (unsigned long)
+ radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+ entry |= RADIX_DAX_ENTRY_LOCK;
+ radix_tree_replace_slot(slot, (void *)entry);
+ return (void *)entry;
+}
+
+/*
+ * Mark the given slot is unlocked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline void *unlock_slot(struct address_space *mapping, void **slot)
+{
+ unsigned long entry = (unsigned long)
+ radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+ entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
+ radix_tree_replace_slot(slot, (void *)entry);
+ return (void *)entry;
+}
+
+/*
+ * Lookup entry in radix tree, wait for it to become unlocked if it is
+ * exceptional entry and return it. The caller must call
+ * put_unlocked_mapping_entry() when he decided not to lock the entry or
+ * put_locked_mapping_entry() when he locked the entry and now wants to
+ * unlock it.
+ *
+ * The function must be called with mapping->tree_lock held.
+ */
+static void *get_unlocked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void ***slotp)
+{
+ void *ret, **slot;
+ struct wait_exceptional_entry_queue ewait;
+ wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+ init_wait(&ewait.wait);
+ ewait.wait.func = wake_exceptional_entry_func;
+ ewait.key.mapping = mapping;
+ ewait.key.index = index;
+
+ for (;;) {
+ ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+ &slot);
+ if (!ret || !radix_tree_exceptional_entry(ret) ||
+ !slot_locked(mapping, slot)) {
+ if (slotp)
+ *slotp = slot;
+ return ret;
+ }
+ prepare_to_wait_exclusive(wq, &ewait.wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&mapping->tree_lock);
+ schedule();
+ finish_wait(wq, &ewait.wait);
+ spin_lock_irq(&mapping->tree_lock);
+ }
+}
+
+/*
+ * Find radix tree entry at given index. If it points to a page, return with
+ * the page locked. If it points to the exceptional entry, return with the
+ * radix tree entry locked. If the radix tree doesn't contain given index,
+ * create empty exceptional entry for the index and return with it locked.
+ *
+ * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
+ * persistent memory the benefit is doubtful. We can add that later if we can
+ * show it helps.
+ */
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+ void *ret, **slot;
+
+restart:
+ spin_lock_irq(&mapping->tree_lock);
+ ret = get_unlocked_mapping_entry(mapping, index, &slot);
+ /* No entry for given index? Make sure radix tree is big enough. */
+ if (!ret) {
+ int err;
+
+ spin_unlock_irq(&mapping->tree_lock);
+ err = radix_tree_preload(
+ mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
+ if (err)
+ return ERR_PTR(err);
+ ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+ RADIX_DAX_ENTRY_LOCK);
+ spin_lock_irq(&mapping->tree_lock);
+ err = radix_tree_insert(&mapping->page_tree, index, ret);
+ radix_tree_preload_end();
+ if (err) {
+ spin_unlock_irq(&mapping->tree_lock);
+ /* Someone already created the entry? */
+ if (err == -EEXIST)
+ goto restart;
+ return ERR_PTR(err);
+ }
+ /* Good, we have inserted empty locked entry into the tree. */
+ mapping->nrexceptional++;
+ spin_unlock_irq(&mapping->tree_lock);
+ return ret;
+ }
+ /* Normal page in radix tree? */
+ if (!radix_tree_exceptional_entry(ret)) {
+ struct page *page = ret;
+
+ get_page(page);
+ spin_unlock_irq(&mapping->tree_lock);
+ lock_page(page);
+ /* Page got truncated? Retry... */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto restart;
+ }
+ return page;
+ }
+ ret = lock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ return ret;
+}
+
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+ pgoff_t index, bool wake_all)
+{
+ wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+ /*
+ * Checking for locked entry and prepare_to_wait_exclusive() happens
+ * under mapping->tree_lock, ditto for entry handling in our callers.
+ * So at this point all tasks that could have seen our entry locked
+ * must be in the waitqueue and the following check will see them.
+ */
+ if (waitqueue_active(wq)) {
+ struct exceptional_entry_key key;
+
+ key.mapping = mapping;
+ key.index = index;
+ __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+ }
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+ void *ret, **slot;
+
+ spin_lock_irq(&mapping->tree_lock);
+ ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+ if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
+ !slot_locked(mapping, slot))) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return;
+ }
+ unlock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+static void put_locked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ if (!radix_tree_exceptional_entry(entry)) {
+ unlock_page(entry);
+ put_page(entry);
+ } else {
+ dax_unlock_mapping_entry(mapping, index);
+ }
+}
+
+/*
+ * Called when we are done with radix tree entry we looked up via
+ * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ */
+static void put_unlocked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ if (!radix_tree_exceptional_entry(entry))
+ return;
+
+ /* We have to wake up next waiter for the radix tree entry lock */
+ dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+/*
+ * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
+ * entry to get unlocked before deleting it.
+ */
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+ void *entry;
+
+ spin_lock_irq(&mapping->tree_lock);
+ entry = get_unlocked_mapping_entry(mapping, index, NULL);
+ /*
+ * This gets called from truncate / punch_hole path. As such, the caller
+ * must hold locks protecting against concurrent modifications of the
+ * radix tree (usually fs-private i_mmap_sem for writing). Since the
+ * caller has seen exceptional entry for this index, we better find it
+ * at that index as well...
+ */
+ if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return 0;
+ }
+ radix_tree_delete(&mapping->page_tree, index);
+ mapping->nrexceptional--;
+ spin_unlock_irq(&mapping->tree_lock);
+ dax_wake_mapping_entry_waiter(mapping, index, true);
+
+ return 1;
+}
+