fs/btrfs/volumes.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/bio.h>
  20 #include <linux/slab.h>
  21 #include <linux/buffer_head.h>
  22 #include <linux/blkdev.h>
  23 #include <linux/random.h>
  24 #include <linux/iocontext.h>
  25 #include <linux/capability.h>
  26 #include <linux/ratelimit.h>
  27 #include <linux/kthread.h>
  28 #include <linux/raid/pq.h>
  29 #include <linux/semaphore.h>
  30 #include <asm/div64.h>
  31 #include "ctree.h"
  32 #include "extent_map.h"
  33 #include "disk-io.h"
  34 #include "transaction.h"
  35 #include "print-tree.h"
  36 #include "volumes.h"
  37 #include "raid56.h"
  38 #include "async-thread.h"
  39 #include "check-integrity.h"
  40 #include "rcu-string.h"
  41 #include "math.h"
  42 #include "dev-replace.h"
  43 #include "sysfs.h"
  44
  45 static int init_first_rw_device(struct btrfs_trans_handle *trans,
  46                                 struct btrfs_root *root,
  47                                 struct btrfs_device *device);
  48 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
  49 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
  50 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
  51 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
  52
  53 static DEFINE_MUTEX(uuid_mutex);
  54 static LIST_HEAD(fs_uuids);
  55
  56 static void lock_chunks(struct btrfs_root *root)
  57 {
  58         mutex_lock(&root->fs_info->chunk_mutex);
  59 }
  60
  61 static void unlock_chunks(struct btrfs_root *root)
  62 {
  63         mutex_unlock(&root->fs_info->chunk_mutex);
  64 }
  65
  66 static struct btrfs_fs_devices *__alloc_fs_devices(void)
  67 {
  68         struct btrfs_fs_devices *fs_devs;
  69
  70         fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
  71         if (!fs_devs)
  72                 return ERR_PTR(-ENOMEM);
  73
  74         mutex_init(&fs_devs->device_list_mutex);
  75
  76         INIT_LIST_HEAD(&fs_devs->devices);
  77         INIT_LIST_HEAD(&fs_devs->alloc_list);
  78         INIT_LIST_HEAD(&fs_devs->list);
  79
  80         return fs_devs;
  81 }
  82
  83 /**
  84  * alloc_fs_devices - allocate struct btrfs_fs_devices
  85  * @fsid:       a pointer to UUID for this FS.  If NULL a new UUID is
  86  *              generated.
  87  *
  88  * Return: a pointer to a new &struct btrfs_fs_devices on success;
  89  * ERR_PTR() on error.  Returned struct is not linked onto any lists and
  90  * can be destroyed with kfree() right away.
  91  */
  92 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
  93 {
  94         struct btrfs_fs_devices *fs_devs;
  95
  96         fs_devs = __alloc_fs_devices();
  97         if (IS_ERR(fs_devs))
  98                 return fs_devs;
  99
 100         if (fsid)
 101                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 102         else
 103                 generate_random_uuid(fs_devs->fsid);
 104
 105         return fs_devs;
 106 }
 107
 108 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 109 {
 110         struct btrfs_device *device;
 111         WARN_ON(fs_devices->opened);
 112         while (!list_empty(&fs_devices->devices)) {
 113                 device = list_entry(fs_devices->devices.next,
 114                                     struct btrfs_device, dev_list);
 115                 list_del(&device->dev_list);
 116                 rcu_string_free(device->name);
 117                 kfree(device);
 118         }
 119         kfree(fs_devices);
 120 }
 121
 122 static void btrfs_kobject_uevent(struct block_device *bdev,
 123                                  enum kobject_action action)
 124 {
 125         int ret;
 126
 127         ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
 128         if (ret)
 129                 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
 130                         action,
 131                         kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
 132                         &disk_to_dev(bdev->bd_disk)->kobj);
 133 }
 134
 135 void btrfs_cleanup_fs_uuids(void)
 136 {
 137         struct btrfs_fs_devices *fs_devices;
 138
 139         while (!list_empty(&fs_uuids)) {
 140                 fs_devices = list_entry(fs_uuids.next,
 141                                         struct btrfs_fs_devices, list);
 142                 list_del(&fs_devices->list);
 143                 free_fs_devices(fs_devices);
 144         }
 145 }
 146
 147 static struct btrfs_device *__alloc_device(void)
 148 {
 149         struct btrfs_device *dev;
 150
 151         dev = kzalloc(sizeof(*dev), GFP_NOFS);
 152         if (!dev)
 153                 return ERR_PTR(-ENOMEM);
 154
 155         INIT_LIST_HEAD(&dev->dev_list);
 156         INIT_LIST_HEAD(&dev->dev_alloc_list);
 157
 158         spin_lock_init(&dev->io_lock);
 159
 160         spin_lock_init(&dev->reada_lock);
 161         atomic_set(&dev->reada_in_flight, 0);
 162         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
 163         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
 164
 165         return dev;
 166 }
 167
 168 static noinline struct btrfs_device *__find_device(struct list_head *head,
 169                                                    u64 devid, u8 *uuid)
 170 {
 171         struct btrfs_device *dev;
 172
 173         list_for_each_entry(dev, head, dev_list) {
 174                 if (dev->devid == devid &&
 175                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 176                         return dev;
 177                 }
 178         }
 179         return NULL;
 180 }
 181
 182 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 183 {
 184         struct btrfs_fs_devices *fs_devices;
 185
 186         list_for_each_entry(fs_devices, &fs_uuids, list) {
 187                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 188                         return fs_devices;
 189         }
 190         return NULL;
 191 }
 192
 193 static int
 194 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 195                       int flush, struct block_device **bdev,
 196                       struct buffer_head **bh)
 197 {
 198         int ret;
 199
 200         *bdev = blkdev_get_by_path(device_path, flags, holder);
 201
 202         if (IS_ERR(*bdev)) {
 203                 ret = PTR_ERR(*bdev);
 204                 printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
 205                 goto error;
 206         }
 207
 208         if (flush)
 209                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
 210         ret = set_blocksize(*bdev, 4096);
 211         if (ret) {
 212                 blkdev_put(*bdev, flags);
 213                 goto error;
 214         }
 215         invalidate_bdev(*bdev);
 216         *bh = btrfs_read_dev_super(*bdev);
 217         if (!*bh) {
 218                 ret = -EINVAL;
 219                 blkdev_put(*bdev, flags);
 220                 goto error;
 221         }
 222
 223         return 0;
 224
 225 error:
 226         *bdev = NULL;
 227         *bh = NULL;
 228         return ret;
 229 }
 230
 231 static void requeue_list(struct btrfs_pending_bios *pending_bios,
 232                         struct bio *head, struct bio *tail)
 233 {
 234
 235         struct bio *old_head;
 236
 237         old_head = pending_bios->head;
 238         pending_bios->head = head;
 239         if (pending_bios->tail)
 240                 tail->bi_next = old_head;
 241         else
 242                 pending_bios->tail = tail;
 243 }
 244
 245 /*
 246  * we try to collect pending bios for a device so we don't get a large
 247  * number of procs sending bios down to the same device.  This greatly
 248  * improves the schedulers ability to collect and merge the bios.
 249  *
 250  * But, it also turns into a long list of bios to process and that is sure
 251  * to eventually make the worker thread block.  The solution here is to
 252  * make some progress and then put this work struct back at the end of
 253  * the list if the block device is congested.  This way, multiple devices
 254  * can make progress from a single worker thread.
 255  */
 256 static noinline void run_scheduled_bios(struct btrfs_device *device)
 257 {
 258         struct bio *pending;
 259         struct backing_dev_info *bdi;
 260         struct btrfs_fs_info *fs_info;
 261         struct btrfs_pending_bios *pending_bios;
 262         struct bio *tail;
 263         struct bio *cur;
 264         int again = 0;
 265         unsigned long num_run;
 266         unsigned long batch_run = 0;
 267         unsigned long limit;
 268         unsigned long last_waited = 0;
 269         int force_reg = 0;
 270         int sync_pending = 0;
 271         struct blk_plug plug;
 272
 273         /*
 274          * this function runs all the bios we've collected for
 275          * a particular device.  We don't want to wander off to
 276          * another device without first sending all of these down.
 277          * So, setup a plug here and finish it off before we return
 278          */
 279         blk_start_plug(&plug);
 280
 281         bdi = blk_get_backing_dev_info(device->bdev);
 282         fs_info = device->dev_root->fs_info;
 283         limit = btrfs_async_submit_limit(fs_info);
 284         limit = limit * 2 / 3;
 285
 286 loop:
 287         spin_lock(&device->io_lock);
 288
 289 loop_lock:
 290         num_run = 0;
 291
 292         /* take all the bios off the list at once and process them
 293          * later on (without the lock held).  But, remember the
 294          * tail and other pointers so the bios can be properly reinserted
 295          * into the list if we hit congestion
 296          */
 297         if (!force_reg && device->pending_sync_bios.head) {
 298                 pending_bios = &device->pending_sync_bios;
 299                 force_reg = 1;
 300         } else {
 301                 pending_bios = &device->pending_bios;
 302                 force_reg = 0;
 303         }
 304
 305         pending = pending_bios->head;
 306         tail = pending_bios->tail;
 307         WARN_ON(pending && !tail);
 308
 309         /*
 310          * if pending was null this time around, no bios need processing
 311          * at all and we can stop.  Otherwise it'll loop back up again
 312          * and do an additional check so no bios are missed.
 313          *
 314          * device->running_pending is used to synchronize with the
 315          * schedule_bio code.
 316          */
 317         if (device->pending_sync_bios.head == NULL &&
 318             device->pending_bios.head == NULL) {
 319                 again = 0;
 320                 device->running_pending = 0;
 321         } else {
 322                 again = 1;
 323                 device->running_pending = 1;
 324         }
 325
 326         pending_bios->head = NULL;
 327         pending_bios->tail = NULL;
 328
 329         spin_unlock(&device->io_lock);
 330
 331         while (pending) {
 332
 333                 rmb();
 334                 /* we want to work on both lists, but do more bios on the
 335                  * sync list than the regular list
 336                  */
 337                 if ((num_run > 32 &&
 338                     pending_bios != &device->pending_sync_bios &&
 339                     device->pending_sync_bios.head) ||
 340                    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
 341                     device->pending_bios.head)) {
 342                         spin_lock(&device->io_lock);
 343                         requeue_list(pending_bios, pending, tail);
 344                         goto loop_lock;
 345                 }
 346
 347                 cur = pending;
 348                 pending = pending->bi_next;
 349                 cur->bi_next = NULL;
 350
 351                 if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
 352                     waitqueue_active(&fs_info->async_submit_wait))
 353                         wake_up(&fs_info->async_submit_wait);
 354
 355                 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 356
 357                 /*
 358                  * if we're doing the sync list, record that our
 359                  * plug has some sync requests on it
 360                  *
 361                  * If we're doing the regular list and there are
 362                  * sync requests sitting around, unplug before
 363                  * we add more
 364                  */
 365                 if (pending_bios == &device->pending_sync_bios) {
 366                         sync_pending = 1;
 367                 } else if (sync_pending) {
 368                         blk_finish_plug(&plug);
 369                         blk_start_plug(&plug);
 370                         sync_pending = 0;
 371                 }
 372
 373                 btrfsic_submit_bio(cur->bi_rw, cur);
 374                 num_run++;
 375                 batch_run++;
 376                 if (need_resched())
 377                         cond_resched();
 378
 379                 /*
 380                  * we made progress, there is more work to do and the bdi
 381                  * is now congested.  Back off and let other work structs
 382                  * run instead
 383                  */
 384                 if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
 385                     fs_info->fs_devices->open_devices > 1) {
 386                         struct io_context *ioc;
 387
 388                         ioc = current->io_context;
 389
 390                         /*
 391                          * the main goal here is that we don't want to
 392                          * block if we're going to be able to submit
 393                          * more requests without blocking.
 394                          *
 395                          * This code does two great things, it pokes into
 396                          * the elevator code from a filesystem _and_
 397                          * it makes assumptions about how batching works.
 398                          */
 399                         if (ioc && ioc->nr_batch_requests > 0 &&
 400                             time_before(jiffies, ioc->last_waited + HZ/50UL) &&
 401                             (last_waited == 0 ||
 402                              ioc->last_waited == last_waited)) {
 403                                 /*
 404                                  * we want to go through our batch of
 405                                  * requests and stop.  So, we copy out
 406                                  * the ioc->last_waited time and test
 407                                  * against it before looping
 408                                  */
 409                                 last_waited = ioc->last_waited;
 410                                 if (need_resched())
 411                                         cond_resched();
 412                                 continue;
 413                         }
 414                         spin_lock(&device->io_lock);
 415                         requeue_list(pending_bios, pending, tail);
 416                         device->running_pending = 1;
 417
 418                         spin_unlock(&device->io_lock);
 419                         btrfs_queue_work(fs_info->submit_workers,
 420                                          &device->work);
 421                         goto done;
 422                 }
 423                 /* unplug every 64 requests just for good measure */
 424                 if (batch_run % 64 == 0) {
 425                         blk_finish_plug(&plug);
 426                         blk_start_plug(&plug);
 427                         sync_pending = 0;
 428                 }
 429         }
 430
 431         cond_resched();
 432         if (again)
 433                 goto loop;
 434
 435         spin_lock(&device->io_lock);
 436         if (device->pending_bios.head || device->pending_sync_bios.head)
 437                 goto loop_lock;
 438         spin_unlock(&device->io_lock);
 439
 440 done:
 441         blk_finish_plug(&plug);
 442 }
 443
 444 static void pending_bios_fn(struct btrfs_work *work)
 445 {
 446         struct btrfs_device *device;
 447
 448         device = container_of(work, struct btrfs_device, work);
 449         run_scheduled_bios(device);
 450 }
 451
 452 /*
 453  * Add new device to list of registered devices
 454  *
 455  * Returns:
 456  * 1   - first time device is seen
 457  * 0   - device already known
 458  * < 0 - error
 459  */
 460 static noinline int device_list_add(const char *path,
 461                            struct btrfs_super_block *disk_super,
 462                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
 463 {
 464         struct btrfs_device *device;
 465         struct btrfs_fs_devices *fs_devices;
 466         struct rcu_string *name;
 467         int ret = 0;
 468         u64 found_transid = btrfs_super_generation(disk_super);
 469
 470         fs_devices = find_fsid(disk_super->fsid);
 471         if (!fs_devices) {
 472                 fs_devices = alloc_fs_devices(disk_super->fsid);
 473                 if (IS_ERR(fs_devices))
 474                         return PTR_ERR(fs_devices);
 475
 476                 list_add(&fs_devices->list, &fs_uuids);
 477                 fs_devices->latest_devid = devid;
 478                 fs_devices->latest_trans = found_transid;
 479
 480                 device = NULL;
 481         } else {
 482                 device = __find_device(&fs_devices->devices, devid,
 483                                        disk_super->dev_item.uuid);
 484         }
 485         if (!device) {
 486                 if (fs_devices->opened)
 487                         return -EBUSY;
 488
 489                 device = btrfs_alloc_device(NULL, &devid,
 490                                             disk_super->dev_item.uuid);
 491                 if (IS_ERR(device)) {
 492                         /* we can safely leave the fs_devices entry around */
 493                         return PTR_ERR(device);
 494                 }
 495
 496                 name = rcu_string_strdup(path, GFP_NOFS);
 497                 if (!name) {
 498                         kfree(device);
 499                         return -ENOMEM;
 500                 }
 501                 rcu_assign_pointer(device->name, name);
 502
 503                 mutex_lock(&fs_devices->device_list_mutex);
 504                 list_add_rcu(&device->dev_list, &fs_devices->devices);
 505                 fs_devices->num_devices++;
 506                 mutex_unlock(&fs_devices->device_list_mutex);
 507
 508                 ret = 1;
 509                 device->fs_devices = fs_devices;
 510         } else if (!device->name || strcmp(device->name->str, path)) {
 511                 /*
 512                  * When FS is already mounted.
 513                  * 1. If you are here and if the device->name is NULL that
 514                  *    means this device was missing at time of FS mount.
 515                  * 2. If you are here and if the device->name is different
 516                  *    from 'path' that means either
 517                  *      a. The same device disappeared and reappeared with
 518                  *         different name. or
 519                  *      b. The missing-disk-which-was-replaced, has
 520                  *         reappeared now.
 521                  *
 522                  * We must allow 1 and 2a above. But 2b would be a spurious
 523                  * and unintentional.
 524                  *
 525                  * Further in case of 1 and 2a above, the disk at 'path'
 526                  * would have missed some transaction when it was away and
 527                  * in case of 2a the stale bdev has to be updated as well.
 528                  * 2b must not be allowed at all time.
 529                  */
 530
 531                 /*
 532                  * As of now don't allow update to btrfs_fs_device through
 533                  * the btrfs dev scan cli, after FS has been mounted.
 534                  */
 535                 if (fs_devices->opened) {
 536                         return -EBUSY;
 537                 } else {
 538                         /*
 539                          * That is if the FS is _not_ mounted and if you
 540                          * are here, that means there is more than one
 541                          * disk with same uuid and devid.We keep the one
 542                          * with larger generation number or the last-in if
 543                          * generation are equal.
 544                          */
 545                         if (found_transid < device->generation)
 546                                 return -EEXIST;
 547                 }
 548
 549                 name = rcu_string_strdup(path, GFP_NOFS);
 550                 if (!name)
 551                         return -ENOMEM;
 552                 rcu_string_free(device->name);
 553                 rcu_assign_pointer(device->name, name);
 554                 if (device->missing) {
 555                         fs_devices->missing_devices--;
 556                         device->missing = 0;
 557                 }
 558         }
 559
 560         /*
 561          * Unmount does not free the btrfs_device struct but would zero
 562          * generation along with most of the other members. So just update
 563          * it back. We need it to pick the disk with largest generation
 564          * (as above).
 565          */
 566         if (!fs_devices->opened)
 567                 device->generation = found_transid;
 568
 569         if (found_transid > fs_devices->latest_trans) {
 570                 fs_devices->latest_devid = devid;
 571                 fs_devices->latest_trans = found_transid;
 572         }
 573         *fs_devices_ret = fs_devices;
 574
 575         return ret;
 576 }
 577
 578 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 579 {
 580         struct btrfs_fs_devices *fs_devices;
 581         struct btrfs_device *device;
 582         struct btrfs_device *orig_dev;
 583
 584         fs_devices = alloc_fs_devices(orig->fsid);
 585         if (IS_ERR(fs_devices))
 586                 return fs_devices;
 587
 588         fs_devices->latest_devid = orig->latest_devid;
 589         fs_devices->latest_trans = orig->latest_trans;
 590         fs_devices->total_devices = orig->total_devices;
 591
 592         /* We have held the volume lock, it is safe to get the devices. */
 593         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 594                 struct rcu_string *name;
 595
 596                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
 597                                             orig_dev->uuid);
 598                 if (IS_ERR(device))
 599                         goto error;
 600
 601                 /*
 602                  * This is ok to do without rcu read locked because we hold the
 603                  * uuid mutex so nothing we touch in here is going to disappear.
 604                  */
 605                 if (orig_dev->name) {
 606                         name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
 607                         if (!name) {
 608                                 kfree(device);
 609                                 goto error;
 610                         }
 611                         rcu_assign_pointer(device->name, name);
 612                 }
 613
 614                 list_add(&device->dev_list, &fs_devices->devices);
 615                 device->fs_devices = fs_devices;
 616                 fs_devices->num_devices++;
 617         }
 618         return fs_devices;
 619 error:
 620         free_fs_devices(fs_devices);
 621         return ERR_PTR(-ENOMEM);
 622 }
 623
 624 void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
 625                                struct btrfs_fs_devices *fs_devices, int step)
 626 {
 627         struct btrfs_device *device, *next;
 628
 629         struct block_device *latest_bdev = NULL;
 630         u64 latest_devid = 0;
 631         u64 latest_transid = 0;
 632
 633         mutex_lock(&uuid_mutex);
 634 again:
 635         /* This is the initialized path, it is safe to release the devices. */
 636         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 637                 if (device->in_fs_metadata) {
 638                         if (!device->is_tgtdev_for_dev_replace &&
 639                             (!latest_transid ||
 640                              device->generation > latest_transid)) {
 641                                 latest_devid = device->devid;
 642                                 latest_transid = device->generation;
 643                                 latest_bdev = device->bdev;
 644                         }
 645                         continue;
 646                 }
 647
 648                 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
 649                         /*
 650                          * In the first step, keep the device which has
 651                          * the correct fsid and the devid that is used
 652                          * for the dev_replace procedure.
 653                          * In the second step, the dev_replace state is
 654                          * read from the device tree and it is known
 655                          * whether the procedure is really active or
 656                          * not, which means whether this device is
 657                          * used or whether it should be removed.
 658                          */
 659                         if (step == 0 || device->is_tgtdev_for_dev_replace) {
 660                                 continue;
 661                         }
 662                 }
 663                 if (device->bdev) {
 664                         blkdev_put(device->bdev, device->mode);
 665                         device->bdev = NULL;
 666                         fs_devices->open_devices--;
 667                 }
 668                 if (device->writeable) {
 669                         list_del_init(&device->dev_alloc_list);
 670                         device->writeable = 0;
 671                         if (!device->is_tgtdev_for_dev_replace)
 672                                 fs_devices->rw_devices--;
 673                 }
 674                 list_del_init(&device->dev_list);
 675                 fs_devices->num_devices--;
 676                 rcu_string_free(device->name);
 677                 kfree(device);
 678         }
 679
 680         if (fs_devices->seed) {
 681                 fs_devices = fs_devices->seed;
 682                 goto again;
 683         }
 684
 685         fs_devices->latest_bdev = latest_bdev;
 686         fs_devices->latest_devid = latest_devid;
 687         fs_devices->latest_trans = latest_transid;
 688
 689         mutex_unlock(&uuid_mutex);
 690 }
 691
 692 static void __free_device(struct work_struct *work)
 693 {
 694         struct btrfs_device *device;
 695
 696         device = container_of(work, struct btrfs_device, rcu_work);
 697
 698         if (device->bdev)
 699                 blkdev_put(device->bdev, device->mode);
 700
 701         rcu_string_free(device->name);
 702         kfree(device);
 703 }
 704
 705 static void free_device(struct rcu_head *head)
 706 {
 707         struct btrfs_device *device;
 708
 709         device = container_of(head, struct btrfs_device, rcu);
 710
 711         INIT_WORK(&device->rcu_work, __free_device);
 712         schedule_work(&device->rcu_work);
 713 }
 714
 715 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 716 {
 717         struct btrfs_device *device;
 718
 719         if (--fs_devices->opened > 0)
 720                 return 0;
 721
 722         mutex_lock(&fs_devices->device_list_mutex);
 723         list_for_each_entry(device, &fs_devices->devices, dev_list) {
 724                 struct btrfs_device *new_device;
 725                 struct rcu_string *name;
 726
 727                 if (device->bdev)
 728                         fs_devices->open_devices--;
 729
 730                 if (device->writeable &&
 731                     device->devid != BTRFS_DEV_REPLACE_DEVID) {
 732                         list_del_init(&device->dev_alloc_list);
 733                         fs_devices->rw_devices--;
 734                 }
 735
 736                 if (device->can_discard)
 737                         fs_devices->num_can_discard--;
 738                 if (device->missing)
 739                         fs_devices->missing_devices--;
 740
 741                 new_device = btrfs_alloc_device(NULL, &device->devid,
 742                                                 device->uuid);
 743                 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
 744
 745                 /* Safe because we are under uuid_mutex */
 746                 if (device->name) {
 747                         name = rcu_string_strdup(device->name->str, GFP_NOFS);
 748                         BUG_ON(!name); /* -ENOMEM */
 749                         rcu_assign_pointer(new_device->name, name);
 750                 }
 751
 752                 list_replace_rcu(&device->dev_list, &new_device->dev_list);
 753                 new_device->fs_devices = device->fs_devices;
 754
 755                 call_rcu(&device->rcu, free_device);
 756         }
 757         mutex_unlock(&fs_devices->device_list_mutex);
 758
 759         WARN_ON(fs_devices->open_devices);
 760         WARN_ON(fs_devices->rw_devices);
 761         fs_devices->opened = 0;
 762         fs_devices->seeding = 0;
 763
 764         return 0;
 765 }
 766
 767 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 768 {
 769         struct btrfs_fs_devices *seed_devices = NULL;
 770         int ret;
 771
 772         mutex_lock(&uuid_mutex);
 773         ret = __btrfs_close_devices(fs_devices);
 774         if (!fs_devices->opened) {
 775                 seed_devices = fs_devices->seed;
 776                 fs_devices->seed = NULL;
 777         }
 778         mutex_unlock(&uuid_mutex);
 779
 780         while (seed_devices) {
 781                 fs_devices = seed_devices;
 782                 seed_devices = fs_devices->seed;
 783                 __btrfs_close_devices(fs_devices);
 784                 free_fs_devices(fs_devices);
 785         }
 786         /*
 787          * Wait for rcu kworkers under __btrfs_close_devices
 788          * to finish all blkdev_puts so device is really
 789          * free when umount is done.
 790          */
 791         rcu_barrier();
 792         return ret;
 793 }
 794
 795 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 796                                 fmode_t flags, void *holder)
 797 {
 798         struct request_queue *q;
 799         struct block_device *bdev;
 800         struct list_head *head = &fs_devices->devices;
 801         struct btrfs_device *device;
 802         struct block_device *latest_bdev = NULL;
 803         struct buffer_head *bh;
 804         struct btrfs_super_block *disk_super;
 805         u64 latest_devid = 0;
 806         u64 latest_transid = 0;
 807         u64 devid;
 808         int seeding = 1;
 809         int ret = 0;
 810
 811         flags |= FMODE_EXCL;
 812
 813         list_for_each_entry(device, head, dev_list) {
 814                 if (device->bdev)
 815                         continue;
 816                 if (!device->name)
 817                         continue;
 818
 819                 /* Just open everything we can; ignore failures here */
 820                 if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 821                                             &bdev, &bh))
 822                         continue;
 823
 824                 disk_super = (struct btrfs_super_block *)bh->b_data;
 825                 devid = btrfs_stack_device_id(&disk_super->dev_item);
 826                 if (devid != device->devid)
 827                         goto error_brelse;
 828
 829                 if (memcmp(device->uuid, disk_super->dev_item.uuid,
 830                            BTRFS_UUID_SIZE))
 831                         goto error_brelse;
 832
 833                 device->generation = btrfs_super_generation(disk_super);
 834                 if (!latest_transid || device->generation > latest_transid) {
 835                         latest_devid = devid;
 836                         latest_transid = device->generation;
 837                         latest_bdev = bdev;
 838                 }
 839
 840                 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 841                         device->writeable = 0;
 842                 } else {
 843                         device->writeable = !bdev_read_only(bdev);
 844                         seeding = 0;
 845                 }
 846
 847                 q = bdev_get_queue(bdev);
 848                 if (blk_queue_discard(q)) {
 849                         device->can_discard = 1;
 850                         fs_devices->num_can_discard++;
 851                 }
 852
 853                 device->bdev = bdev;
 854                 device->in_fs_metadata = 0;
 855                 device->mode = flags;
 856
 857                 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 858                         fs_devices->rotating = 1;
 859
 860                 fs_devices->open_devices++;
 861                 if (device->writeable &&
 862                     device->devid != BTRFS_DEV_REPLACE_DEVID) {
 863                         fs_devices->rw_devices++;
 864                         list_add(&device->dev_alloc_list,
 865                                  &fs_devices->alloc_list);
 866                 }
 867                 brelse(bh);
 868                 continue;
 869
 870 error_brelse:
 871                 brelse(bh);
 872                 blkdev_put(bdev, flags);
 873                 continue;
 874         }
 875         if (fs_devices->open_devices == 0) {
 876                 ret = -EINVAL;
 877                 goto out;
 878         }
 879         fs_devices->seeding = seeding;
 880         fs_devices->opened = 1;
 881         fs_devices->latest_bdev = latest_bdev;
 882         fs_devices->latest_devid = latest_devid;
 883         fs_devices->latest_trans = latest_transid;
 884         fs_devices->total_rw_bytes = 0;
 885 out:
 886         return ret;
 887 }
 888
 889 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 890                        fmode_t flags, void *holder)
 891 {
 892         int ret;
 893
 894         mutex_lock(&uuid_mutex);
 895         if (fs_devices->opened) {
 896                 fs_devices->opened++;
 897                 ret = 0;
 898         } else {
 899                 ret = __btrfs_open_devices(fs_devices, flags, holder);
 900         }
 901         mutex_unlock(&uuid_mutex);
 902         return ret;
 903 }
 904
 905 /*
 906  * Look for a btrfs signature on a device. This may be called out of the mount path
 907  * and we are not allowed to call set_blocksize during the scan. The superblock
 908  * is read via pagecache
 909  */
 910 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 911                           struct btrfs_fs_devices **fs_devices_ret)
 912 {
 913         struct btrfs_super_block *disk_super;
 914         struct block_device *bdev;
 915         struct page *page;
 916         void *p;
 917         int ret = -EINVAL;
 918         u64 devid;
 919         u64 transid;
 920         u64 total_devices;
 921         u64 bytenr;
 922         pgoff_t index;
 923
 924         /*
 925          * we would like to check all the supers, but that would make
 926          * a btrfs mount succeed after a mkfs from a different FS.
 927          * So, we need to add a special mount option to scan for
 928          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
 929          */
 930         bytenr = btrfs_sb_offset(0);
 931         flags |= FMODE_EXCL;
 932         mutex_lock(&uuid_mutex);
 933
 934         bdev = blkdev_get_by_path(path, flags, holder);
 935
 936         if (IS_ERR(bdev)) {
 937                 ret = PTR_ERR(bdev);
 938                 goto error;
 939         }
 940
 941         /* make sure our super fits in the device */
 942         if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
 943                 goto error_bdev_put;
 944
 945         /* make sure our super fits in the page */
 946         if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
 947                 goto error_bdev_put;
 948
 949         /* make sure our super doesn't straddle pages on disk */
 950         index = bytenr >> PAGE_CACHE_SHIFT;
 951         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
 952                 goto error_bdev_put;
 953
 954         /* pull in the page with our super */
 955         page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
 956                                    index, GFP_NOFS);
 957
 958         if (IS_ERR_OR_NULL(page))
 959                 goto error_bdev_put;
 960
 961         p = kmap(page);
 962
 963         /* align our pointer to the offset of the super block */
 964         disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
 965
 966         if (btrfs_super_bytenr(disk_super) != bytenr ||
 967             btrfs_super_magic(disk_super) != BTRFS_MAGIC)
 968                 goto error_unmap;
 969
 970         devid = btrfs_stack_device_id(&disk_super->dev_item);
 971         transid = btrfs_super_generation(disk_super);
 972         total_devices = btrfs_super_num_devices(disk_super);
 973
 974         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 975         if (ret > 0) {
 976                 if (disk_super->label[0]) {
 977                         if (disk_super->label[BTRFS_LABEL_SIZE - 1])
 978                                 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
 979                         printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
 980                 } else {
 981                         printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
 982                 }
 983
 984                 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
 985                 ret = 0;
 986         }
 987         if (!ret && fs_devices_ret)
 988                 (*fs_devices_ret)->total_devices = total_devices;
 989
 990 error_unmap:
 991         kunmap(page);
 992         page_cache_release(page);
 993
 994 error_bdev_put:
 995         blkdev_put(bdev, flags);
 996 error:
 997         mutex_unlock(&uuid_mutex);
 998         return ret;
 999 }
1000
1001 /* helper to account the used device space in the range */
1002 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
1003                                    u64 end, u64 *length)
1004 {
1005         struct btrfs_key key;
1006         struct btrfs_root *root = device->dev_root;
1007         struct btrfs_dev_extent *dev_extent;
1008         struct btrfs_path *path;
1009         u64 extent_end;
1010         int ret;
1011         int slot;
1012         struct extent_buffer *l;
1013
1014         *length = 0;
1015
1016         if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
1017                 return 0;
1018
1019         path = btrfs_alloc_path();
1020         if (!path)
1021                 return -ENOMEM;
1022         path->reada = 2;
1023
1024         key.objectid = device->devid;
1025         key.offset = start;
1026         key.type = BTRFS_DEV_EXTENT_KEY;
1027
1028         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1029         if (ret < 0)
1030                 goto out;
1031         if (ret > 0) {
1032                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1033                 if (ret < 0)
1034                         goto out;
1035         }
1036
1037         while (1) {
1038                 l = path->nodes[0];
1039                 slot = path->slots[0];
1040                 if (slot >= btrfs_header_nritems(l)) {
1041                         ret = btrfs_next_leaf(root, path);
1042                         if (ret == 0)
1043                                 continue;
1044                         if (ret < 0)
1045                                 goto out;
1046
1047                         break;
1048                 }
1049                 btrfs_item_key_to_cpu(l, &key, slot);
1050
1051                 if (key.objectid < device->devid)
1052                         goto next;
1053
1054                 if (key.objectid > device->devid)
1055                         break;
1056
1057                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
1058                         goto next;
1059
1060                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1061                 extent_end = key.offset + btrfs_dev_extent_length(l,
1062                                                                   dev_extent);
1063                 if (key.offset <= start && extent_end > end) {
1064                         *length = end - start + 1;
1065                         break;
1066                 } else if (key.offset <= start && extent_end > start)
1067                         *length += extent_end - start;
1068                 else if (key.offset > start && extent_end <= end)
1069                         *length += extent_end - key.offset;
1070                 else if (key.offset > start && key.offset <= end) {
1071                         *length += end - key.offset + 1;
1072                         break;
1073                 } else if (key.offset > end)
1074                         break;
1075
1076 next:
1077                 path->slots[0]++;
1078         }
1079         ret = 0;
1080 out:
1081         btrfs_free_path(path);
1082         return ret;
1083 }
1084
1085 static int contains_pending_extent(struct btrfs_trans_handle *trans,
1086                                    struct btrfs_device *device,
1087                                    u64 *start, u64 len)
1088 {
1089         struct extent_map *em;
1090         int ret = 0;
1091
1092         list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
1093                 struct map_lookup *map;
1094                 int i;
1095
1096                 map = (struct map_lookup *)em->bdev;
1097                 for (i = 0; i < map->num_stripes; i++) {
1098                         if (map->stripes[i].dev != device)
1099                                 continue;
1100                         if (map->stripes[i].physical >= *start + len ||
1101                             map->stripes[i].physical + em->orig_block_len <=
1102                             *start)
1103                                 continue;
1104                         *start = map->stripes[i].physical +
1105                                 em->orig_block_len;
1106                         ret = 1;
1107                 }
1108         }
1109
1110         return ret;
1111 }
1112
1113
1114 /*
1115  * find_free_dev_extent - find free space in the specified device
1116  * @device:     the device which we search the free space in
1117  * @num_bytes:  the size of the free space that we need
1118  * @start:      store the start of the free space.
1119  * @len:        the size of the free space. that we find, or the size of the max
1120  *              free space if we don't find suitable free space
1121  *
1122  * this uses a pretty simple search, the expectation is that it is
1123  * called very infrequently and that a given device has a small number
1124  * of extents
1125  *
1126  * @start is used to store the start of the free space if we find. But if we
1127  * don't find suitable free space, it will be used to store the start position
1128  * of the max free space.
1129  *
1130  * @len is used to store the size of the free space that we find.
1131  * But if we don't find suitable free space, it is used to store the size of
1132  * the max free space.
1133  */
1134 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1135                          struct btrfs_device *device, u64 num_bytes,
1136                          u64 *start, u64 *len)
1137 {
1138         struct btrfs_key key;
1139         struct btrfs_root *root = device->dev_root;
1140         struct btrfs_dev_extent *dev_extent;
1141         struct btrfs_path *path;
1142         u64 hole_size;
1143         u64 max_hole_start;
1144         u64 max_hole_size;
1145         u64 extent_end;
1146         u64 search_start;
1147         u64 search_end = device->total_bytes;
1148         int ret;
1149         int slot;
1150         struct extent_buffer *l;
1151
1152         /* FIXME use last free of some kind */
1153
1154         /* we don't want to overwrite the superblock on the drive,
1155          * so we make sure to start at an offset of at least 1MB
1156          */
1157         search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1158
1159         path = btrfs_alloc_path();
1160         if (!path)
1161                 return -ENOMEM;
1162 again:
1163         max_hole_start = search_start;
1164         max_hole_size = 0;
1165         hole_size = 0;
1166
1167         if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1168                 ret = -ENOSPC;
1169                 goto out;
1170         }
1171
1172         path->reada = 2;
1173         path->search_commit_root = 1;
1174         path->skip_locking = 1;
1175
1176         key.objectid = device->devid;
1177         key.offset = search_start;
1178         key.type = BTRFS_DEV_EXTENT_KEY;
1179
1180         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1181         if (ret < 0)
1182                 goto out;
1183         if (ret > 0) {
1184                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1185                 if (ret < 0)
1186                         goto out;
1187         }
1188
1189         while (1) {
1190                 l = path->nodes[0];
1191                 slot = path->slots[0];
1192                 if (slot >= btrfs_header_nritems(l)) {
1193                         ret = btrfs_next_leaf(root, path);
1194                         if (ret == 0)
1195                                 continue;
1196                         if (ret < 0)
1197                                 goto out;
1198
1199                         break;
1200                 }
1201                 btrfs_item_key_to_cpu(l, &key, slot);
1202
1203                 if (key.objectid < device->devid)
1204                         goto next;
1205
1206                 if (key.objectid > device->devid)
1207                         break;
1208
1209                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
1210                         goto next;
1211
1212                 if (key.offset > search_start) {
1213                         hole_size = key.offset - search_start;
1214
1215                         /*
1216                          * Have to check before we set max_hole_start, otherwise
1217                          * we could end up sending back this offset anyway.
1218                          */
1219                         if (contains_pending_extent(trans, device,
1220                                                     &search_start,
1221                                                     hole_size))
1222                                 hole_size = 0;
1223
1224                         if (hole_size > max_hole_size) {
1225                                 max_hole_start = search_start;
1226                                 max_hole_size = hole_size;
1227                         }
1228
1229                         /*
1230                          * If this free space is greater than which we need,
1231                          * it must be the max free space that we have found
1232                          * until now, so max_hole_start must point to the start
1233                          * of this free space and the length of this free space
1234                          * is stored in max_hole_size. Thus, we return
1235                          * max_hole_start and max_hole_size and go back to the
1236                          * caller.
1237                          */
1238                         if (hole_size >= num_bytes) {
1239                                 ret = 0;
1240                                 goto out;
1241                         }
1242                 }
1243
1244                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1245                 extent_end = key.offset + btrfs_dev_extent_length(l,
1246                                                                   dev_extent);
1247                 if (extent_end > search_start)
1248                         search_start = extent_end;
1249 next:
1250                 path->slots[0]++;
1251                 cond_resched();
1252         }
1253
1254         /*
1255          * At this point, search_start should be the end of
1256          * allocated dev extents, and when shrinking the device,
1257          * search_end may be smaller than search_start.
1258          */
1259         if (search_end > search_start)
1260                 hole_size = search_end - search_start;
1261
1262         if (hole_size > max_hole_size) {
1263                 max_hole_start = search_start;
1264                 max_hole_size = hole_size;
1265         }
1266
1267         if (contains_pending_extent(trans, device, &search_start, hole_size)) {
1268                 btrfs_release_path(path);
1269                 goto again;
1270         }
1271
1272         /* See above. */
1273         if (hole_size < num_bytes)
1274                 ret = -ENOSPC;
1275         else
1276                 ret = 0;
1277
1278 out:
1279         btrfs_free_path(path);
1280         *start = max_hole_start;
1281         if (len)
1282                 *len = max_hole_size;
1283         return ret;
1284 }
1285
1286 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1287                           struct btrfs_device *device,
1288                           u64 start)
1289 {
1290         int ret;
1291         struct btrfs_path *path;
1292         struct btrfs_root *root = device->dev_root;
1293         struct btrfs_key key;
1294         struct btrfs_key found_key;
1295         struct extent_buffer *leaf = NULL;
1296         struct btrfs_dev_extent *extent = NULL;
1297
1298         path = btrfs_alloc_path();
1299         if (!path)
1300                 return -ENOMEM;
1301
1302         key.objectid = device->devid;
1303         key.offset = start;
1304         key.type = BTRFS_DEV_EXTENT_KEY;
1305 again:
1306         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1307         if (ret > 0) {
1308                 ret = btrfs_previous_item(root, path, key.objectid,
1309                                           BTRFS_DEV_EXTENT_KEY);
1310                 if (ret)
1311                         goto out;
1312                 leaf = path->nodes[0];
1313                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1314                 extent = btrfs_item_ptr(leaf, path->slots[0],
1315                                         struct btrfs_dev_extent);
1316                 BUG_ON(found_key.offset > start || found_key.offset +
1317                        btrfs_dev_extent_length(leaf, extent) < start);
1318                 key = found_key;
1319                 btrfs_release_path(path);
1320                 goto again;
1321         } else if (ret == 0) {
1322                 leaf = path->nodes[0];
1323                 extent = btrfs_item_ptr(leaf, path->slots[0],
1324                                         struct btrfs_dev_extent);
1325         } else {
1326                 btrfs_error(root->fs_info, ret, "Slot search failed");
1327                 goto out;
1328         }
1329
1330         if (device->bytes_used > 0) {
1331                 u64 len = btrfs_dev_extent_length(leaf, extent);
1332                 device->bytes_used -= len;
1333                 spin_lock(&root->fs_info->free_chunk_lock);
1334                 root->fs_info->free_chunk_space += len;
1335                 spin_unlock(&root->fs_info->free_chunk_lock);
1336         }
1337         ret = btrfs_del_item(trans, root, path);
1338         if (ret) {
1339                 btrfs_error(root->fs_info, ret,
1340                             "Failed to remove dev extent item");
1341         }
1342 out:
1343         btrfs_free_path(path);
1344         return ret;
1345 }
1346
1347 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1348                                   struct btrfs_device *device,
1349                                   u64 chunk_tree, u64 chunk_objectid,
1350                                   u64 chunk_offset, u64 start, u64 num_bytes)
1351 {
1352         int ret;
1353         struct btrfs_path *path;
1354         struct btrfs_root *root = device->dev_root;
1355         struct btrfs_dev_extent *extent;
1356         struct extent_buffer *leaf;
1357         struct btrfs_key key;
1358
1359         WARN_ON(!device->in_fs_metadata);
1360         WARN_ON(device->is_tgtdev_for_dev_replace);
1361         path = btrfs_alloc_path();
1362         if (!path)
1363                 return -ENOMEM;
1364
1365         key.objectid = device->devid;
1366         key.offset = start;
1367         key.type = BTRFS_DEV_EXTENT_KEY;
1368         ret = btrfs_insert_empty_item(trans, root, path, &key,
1369                                       sizeof(*extent));
1370         if (ret)
1371                 goto out;
1372
1373         leaf = path->nodes[0];
1374         extent = btrfs_item_ptr(leaf, path->slots[0],
1375                                 struct btrfs_dev_extent);
1376         btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1377         btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1378         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1379
1380         write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1381                     btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
1382
1383         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1384         btrfs_mark_buffer_dirty(leaf);
1385 out:
1386         btrfs_free_path(path);
1387         return ret;
1388 }
1389
1390 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1391 {
1392         struct extent_map_tree *em_tree;
1393         struct extent_map *em;
1394         struct rb_node *n;
1395         u64 ret = 0;
1396
1397         em_tree = &fs_info->mapping_tree.map_tree;
1398         read_lock(&em_tree->lock);
1399         n = rb_last(&em_tree->map);
1400         if (n) {
1401                 em = rb_entry(n, struct extent_map, rb_node);
1402                 ret = em->start + em->len;
1403         }
1404         read_unlock(&em_tree->lock);
1405
1406         return ret;
1407 }
1408
1409 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1410                                     u64 *devid_ret)
1411 {
1412         int ret;
1413         struct btrfs_key key;
1414         struct btrfs_key found_key;
1415         struct btrfs_path *path;
1416
1417         path = btrfs_alloc_path();
1418         if (!path)
1419                 return -ENOMEM;
1420
1421         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1422         key.type = BTRFS_DEV_ITEM_KEY;
1423         key.offset = (u64)-1;
1424
1425         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1426         if (ret < 0)
1427                 goto error;
1428
1429         BUG_ON(ret == 0); /* Corruption */
1430
1431         ret = btrfs_previous_item(fs_info->chunk_root, path,
1432                                   BTRFS_DEV_ITEMS_OBJECTID,
1433                                   BTRFS_DEV_ITEM_KEY);
1434         if (ret) {
1435                 *devid_ret = 1;
1436         } else {
1437                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1438                                       path->slots[0]);
1439                 *devid_ret = found_key.offset + 1;
1440         }
1441         ret = 0;
1442 error:
1443         btrfs_free_path(path);
1444         return ret;
1445 }
1446
1447 /*
1448  * the device information is stored in the chunk root
1449  * the btrfs_device struct should be fully filled in
1450  */
1451 static int btrfs_add_device(struct btrfs_trans_handle *trans,
1452                             struct btrfs_root *root,
1453                             struct btrfs_device *device)
1454 {
1455         int ret;
1456         struct btrfs_path *path;
1457         struct btrfs_dev_item *dev_item;
1458         struct extent_buffer *leaf;
1459         struct btrfs_key key;
1460         unsigned long ptr;
1461
1462         root = root->fs_info->chunk_root;
1463
1464         path = btrfs_alloc_path();
1465         if (!path)
1466                 return -ENOMEM;
1467
1468         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1469         key.type = BTRFS_DEV_ITEM_KEY;
1470         key.offset = device->devid;
1471
1472         ret = btrfs_insert_empty_item(trans, root, path, &key,
1473                                       sizeof(*dev_item));
1474         if (ret)
1475                 goto out;
1476
1477         leaf = path->nodes[0];
1478         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1479
1480         btrfs_set_device_id(leaf, dev_item, device->devid);
1481         btrfs_set_device_generation(leaf, dev_item, 0);
1482         btrfs_set_device_type(leaf, dev_item, device->type);
1483         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1484         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1485         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1486         btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1487         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1488         btrfs_set_device_group(leaf, dev_item, 0);
1489         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1490         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1491         btrfs_set_device_start_offset(leaf, dev_item, 0);
1492
1493         ptr = btrfs_device_uuid(dev_item);
1494         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1495         ptr = btrfs_device_fsid(dev_item);
1496         write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1497         btrfs_mark_buffer_dirty(leaf);
1498
1499         ret = 0;
1500 out:
1501         btrfs_free_path(path);
1502         return ret;
1503 }
1504
1505 /*
1506  * Function to update ctime/mtime for a given device path.
1507  * Mainly used for ctime/mtime based probe like libblkid.
1508  */
1509 static void update_dev_time(char *path_name)
1510 {
1511         struct file *filp;
1512
1513         filp = filp_open(path_name, O_RDWR, 0);
1514         if (!filp)
1515                 return;
1516         file_update_time(filp);
1517         filp_close(filp, NULL);
1518         return;
1519 }
1520
1521 static int btrfs_rm_dev_item(struct btrfs_root *root,
1522                              struct btrfs_device *device)
1523 {
1524         int ret;
1525         struct btrfs_path *path;
1526         struct btrfs_key key;
1527         struct btrfs_trans_handle *trans;
1528
1529         root = root->fs_info->chunk_root;
1530
1531         path = btrfs_alloc_path();
1532         if (!path)
1533                 return -ENOMEM;
1534
1535         trans = btrfs_start_transaction(root, 0);
1536         if (IS_ERR(trans)) {
1537                 btrfs_free_path(path);
1538                 return PTR_ERR(trans);
1539         }
1540         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1541         key.type = BTRFS_DEV_ITEM_KEY;
1542         key.offset = device->devid;
1543         lock_chunks(root);
1544
1545         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1546         if (ret < 0)
1547                 goto out;
1548
1549         if (ret > 0) {
1550                 ret = -ENOENT;
1551                 goto out;
1552         }
1553
1554         ret = btrfs_del_item(trans, root, path);
1555         if (ret)
1556                 goto out;
1557 out:
1558         btrfs_free_path(path);
1559         unlock_chunks(root);
1560         btrfs_commit_transaction(trans, root);
1561         return ret;
1562 }
1563
1564 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1565 {
1566         struct btrfs_device *device;
1567         struct btrfs_device *next_device;
1568         struct block_device *bdev;
1569         struct buffer_head *bh = NULL;
1570         struct btrfs_super_block *disk_super;
1571         struct btrfs_fs_devices *cur_devices;
1572         u64 all_avail;
1573         u64 devid;
1574         u64 num_devices;
1575         u8 *dev_uuid;
1576         unsigned seq;
1577         int ret = 0;
1578         bool clear_super = false;
1579
1580         mutex_lock(&uuid_mutex);
1581
1582         do {
1583                 seq = read_seqbegin(&root->fs_info->profiles_lock);
1584
1585                 all_avail = root->fs_info->avail_data_alloc_bits |
1586                             root->fs_info->avail_system_alloc_bits |
1587                             root->fs_info->avail_metadata_alloc_bits;
1588         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1589
1590         num_devices = root->fs_info->fs_devices->num_devices;
1591         btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1592         if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1593                 WARN_ON(num_devices < 1);
1594                 num_devices--;
1595         }
1596         btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1597
1598         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1599                 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1600                 goto out;
1601         }
1602
1603         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1604                 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1605                 goto out;
1606         }
1607
1608         if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1609             root->fs_info->fs_devices->rw_devices <= 2) {
1610                 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1611                 goto out;
1612         }
1613         if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1614             root->fs_info->fs_devices->rw_devices <= 3) {
1615                 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1616                 goto out;
1617         }
1618
1619         if (strcmp(device_path, "missing") == 0) {
1620                 struct list_head *devices;
1621                 struct btrfs_device *tmp;
1622
1623                 device = NULL;
1624                 devices = &root->fs_info->fs_devices->devices;
1625                 /*
1626                  * It is safe to read the devices since the volume_mutex
1627                  * is held.
1628                  */
1629                 list_for_each_entry(tmp, devices, dev_list) {
1630                         if (tmp->in_fs_metadata &&
1631                             !tmp->is_tgtdev_for_dev_replace &&
1632                             !tmp->bdev) {
1633                                 device = tmp;
1634                                 break;
1635                         }
1636                 }
1637                 bdev = NULL;
1638                 bh = NULL;
1639                 disk_super = NULL;
1640                 if (!device) {
1641                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1642                         goto out;
1643                 }
1644         } else {
1645                 ret = btrfs_get_bdev_and_sb(device_path,
1646                                             FMODE_WRITE | FMODE_EXCL,
1647                                             root->fs_info->bdev_holder, 0,
1648                                             &bdev, &bh);
1649                 if (ret)
1650                         goto out;
1651                 disk_super = (struct btrfs_super_block *)bh->b_data;
1652                 devid = btrfs_stack_device_id(&disk_super->dev_item);
1653                 dev_uuid = disk_super->dev_item.uuid;
1654                 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1655                                            disk_super->fsid);
1656                 if (!device) {
1657                         ret = -ENOENT;
1658                         goto error_brelse;
1659                 }
1660         }
1661
1662         if (device->is_tgtdev_for_dev_replace) {
1663                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1664                 goto error_brelse;
1665         }
1666
1667         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1668                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1669                 goto error_brelse;
1670         }
1671
1672         if (device->writeable) {
1673                 lock_chunks(root);
1674                 list_del_init(&device->dev_alloc_list);
1675                 unlock_chunks(root);
1676                 root->fs_info->fs_devices->rw_devices--;
1677                 clear_super = true;
1678         }
1679
1680         mutex_unlock(&uuid_mutex);
1681         ret = btrfs_shrink_device(device, 0);
1682         mutex_lock(&uuid_mutex);
1683         if (ret)
1684                 goto error_undo;
1685
1686         /*
1687          * TODO: the superblock still includes this device in its num_devices
1688          * counter although write_all_supers() is not locked out. This
1689          * could give a filesystem state which requires a degraded mount.
1690          */
1691         ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1692         if (ret)
1693                 goto error_undo;
1694
1695         spin_lock(&root->fs_info->free_chunk_lock);
1696         root->fs_info->free_chunk_space = device->total_bytes -
1697                 device->bytes_used;
1698         spin_unlock(&root->fs_info->free_chunk_lock);
1699
1700         device->in_fs_metadata = 0;
1701         btrfs_scrub_cancel_dev(root->fs_info, device);
1702
1703         /*
1704          * the device list mutex makes sure that we don't change
1705          * the device list while someone else is writing out all
1706          * the device supers. Whoever is writing all supers, should
1707          * lock the device list mutex before getting the number of
1708          * devices in the super block (super_copy). Conversely,
1709          * whoever updates the number of devices in the super block
1710          * (super_copy) should hold the device list mutex.
1711          */
1712
1713         cur_devices = device->fs_devices;
1714         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1715         list_del_rcu(&device->dev_list);
1716
1717         device->fs_devices->num_devices--;
1718         device->fs_devices->total_devices--;
1719
1720         if (device->missing)
1721                 device->fs_devices->missing_devices--;
1722
1723         next_device = list_entry(root->fs_info->fs_devices->devices.next,
1724                                  struct btrfs_device, dev_list);
1725         if (device->bdev == root->fs_info->sb->s_bdev)
1726                 root->fs_info->sb->s_bdev = next_device->bdev;
1727         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1728                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1729
1730         if (device->bdev) {
1731                 device->fs_devices->open_devices--;
1732                 /* remove sysfs entry */
1733                 btrfs_kobj_rm_device(root->fs_info, device);
1734         }
1735
1736         call_rcu(&device->rcu, free_device);
1737
1738         num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1739         btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1740         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1741
1742         if (cur_devices->open_devices == 0) {
1743                 struct btrfs_fs_devices *fs_devices;
1744                 fs_devices = root->fs_info->fs_devices;
1745                 while (fs_devices) {
1746                         if (fs_devices->seed == cur_devices) {
1747                                 fs_devices->seed = cur_devices->seed;
1748                                 break;
1749                         }
1750                         fs_devices = fs_devices->seed;
1751                 }
1752                 cur_devices->seed = NULL;
1753                 lock_chunks(root);
1754                 __btrfs_close_devices(cur_devices);
1755                 unlock_chunks(root);
1756                 free_fs_devices(cur_devices);
1757         }
1758
1759         root->fs_info->num_tolerated_disk_barrier_failures =
1760                 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1761
1762         /*
1763          * at this point, the device is zero sized.  We want to
1764          * remove it from the devices list and zero out the old super
1765          */
1766         if (clear_super && disk_super) {
1767                 u64 bytenr;
1768                 int i;
1769
1770                 /* make sure this device isn't detected as part of
1771                  * the FS anymore
1772                  */
1773                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1774                 set_buffer_dirty(bh);
1775                 sync_dirty_buffer(bh);
1776
1777                 /* clear the mirror copies of super block on the disk
1778                  * being removed, 0th copy is been taken care above and
1779                  * the below would take of the rest
1780                  */
1781                 for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1782                         bytenr = btrfs_sb_offset(i);
1783                         if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1784                                         i_size_read(bdev->bd_inode))
1785                                 break;
1786
1787                         brelse(bh);
1788                         bh = __bread(bdev, bytenr / 4096,
1789                                         BTRFS_SUPER_INFO_SIZE);
1790                         if (!bh)
1791                                 continue;
1792
1793                         disk_super = (struct btrfs_super_block *)bh->b_data;
1794
1795                         if (btrfs_super_bytenr(disk_super) != bytenr ||
1796                                 btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1797                                 continue;
1798                         }
1799                         memset(&disk_super->magic, 0,
1800                                                 sizeof(disk_super->magic));
1801                         set_buffer_dirty(bh);
1802                         sync_dirty_buffer(bh);
1803                 }
1804         }
1805
1806         ret = 0;
1807
1808         if (bdev) {
1809                 /* Notify udev that device has changed */
1810                 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1811
1812                 /* Update ctime/mtime for device path for libblkid */
1813                 update_dev_time(device_path);
1814         }
1815
1816 error_brelse:
1817         brelse(bh);
1818         if (bdev)
1819                 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1820 out:
1821         mutex_unlock(&uuid_mutex);
1822         return ret;
1823 error_undo:
1824         if (device->writeable) {
1825                 lock_chunks(root);
1826                 list_add(&device->dev_alloc_list,
1827                          &root->fs_info->fs_devices->alloc_list);
1828                 unlock_chunks(root);
1829                 root->fs_info->fs_devices->rw_devices++;
1830         }
1831         goto error_brelse;
1832 }
1833
1834 void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1835                                  struct btrfs_device *srcdev)
1836 {
1837         WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1838
1839         list_del_rcu(&srcdev->dev_list);
1840         list_del_rcu(&srcdev->dev_alloc_list);
1841         fs_info->fs_devices->num_devices--;
1842         if (srcdev->missing) {
1843                 fs_info->fs_devices->missing_devices--;
1844                 fs_info->fs_devices->rw_devices++;
1845         }
1846         if (srcdev->can_discard)
1847                 fs_info->fs_devices->num_can_discard--;
1848         if (srcdev->bdev) {
1849                 fs_info->fs_devices->open_devices--;
1850
1851                 /* zero out the old super */
1852                 btrfs_scratch_superblock(srcdev);
1853         }
1854
1855         call_rcu(&srcdev->rcu, free_device);
1856 }
1857
1858 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1859                                       struct btrfs_device *tgtdev)
1860 {
1861         struct btrfs_device *next_device;
1862
1863         WARN_ON(!tgtdev);
1864         mutex_lock(&fs_info->fs_devices->device_list_mutex);
1865         if (tgtdev->bdev) {
1866                 btrfs_scratch_superblock(tgtdev);
1867                 fs_info->fs_devices->open_devices--;
1868         }
1869         fs_info->fs_devices->num_devices--;
1870         if (tgtdev->can_discard)
1871                 fs_info->fs_devices->num_can_discard++;
1872
1873         next_device = list_entry(fs_info->fs_devices->devices.next,
1874                                  struct btrfs_device, dev_list);
1875         if (tgtdev->bdev == fs_info->sb->s_bdev)
1876                 fs_info->sb->s_bdev = next_device->bdev;
1877         if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1878                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1879         list_del_rcu(&tgtdev->dev_list);
1880
1881         call_rcu(&tgtdev->rcu, free_device);
1882
1883         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1884 }
1885
1886 static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1887                                      struct btrfs_device **device)
1888 {
1889         int ret = 0;
1890         struct btrfs_super_block *disk_super;
1891         u64 devid;
1892         u8 *dev_uuid;
1893         struct block_device *bdev;
1894         struct buffer_head *bh;
1895
1896         *device = NULL;
1897         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1898                                     root->fs_info->bdev_holder, 0, &bdev, &bh);
1899         if (ret)
1900                 return ret;
1901         disk_super = (struct btrfs_super_block *)bh->b_data;
1902         devid = btrfs_stack_device_id(&disk_super->dev_item);
1903         dev_uuid = disk_super->dev_item.uuid;
1904         *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1905                                     disk_super->fsid);
1906         brelse(bh);
1907         if (!*device)
1908                 ret = -ENOENT;
1909         blkdev_put(bdev, FMODE_READ);
1910         return ret;
1911 }
1912
1913 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1914                                          char *device_path,
1915                                          struct btrfs_device **device)
1916 {
1917         *device = NULL;
1918         if (strcmp(device_path, "missing") == 0) {
1919                 struct list_head *devices;
1920                 struct btrfs_device *tmp;
1921
1922                 devices = &root->fs_info->fs_devices->devices;
1923                 /*
1924                  * It is safe to read the devices since the volume_mutex
1925                  * is held by the caller.
1926                  */
1927                 list_for_each_entry(tmp, devices, dev_list) {
1928                         if (tmp->in_fs_metadata && !tmp->bdev) {
1929                                 *device = tmp;
1930                                 break;
1931                         }
1932                 }
1933
1934                 if (!*device) {
1935                         btrfs_err(root->fs_info, "no missing device found");
1936                         return -ENOENT;
1937                 }
1938
1939                 return 0;
1940         } else {
1941                 return btrfs_find_device_by_path(root, device_path, device);
1942         }
1943 }
1944
1945 /*
1946  * does all the dirty work required for changing file system's UUID.
1947  */
1948 static int btrfs_prepare_sprout(struct btrfs_root *root)
1949 {
1950         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1951         struct btrfs_fs_devices *old_devices;
1952         struct btrfs_fs_devices *seed_devices;
1953         struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1954         struct btrfs_device *device;
1955         u64 super_flags;
1956
1957         BUG_ON(!mutex_is_locked(&uuid_mutex));
1958         if (!fs_devices->seeding)
1959                 return -EINVAL;
1960
1961         seed_devices = __alloc_fs_devices();
1962         if (IS_ERR(seed_devices))
1963                 return PTR_ERR(seed_devices);
1964
1965         old_devices = clone_fs_devices(fs_devices);
1966         if (IS_ERR(old_devices)) {
1967                 kfree(seed_devices);
1968                 return PTR_ERR(old_devices);
1969         }
1970
1971         list_add(&old_devices->list, &fs_uuids);
1972
1973         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1974         seed_devices->opened = 1;
1975         INIT_LIST_HEAD(&seed_devices->devices);
1976         INIT_LIST_HEAD(&seed_devices->alloc_list);
1977         mutex_init(&seed_devices->device_list_mutex);
1978
1979         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1980         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1981                               synchronize_rcu);
1982
1983         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1984         list_for_each_entry(device, &seed_devices->devices, dev_list) {
1985                 device->fs_devices = seed_devices;
1986         }
1987
1988         fs_devices->seeding = 0;
1989         fs_devices->num_devices = 0;
1990         fs_devices->open_devices = 0;
1991         fs_devices->missing_devices = 0;
1992         fs_devices->num_can_discard = 0;
1993         fs_devices->rotating = 0;
1994         fs_devices->seed = seed_devices;
1995
1996         generate_random_uuid(fs_devices->fsid);
1997         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1998         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1999         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2000
2001         super_flags = btrfs_super_flags(disk_super) &
2002                       ~BTRFS_SUPER_FLAG_SEEDING;
2003         btrfs_set_super_flags(disk_super, super_flags);
2004
2005         return 0;
2006 }
2007
2008 /*
2009  * strore the expected generation for seed devices in device items.
2010  */
2011 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2012                                struct btrfs_root *root)
2013 {
2014         struct btrfs_path *path;
2015         struct extent_buffer *leaf;
2016         struct btrfs_dev_item *dev_item;
2017         struct btrfs_device *device;
2018         struct btrfs_key key;
2019         u8 fs_uuid[BTRFS_UUID_SIZE];
2020         u8 dev_uuid[BTRFS_UUID_SIZE];
2021         u64 devid;
2022         int ret;
2023
2024         path = btrfs_alloc_path();
2025         if (!path)
2026                 return -ENOMEM;
2027
2028         root = root->fs_info->chunk_root;
2029         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2030         key.offset = 0;
2031         key.type = BTRFS_DEV_ITEM_KEY;
2032
2033         while (1) {
2034                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2035                 if (ret < 0)
2036                         goto error;
2037
2038                 leaf = path->nodes[0];
2039 next_slot:
2040                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2041                         ret = btrfs_next_leaf(root, path);
2042                         if (ret > 0)
2043                                 break;
2044                         if (ret < 0)
2045                                 goto error;
2046                         leaf = path->nodes[0];
2047                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2048                         btrfs_release_path(path);
2049                         continue;
2050                 }
2051
2052                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2053                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2054                     key.type != BTRFS_DEV_ITEM_KEY)
2055                         break;
2056
2057                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2058                                           struct btrfs_dev_item);
2059                 devid = btrfs_device_id(leaf, dev_item);
2060                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2061                                    BTRFS_UUID_SIZE);
2062                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2063                                    BTRFS_UUID_SIZE);
2064                 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
2065                                            fs_uuid);
2066                 BUG_ON(!device); /* Logic error */
2067
2068                 if (device->fs_devices->seeding) {
2069                         btrfs_set_device_generation(leaf, dev_item,
2070                                                     device->generation);
2071                         btrfs_mark_buffer_dirty(leaf);
2072                 }
2073
2074                 path->slots[0]++;
2075                 goto next_slot;
2076         }
2077         ret = 0;
2078 error:
2079         btrfs_free_path(path);
2080         return ret;
2081 }
2082
2083 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2084 {
2085         struct request_queue *q;
2086         struct btrfs_trans_handle *trans;
2087         struct btrfs_device *device;
2088         struct block_device *bdev;
2089         struct list_head *devices;
2090         struct super_block *sb = root->fs_info->sb;
2091         struct rcu_string *name;
2092         u64 total_bytes;
2093         int seeding_dev = 0;
2094         int ret = 0;
2095
2096         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
2097                 return -EROFS;
2098
2099         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2100                                   root->fs_info->bdev_holder);
2101         if (IS_ERR(bdev))
2102                 return PTR_ERR(bdev);
2103
2104         if (root->fs_info->fs_devices->seeding) {
2105                 seeding_dev = 1;
2106                 down_write(&sb->s_umount);
2107                 mutex_lock(&uuid_mutex);
2108         }
2109
2110         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2111
2112         devices = &root->fs_info->fs_devices->devices;
2113
2114         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2115         list_for_each_entry(device, devices, dev_list) {
2116                 if (device->bdev == bdev) {
2117                         ret = -EEXIST;
2118                         mutex_unlock(
2119                                 &root->fs_info->fs_devices->device_list_mutex);
2120                         goto error;
2121                 }
2122         }
2123         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2124
2125         device = btrfs_alloc_device(root->fs_info, NULL, NULL);
2126         if (IS_ERR(device)) {
2127                 /* we can safely leave the fs_devices entry around */
2128                 ret = PTR_ERR(device);
2129                 goto error;
2130         }
2131
2132         name = rcu_string_strdup(device_path, GFP_NOFS);
2133         if (!name) {
2134                 kfree(device);
2135                 ret = -ENOMEM;
2136                 goto error;
2137         }
2138         rcu_assign_pointer(device->name, name);
2139
2140         trans = btrfs_start_transaction(root, 0);
2141         if (IS_ERR(trans)) {
2142                 rcu_string_free(device->name);
2143                 kfree(device);
2144                 ret = PTR_ERR(trans);
2145                 goto error;
2146         }
2147
2148         lock_chunks(root);
2149
2150         q = bdev_get_queue(bdev);
2151         if (blk_queue_discard(q))
2152                 device->can_discard = 1;
2153         device->writeable = 1;
2154         device->generation = trans->transid;
2155         device->io_width = root->sectorsize;
2156         device->io_align = root->sectorsize;
2157         device->sector_size = root->sectorsize;
2158         device->total_bytes = i_size_read(bdev->bd_inode);
2159         device->disk_total_bytes = device->total_bytes;
2160         device->dev_root = root->fs_info->dev_root;
2161         device->bdev = bdev;
2162         device->in_fs_metadata = 1;
2163         device->is_tgtdev_for_dev_replace = 0;
2164         device->mode = FMODE_EXCL;
2165         device->dev_stats_valid = 1;
2166         set_blocksize(device->bdev, 4096);
2167
2168         if (seeding_dev) {
2169                 sb->s_flags &= ~MS_RDONLY;
2170                 ret = btrfs_prepare_sprout(root);
2171                 BUG_ON(ret); /* -ENOMEM */
2172         }
2173
2174         device->fs_devices = root->fs_info->fs_devices;
2175
2176         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2177         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
2178         list_add(&device->dev_alloc_list,
2179                  &root->fs_info->fs_devices->alloc_list);
2180         root->fs_info->fs_devices->num_devices++;
2181         root->fs_info->fs_devices->open_devices++;
2182         root->fs_info->fs_devices->rw_devices++;
2183         root->fs_info->fs_devices->total_devices++;
2184         if (device->can_discard)
2185                 root->fs_info->fs_devices->num_can_discard++;
2186         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2187
2188         spin_lock(&root->fs_info->free_chunk_lock);
2189         root->fs_info->free_chunk_space += device->total_bytes;
2190         spin_unlock(&root->fs_info->free_chunk_lock);
2191
2192         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2193                 root->fs_info->fs_devices->rotating = 1;
2194
2195         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
2196         btrfs_set_super_total_bytes(root->fs_info->super_copy,
2197                                     total_bytes + device->total_bytes);
2198
2199         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
2200         btrfs_set_super_num_devices(root->fs_info->super_copy,
2201                                     total_bytes + 1);
2202
2203         /* add sysfs device entry */
2204         btrfs_kobj_add_device(root->fs_info, device);
2205
2206         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2207
2208         if (seeding_dev) {
2209                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2210                 ret = init_first_rw_device(trans, root, device);
2211                 if (ret) {
2212                         btrfs_abort_transaction(trans, root, ret);
2213                         goto error_trans;
2214                 }
2215                 ret = btrfs_finish_sprout(trans, root);
2216                 if (ret) {
2217                         btrfs_abort_transaction(trans, root, ret);
2218                         goto error_trans;
2219                 }
2220
2221                 /* Sprouting would change fsid of the mounted root,
2222                  * so rename the fsid on the sysfs
2223                  */
2224                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2225                                                 root->fs_info->fsid);
2226                 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
2227                         goto error_trans;
2228         } else {
2229                 ret = btrfs_add_device(trans, root, device);
2230                 if (ret) {
2231                         btrfs_abort_transaction(trans, root, ret);
2232                         goto error_trans;
2233                 }
2234         }
2235
2236         /*
2237          * we've got more storage, clear any full flags on the space
2238          * infos
2239          */
2240         btrfs_clear_space_info_full(root->fs_info);
2241
2242         unlock_chunks(root);
2243         root->fs_info->num_tolerated_disk_barrier_failures =
2244                 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2245         ret = btrfs_commit_transaction(trans, root);
2246
2247         if (seeding_dev) {
2248                 mutex_unlock(&uuid_mutex);
2249                 up_write(&sb->s_umount);
2250
2251                 if (ret) /* transaction commit */
2252                         return ret;
2253
2254                 ret = btrfs_relocate_sys_chunks(root);
2255                 if (ret < 0)
2256                         btrfs_error(root->fs_info, ret,
2257                                     "Failed to relocate sys chunks after "
2258                                     "device initialization. This can be fixed "
2259                                     "using the \"btrfs balance\" command.");
2260                 trans = btrfs_attach_transaction(root);
2261                 if (IS_ERR(trans)) {
2262                         if (PTR_ERR(trans) == -ENOENT)
2263                                 return 0;
2264                         return PTR_ERR(trans);
2265                 }
2266                 ret = btrfs_commit_transaction(trans, root);
2267         }
2268
2269         /* Update ctime/mtime for libblkid */
2270         update_dev_time(device_path);
2271         return ret;
2272
2273 error_trans:
2274         unlock_chunks(root);
2275         btrfs_end_transaction(trans, root);
2276         rcu_string_free(device->name);
2277         btrfs_kobj_rm_device(root->fs_info, device);
2278         kfree(device);
2279 error:
2280         blkdev_put(bdev, FMODE_EXCL);
2281         if (seeding_dev) {
2282                 mutex_unlock(&uuid_mutex);
2283                 up_write(&sb->s_umount);
2284         }
2285         return ret;
2286 }
2287
2288 int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2289                                   struct btrfs_device **device_out)
2290 {
2291         struct request_queue *q;
2292         struct btrfs_device *device;
2293         struct block_device *bdev;
2294         struct btrfs_fs_info *fs_info = root->fs_info;
2295         struct list_head *devices;
2296         struct rcu_string *name;
2297         u64 devid = BTRFS_DEV_REPLACE_DEVID;
2298         int ret = 0;
2299
2300         *device_out = NULL;
2301         if (fs_info->fs_devices->seeding)
2302                 return -EINVAL;
2303
2304         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2305                                   fs_info->bdev_holder);
2306         if (IS_ERR(bdev))
2307                 return PTR_ERR(bdev);
2308
2309         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2310
2311         devices = &fs_info->fs_devices->devices;
2312         list_for_each_entry(device, devices, dev_list) {
2313                 if (device->bdev == bdev) {
2314                         ret = -EEXIST;
2315                         goto error;
2316                 }
2317         }
2318
2319         device = btrfs_alloc_device(NULL, &devid, NULL);
2320         if (IS_ERR(device)) {
2321                 ret = PTR_ERR(device);
2322                 goto error;
2323         }
2324
2325         name = rcu_string_strdup(device_path, GFP_NOFS);
2326         if (!name) {
2327                 kfree(device);
2328                 ret = -ENOMEM;
2329                 goto error;
2330         }
2331         rcu_assign_pointer(device->name, name);
2332
2333         q = bdev_get_queue(bdev);
2334         if (blk_queue_discard(q))
2335                 device->can_discard = 1;
2336         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2337         device->writeable = 1;
2338         device->generation = 0;
2339         device->io_width = root->sectorsize;
2340         device->io_align = root->sectorsize;
2341         device->sector_size = root->sectorsize;
2342         device->total_bytes = i_size_read(bdev->bd_inode);
2343         device->disk_total_bytes = device->total_bytes;
2344         device->dev_root = fs_info->dev_root;
2345         device->bdev = bdev;
2346         device->in_fs_metadata = 1;
2347         device->is_tgtdev_for_dev_replace = 1;
2348         device->mode = FMODE_EXCL;
2349         device->dev_stats_valid = 1;
2350         set_blocksize(device->bdev, 4096);
2351         device->fs_devices = fs_info->fs_devices;
2352         list_add(&device->dev_list, &fs_info->fs_devices->devices);
2353         fs_info->fs_devices->num_devices++;
2354         fs_info->fs_devices->open_devices++;
2355         if (device->can_discard)
2356                 fs_info->fs_devices->num_can_discard++;
2357         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2358
2359         *device_out = device;
2360         return ret;
2361
2362 error:
2363         blkdev_put(bdev, FMODE_EXCL);
2364         return ret;
2365 }
2366
2367 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2368                                               struct btrfs_device *tgtdev)
2369 {
2370         WARN_ON(fs_info->fs_devices->rw_devices == 0);
2371         tgtdev->io_width = fs_info->dev_root->sectorsize;
2372         tgtdev->io_align = fs_info->dev_root->sectorsize;
2373         tgtdev->sector_size = fs_info->dev_root->sectorsize;
2374         tgtdev->dev_root = fs_info->dev_root;
2375         tgtdev->in_fs_metadata = 1;
2376 }
2377
2378 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2379                                         struct btrfs_device *device)
2380 {
2381         int ret;
2382         struct btrfs_path *path;
2383         struct btrfs_root *root;
2384         struct btrfs_dev_item *dev_item;
2385         struct extent_buffer *leaf;
2386         struct btrfs_key key;
2387
2388         root = device->dev_root->fs_info->chunk_root;
2389
2390         path = btrfs_alloc_path();
2391         if (!path)
2392                 return -ENOMEM;
2393
2394         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2395         key.type = BTRFS_DEV_ITEM_KEY;
2396         key.offset = device->devid;
2397
2398         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2399         if (ret < 0)
2400                 goto out;
2401
2402         if (ret > 0) {
2403                 ret = -ENOENT;
2404                 goto out;
2405         }
2406
2407         leaf = path->nodes[0];
2408         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2409
2410         btrfs_set_device_id(leaf, dev_item, device->devid);
2411         btrfs_set_device_type(leaf, dev_item, device->type);
2412         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2413         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2414         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2415         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
2416         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
2417         btrfs_mark_buffer_dirty(leaf);
2418
2419 out:
2420         btrfs_free_path(path);
2421         return ret;
2422 }
2423
2424 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
2425                       struct btrfs_device *device, u64 new_size)
2426 {
2427         struct btrfs_super_block *super_copy =
2428                 device->dev_root->fs_info->super_copy;
2429         u64 old_total = btrfs_super_total_bytes(super_copy);
2430         u64 diff = new_size - device->total_bytes;
2431
2432         if (!device->writeable)
2433                 return -EACCES;
2434         if (new_size <= device->total_bytes ||
2435             device->is_tgtdev_for_dev_replace)
2436                 return -EINVAL;
2437
2438         btrfs_set_super_total_bytes(super_copy, old_total + diff);
2439         device->fs_devices->total_rw_bytes += diff;
2440
2441         device->total_bytes = new_size;
2442         device->disk_total_bytes = new_size;
2443         btrfs_clear_space_info_full(device->dev_root->fs_info);
2444
2445         return btrfs_update_device(trans, device);
2446 }
2447
2448 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2449                       struct btrfs_device *device, u64 new_size)
2450 {
2451         int ret;
2452         lock_chunks(device->dev_root);
2453         ret = __btrfs_grow_device(trans, device, new_size);
2454         unlock_chunks(device->dev_root);
2455         return ret;
2456 }
2457
2458 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2459                             struct btrfs_root *root,
2460                             u64 chunk_tree, u64 chunk_objectid,
2461                             u64 chunk_offset)
2462 {
2463         int ret;
2464         struct btrfs_path *path;
2465         struct btrfs_key key;
2466
2467         root = root->fs_info->chunk_root;
2468         path = btrfs_alloc_path();
2469         if (!path)
2470                 return -ENOMEM;
2471
2472         key.objectid = chunk_objectid;
2473         key.offset = chunk_offset;
2474         key.type = BTRFS_CHUNK_ITEM_KEY;
2475
2476         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2477         if (ret < 0)
2478                 goto out;
2479         else if (ret > 0) { /* Logic error or corruption */
2480                 btrfs_error(root->fs_info, -ENOENT,
2481                             "Failed lookup while freeing chunk.");
2482                 ret = -ENOENT;
2483                 goto out;
2484         }
2485
2486         ret = btrfs_del_item(trans, root, path);
2487         if (ret < 0)
2488                 btrfs_error(root->fs_info, ret,
2489                             "Failed to delete chunk item.");
2490 out:
2491         btrfs_free_path(path);
2492         return ret;
2493 }
2494
2495 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2496                         chunk_offset)
2497 {
2498         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2499         struct btrfs_disk_key *disk_key;
2500         struct btrfs_chunk *chunk;
2501         u8 *ptr;
2502         int ret = 0;
2503         u32 num_stripes;
2504         u32 array_size;
2505         u32 len = 0;
2506         u32 cur;
2507         struct btrfs_key key;
2508
2509         array_size = btrfs_super_sys_array_size(super_copy);
2510
2511         ptr = super_copy->sys_chunk_array;
2512         cur = 0;
2513
2514         while (cur < array_size) {
2515                 disk_key = (struct btrfs_disk_key *)ptr;
2516                 btrfs_disk_key_to_cpu(&key, disk_key);
2517
2518                 len = sizeof(*disk_key);
2519
2520                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2521                         chunk = (struct btrfs_chunk *)(ptr + len);
2522                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2523                         len += btrfs_chunk_item_size(num_stripes);
2524                 } else {
2525                         ret = -EIO;
2526                         break;
2527                 }
2528                 if (key.objectid == chunk_objectid &&
2529                     key.offset == chunk_offset) {
2530                         memmove(ptr, ptr + len, array_size - (cur + len));
2531                         array_size -= len;
2532                         btrfs_set_super_sys_array_size(super_copy, array_size);
2533                 } else {
2534                         ptr += len;
2535                         cur += len;
2536                 }
2537         }
2538         return ret;
2539 }
2540
2541 static int btrfs_relocate_chunk(struct btrfs_root *root,
2542                          u64 chunk_tree, u64 chunk_objectid,
2543                          u64 chunk_offset)
2544 {
2545         struct extent_map_tree *em_tree;
2546         struct btrfs_root *extent_root;
2547         struct btrfs_trans_handle *trans;
2548         struct extent_map *em;
2549         struct map_lookup *map;
2550         int ret;
2551         int i;
2552
2553         root = root->fs_info->chunk_root;
2554         extent_root = root->fs_info->extent_root;
2555         em_tree = &root->fs_info->mapping_tree.map_tree;
2556
2557         ret = btrfs_can_relocate(extent_root, chunk_offset);
2558         if (ret)
2559                 return -ENOSPC;
2560
2561         /* step one, relocate all the extents inside this chunk */
2562         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2563         if (ret)
2564                 return ret;
2565
2566         trans = btrfs_start_transaction(root, 0);
2567         if (IS_ERR(trans)) {
2568                 ret = PTR_ERR(trans);
2569                 btrfs_std_error(root->fs_info, ret);
2570                 return ret;
2571         }
2572
2573         lock_chunks(root);
2574
2575         /*
2576          * step two, delete the device extents and the
2577          * chunk tree entries
2578          */
2579         read_lock(&em_tree->lock);
2580         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2581         read_unlock(&em_tree->lock);
2582
2583         BUG_ON(!em || em->start > chunk_offset ||
2584                em->start + em->len < chunk_offset);
2585         map = (struct map_lookup *)em->bdev;
2586
2587         for (i = 0; i < map->num_stripes; i++) {
2588                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
2589                                             map->stripes[i].physical);
2590                 BUG_ON(ret);
2591
2592                 if (map->stripes[i].dev) {
2593                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2594                         BUG_ON(ret);
2595                 }
2596         }
2597         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2598                                chunk_offset);
2599
2600         BUG_ON(ret);
2601
2602         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2603
2604         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2605                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2606                 BUG_ON(ret);
2607         }
2608
2609         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2610         BUG_ON(ret);
2611
2612         write_lock(&em_tree->lock);
2613         remove_extent_mapping(em_tree, em);
2614         write_unlock(&em_tree->lock);
2615
2616         /* once for the tree */
2617         free_extent_map(em);
2618         /* once for us */
2619         free_extent_map(em);
2620
2621         unlock_chunks(root);
2622         btrfs_end_transaction(trans, root);
2623         return 0;
2624 }
2625
2626 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2627 {
2628         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2629         struct btrfs_path *path;
2630         struct extent_buffer *leaf;
2631         struct btrfs_chunk *chunk;
2632         struct btrfs_key key;
2633         struct btrfs_key found_key;
2634         u64 chunk_tree = chunk_root->root_key.objectid;
2635         u64 chunk_type;
2636         bool retried = false;
2637         int failed = 0;
2638         int ret;
2639
2640         path = btrfs_alloc_path();
2641         if (!path)
2642                 return -ENOMEM;
2643
2644 again:
2645         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2646         key.offset = (u64)-1;
2647         key.type = BTRFS_CHUNK_ITEM_KEY;
2648
2649         while (1) {
2650                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2651                 if (ret < 0)
2652                         goto error;
2653                 BUG_ON(ret == 0); /* Corruption */
2654
2655                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2656                                           key.type);
2657                 if (ret < 0)
2658                         goto error;
2659                 if (ret > 0)
2660                         break;
2661
2662                 leaf = path->nodes[0];
2663                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2664
2665                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2666                                        struct btrfs_chunk);
2667                 chunk_type = btrfs_chunk_type(leaf, chunk);
2668                 btrfs_release_path(path);
2669
2670                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2671                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2672                                                    found_key.objectid,
2673                                                    found_key.offset);
2674                         if (ret == -ENOSPC)
2675                                 failed++;
2676                         else if (ret)
2677                                 BUG();
2678                 }
2679
2680                 if (found_key.offset == 0)
2681                         break;
2682                 key.offset = found_key.offset - 1;
2683         }
2684         ret = 0;
2685         if (failed && !retried) {
2686                 failed = 0;
2687                 retried = true;
2688                 goto again;
2689         } else if (WARN_ON(failed && retried)) {
2690                 ret = -ENOSPC;
2691         }
2692 error:
2693         btrfs_free_path(path);
2694         return ret;
2695 }
2696
2697 static int insert_balance_item(struct btrfs_root *root,
2698                                struct btrfs_balance_control *bctl)
2699 {
2700         struct btrfs_trans_handle *trans;
2701         struct btrfs_balance_item *item;
2702         struct btrfs_disk_balance_args disk_bargs;
2703         struct btrfs_path *path;
2704         struct extent_buffer *leaf;
2705         struct btrfs_key key;
2706         int ret, err;
2707
2708         path = btrfs_alloc_path();
2709         if (!path)
2710                 return -ENOMEM;
2711
2712         trans = btrfs_start_transaction(root, 0);
2713         if (IS_ERR(trans)) {
2714                 btrfs_free_path(path);
2715                 return PTR_ERR(trans);
2716         }
2717
2718         key.objectid = BTRFS_BALANCE_OBJECTID;
2719         key.type = BTRFS_BALANCE_ITEM_KEY;
2720         key.offset = 0;
2721
2722         ret = btrfs_insert_empty_item(trans, root, path, &key,
2723                                       sizeof(*item));
2724         if (ret)
2725                 goto out;
2726
2727         leaf = path->nodes[0];
2728         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2729
2730         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2731
2732         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2733         btrfs_set_balance_data(leaf, item, &disk_bargs);
2734         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2735         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2736         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2737         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2738
2739         btrfs_set_balance_flags(leaf, item, bctl->flags);
2740
2741         btrfs_mark_buffer_dirty(leaf);
2742 out:
2743         btrfs_free_path(path);
2744         err = btrfs_commit_transaction(trans, root);
2745         if (err && !ret)
2746                 ret = err;
2747         return ret;
2748 }
2749
2750 static int del_balance_item(struct btrfs_root *root)
2751 {
2752         struct btrfs_trans_handle *trans;
2753         struct btrfs_path *path;
2754         struct btrfs_key key;
2755         int ret, err;
2756
2757         path = btrfs_alloc_path();
2758         if (!path)
2759                 return -ENOMEM;
2760
2761         trans = btrfs_start_transaction(root, 0);
2762         if (IS_ERR(trans)) {
2763                 btrfs_free_path(path);
2764                 return PTR_ERR(trans);
2765         }
2766
2767         key.objectid = BTRFS_BALANCE_OBJECTID;
2768         key.type = BTRFS_BALANCE_ITEM_KEY;
2769         key.offset = 0;
2770
2771         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2772         if (ret < 0)
2773                 goto out;
2774         if (ret > 0) {
2775                 ret = -ENOENT;
2776                 goto out;
2777         }
2778
2779         ret = btrfs_del_item(trans, root, path);
2780 out:
2781         btrfs_free_path(path);
2782         err = btrfs_commit_transaction(trans, root);
2783         if (err && !ret)
2784                 ret = err;
2785         return ret;
2786 }
2787
2788 /*
2789  * This is a heuristic used to reduce the number of chunks balanced on
2790  * resume after balance was interrupted.
2791  */
2792 static void update_balance_args(struct btrfs_balance_control *bctl)
2793 {
2794         /*
2795          * Turn on soft mode for chunk types that were being converted.
2796          */
2797         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2798                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2799         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2800                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2801         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2802                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2803
2804         /*
2805          * Turn on usage filter if is not already used.  The idea is
2806          * that chunks that we have already balanced should be
2807          * reasonably full.  Don't do it for chunks that are being
2808          * converted - that will keep us from relocating unconverted
2809          * (albeit full) chunks.
2810          */
2811         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2812             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2813                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2814                 bctl->data.usage = 90;
2815         }
2816         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2817             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2818                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2819                 bctl->sys.usage = 90;
2820         }
2821         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2822             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2823                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2824                 bctl->meta.usage = 90;
2825         }
2826 }
2827
2828 /*
2829  * Should be called with both balance and volume mutexes held to
2830  * serialize other volume operations (add_dev/rm_dev/resize) with
2831  * restriper.  Same goes for unset_balance_control.
2832  */
2833 static void set_balance_control(struct btrfs_balance_control *bctl)
2834 {
2835         struct btrfs_fs_info *fs_info = bctl->fs_info;
2836
2837         BUG_ON(fs_info->balance_ctl);
2838
2839         spin_lock(&fs_info->balance_lock);
2840         fs_info->balance_ctl = bctl;
2841         spin_unlock(&fs_info->balance_lock);
2842 }
2843
2844 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2845 {
2846         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2847
2848         BUG_ON(!fs_info->balance_ctl);
2849
2850         spin_lock(&fs_info->balance_lock);
2851         fs_info->balance_ctl = NULL;
2852         spin_unlock(&fs_info->balance_lock);
2853
2854         kfree(bctl);
2855 }
2856
2857 /*
2858  * Balance filters.  Return 1 if chunk should be filtered out
2859  * (should not be balanced).
2860  */
2861 static int chunk_profiles_filter(u64 chunk_type,
2862                                  struct btrfs_balance_args *bargs)
2863 {
2864         chunk_type = chunk_to_extended(chunk_type) &
2865                                 BTRFS_EXTENDED_PROFILE_MASK;
2866
2867         if (bargs->profiles & chunk_type)
2868                 return 0;
2869
2870         return 1;
2871 }
2872
2873 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2874                               struct btrfs_balance_args *bargs)
2875 {
2876         struct btrfs_block_group_cache *cache;
2877         u64 chunk_used, user_thresh;
2878         int ret = 1;
2879
2880         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2881         chunk_used = btrfs_block_group_used(&cache->item);
2882
2883         if (bargs->usage == 0)
2884                 user_thresh = 1;
2885         else if (bargs->usage > 100)
2886                 user_thresh = cache->key.offset;
2887         else
2888                 user_thresh = div_factor_fine(cache->key.offset,
2889                                               bargs->usage);
2890
2891         if (chunk_used < user_thresh)
2892                 ret = 0;
2893
2894         btrfs_put_block_group(cache);
2895         return ret;
2896 }
2897
2898 static int chunk_devid_filter(struct extent_buffer *leaf,
2899                               struct btrfs_chunk *chunk,
2900                               struct btrfs_balance_args *bargs)
2901 {
2902         struct btrfs_stripe *stripe;
2903         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2904         int i;
2905
2906         for (i = 0; i < num_stripes; i++) {
2907                 stripe = btrfs_stripe_nr(chunk, i);
2908                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2909                         return 0;
2910         }
2911
2912         return 1;
2913 }
2914
2915 /* [pstart, pend) */
2916 static int chunk_drange_filter(struct extent_buffer *leaf,
2917                                struct btrfs_chunk *chunk,
2918                                u64 chunk_offset,
2919                                struct btrfs_balance_args *bargs)
2920 {
2921         struct btrfs_stripe *stripe;
2922         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2923         u64 stripe_offset;
2924         u64 stripe_length;
2925         int factor;
2926         int i;
2927
2928         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2929                 return 0;
2930
2931         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2932              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
2933                 factor = num_stripes / 2;
2934         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
2935                 factor = num_stripes - 1;
2936         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
2937                 factor = num_stripes - 2;
2938         } else {
2939                 factor = num_stripes;
2940         }
2941
2942         for (i = 0; i < num_stripes; i++) {
2943                 stripe = btrfs_stripe_nr(chunk, i);
2944                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2945                         continue;
2946
2947                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2948                 stripe_length = btrfs_chunk_length(leaf, chunk);
2949                 do_div(stripe_length, factor);
2950
2951                 if (stripe_offset < bargs->pend &&
2952                     stripe_offset + stripe_length > bargs->pstart)
2953                         return 0;
2954         }
2955
2956         return 1;
2957 }
2958
2959 /* [vstart, vend) */
2960 static int chunk_vrange_filter(struct extent_buffer *leaf,
2961                                struct btrfs_chunk *chunk,
2962                                u64 chunk_offset,
2963                                struct btrfs_balance_args *bargs)
2964 {
2965         if (chunk_offset < bargs->vend &&
2966             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2967                 /* at least part of the chunk is inside this vrange */
2968                 return 0;
2969
2970         return 1;
2971 }
2972
2973 static int chunk_soft_convert_filter(u64 chunk_type,
2974                                      struct btrfs_balance_args *bargs)
2975 {
2976         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2977                 return 0;
2978
2979         chunk_type = chunk_to_extended(chunk_type) &
2980                                 BTRFS_EXTENDED_PROFILE_MASK;
2981
2982         if (bargs->target == chunk_type)
2983                 return 1;
2984
2985         return 0;
2986 }
2987
2988 static int should_balance_chunk(struct btrfs_root *root,
2989                                 struct extent_buffer *leaf,
2990                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2991 {
2992         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2993         struct btrfs_balance_args *bargs = NULL;
2994         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2995
2996         /* type filter */
2997         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2998               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2999                 return 0;
3000         }
3001
3002         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3003                 bargs = &bctl->data;
3004         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3005                 bargs = &bctl->sys;
3006         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3007                 bargs = &bctl->meta;
3008
3009         /* profiles filter */
3010         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3011             chunk_profiles_filter(chunk_type, bargs)) {
3012                 return 0;
3013         }
3014
3015         /* usage filter */
3016         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3017             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
3018                 return 0;
3019         }
3020
3021         /* devid filter */
3022         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3023             chunk_devid_filter(leaf, chunk, bargs)) {
3024                 return 0;
3025         }
3026
3027         /* drange filter, makes sense only with devid filter */
3028         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3029             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
3030                 return 0;
3031         }
3032
3033         /* vrange filter */
3034         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3035             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3036                 return 0;
3037         }
3038
3039         /* soft profile changing mode */
3040         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3041             chunk_soft_convert_filter(chunk_type, bargs)) {
3042                 return 0;
3043         }
3044
3045         /*
3046          * limited by count, must be the last filter
3047          */
3048         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3049                 if (bargs->limit == 0)
3050                         return 0;
3051                 else
3052                         bargs->limit--;
3053         }
3054
3055         return 1;
3056 }
3057
3058 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3059 {
3060         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3061         struct btrfs_root *chunk_root = fs_info->chunk_root;
3062         struct btrfs_root *dev_root = fs_info->dev_root;
3063         struct list_head *devices;
3064         struct btrfs_device *device;
3065         u64 old_size;
3066         u64 size_to_free;
3067         struct btrfs_chunk *chunk;
3068         struct btrfs_path *path;
3069         struct btrfs_key key;
3070         struct btrfs_key found_key;
3071         struct btrfs_trans_handle *trans;
3072         struct extent_buffer *leaf;
3073         int slot;
3074         int ret;
3075         int enospc_errors = 0;
3076         bool counting = true;
3077         u64 limit_data = bctl->data.limit;
3078         u64 limit_meta = bctl->meta.limit;
3079         u64 limit_sys = bctl->sys.limit;
3080
3081         /* step one make some room on all the devices */
3082         devices = &fs_info->fs_devices->devices;
3083         list_for_each_entry(device, devices, dev_list) {
3084                 old_size = device->total_bytes;
3085                 size_to_free = div_factor(old_size, 1);
3086                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
3087                 if (!device->writeable ||
3088                     device->total_bytes - device->bytes_used > size_to_free ||
3089                     device->is_tgtdev_for_dev_replace)
3090                         continue;
3091
3092                 ret = btrfs_shrink_device(device, old_size - size_to_free);
3093                 if (ret == -ENOSPC)
3094                         break;
3095                 BUG_ON(ret);
3096
3097                 trans = btrfs_start_transaction(dev_root, 0);
3098                 BUG_ON(IS_ERR(trans));
3099
3100                 ret = btrfs_grow_device(trans, device, old_size);
3101                 BUG_ON(ret);
3102
3103                 btrfs_end_transaction(trans, dev_root);
3104         }
3105
3106         /* step two, relocate all the chunks */
3107         path = btrfs_alloc_path();
3108         if (!path) {
3109                 ret = -ENOMEM;
3110                 goto error;
3111         }
3112
3113         /* zero out stat counters */
3114         spin_lock(&fs_info->balance_lock);
3115         memset(&bctl->stat, 0, sizeof(bctl->stat));
3116         spin_unlock(&fs_info->balance_lock);
3117 again:
3118         if (!counting) {
3119                 bctl->data.limit = limit_data;
3120                 bctl->meta.limit = limit_meta;
3121                 bctl->sys.limit = limit_sys;
3122         }
3123         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3124         key.offset = (u64)-1;
3125         key.type = BTRFS_CHUNK_ITEM_KEY;
3126
3127         while (1) {
3128                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3129                     atomic_read(&fs_info->balance_cancel_req)) {
3130                         ret = -ECANCELED;
3131                         goto error;
3132                 }
3133
3134                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3135                 if (ret < 0)
3136                         goto error;
3137
3138                 /*
3139                  * this shouldn't happen, it means the last relocate
3140                  * failed
3141                  */
3142                 if (ret == 0)
3143                         BUG(); /* FIXME break ? */
3144
3145                 ret = btrfs_previous_item(chunk_root, path, 0,
3146                                           BTRFS_CHUNK_ITEM_KEY);
3147                 if (ret) {
3148                         ret = 0;
3149                         break;
3150                 }
3151
3152                 leaf = path->nodes[0];
3153                 slot = path->slots[0];
3154                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3155
3156                 if (found_key.objectid != key.objectid)
3157                         break;
3158
3159                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3160
3161                 if (!counting) {
3162                         spin_lock(&fs_info->balance_lock);
3163                         bctl->stat.considered++;
3164                         spin_unlock(&fs_info->balance_lock);
3165                 }
3166
3167                 ret = should_balance_chunk(chunk_root, leaf, chunk,
3168                                            found_key.offset);
3169                 btrfs_release_path(path);
3170                 if (!ret)
3171                         goto loop;
3172
3173                 if (counting) {
3174                         spin_lock(&fs_info->balance_lock);
3175                         bctl->stat.expected++;
3176                         spin_unlock(&fs_info->balance_lock);
3177                         goto loop;
3178                 }
3179
3180                 ret = btrfs_relocate_chunk(chunk_root,
3181                                            chunk_root->root_key.objectid,
3182                                            found_key.objectid,
3183                                            found_key.offset);
3184                 if (ret && ret != -ENOSPC)
3185                         goto error;
3186                 if (ret == -ENOSPC) {
3187                         enospc_errors++;
3188                 } else {
3189                         spin_lock(&fs_info->balance_lock);
3190                         bctl->stat.completed++;
3191                         spin_unlock(&fs_info->balance_lock);
3192                 }
3193 loop:
3194                 if (found_key.offset == 0)
3195                         break;
3196                 key.offset = found_key.offset - 1;
3197         }
3198
3199         if (counting) {
3200                 btrfs_release_path(path);
3201                 counting = false;
3202                 goto again;
3203         }
3204 error:
3205         btrfs_free_path(path);
3206         if (enospc_errors) {
3207                 btrfs_info(fs_info, "%d enospc errors during balance",
3208                        enospc_errors);
3209                 if (!ret)
3210                         ret = -ENOSPC;
3211         }
3212
3213         return ret;
3214 }
3215
3216 /**
3217  * alloc_profile_is_valid - see if a given profile is valid and reduced
3218  * @flags: profile to validate
3219  * @extended: if true @flags is treated as an extended profile
3220  */
3221 static int alloc_profile_is_valid(u64 flags, int extended)
3222 {
3223         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3224                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3225
3226         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3227
3228         /* 1) check that all other bits are zeroed */
3229         if (flags & ~mask)
3230                 return 0;
3231
3232         /* 2) see if profile is reduced */
3233         if (flags == 0)
3234                 return !extended; /* "0" is valid for usual profiles */
3235
3236         /* true if exactly one bit set */
3237         return (flags & (flags - 1)) == 0;
3238 }
3239
3240 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3241 {
3242         /* cancel requested || normal exit path */
3243         return atomic_read(&fs_info->balance_cancel_req) ||
3244                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3245                  atomic_read(&fs_info->balance_cancel_req) == 0);
3246 }
3247
3248 static void __cancel_balance(struct btrfs_fs_info *fs_info)
3249 {
3250         int ret;
3251
3252         unset_balance_control(fs_info);
3253         ret = del_balance_item(fs_info->tree_root);
3254         if (ret)
3255                 btrfs_std_error(fs_info, ret);
3256
3257         atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3258 }
3259
3260 /*
3261  * Should be called with both balance and volume mutexes held
3262  */
3263 int btrfs_balance(struct btrfs_balance_control *bctl,
3264                   struct btrfs_ioctl_balance_args *bargs)
3265 {
3266         struct btrfs_fs_info *fs_info = bctl->fs_info;
3267         u64 allowed;
3268         int mixed = 0;
3269         int ret;
3270         u64 num_devices;
3271         unsigned seq;
3272
3273         if (btrfs_fs_closing(fs_info) ||
3274             atomic_read(&fs_info->balance_pause_req) ||
3275             atomic_read(&fs_info->balance_cancel_req)) {
3276                 ret = -EINVAL;
3277                 goto out;
3278         }
3279
3280         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3281         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3282                 mixed = 1;
3283
3284         /*
3285          * In case of mixed groups both data and meta should be picked,
3286          * and identical options should be given for both of them.
3287          */
3288         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3289         if (mixed && (bctl->flags & allowed)) {
3290                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3291                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3292                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3293                         btrfs_err(fs_info, "with mixed groups data and "
3294                                    "metadata balance options must be the same");
3295                         ret = -EINVAL;
3296                         goto out;
3297                 }
3298         }
3299
3300         num_devices = fs_info->fs_devices->num_devices;
3301         btrfs_dev_replace_lock(&fs_info->dev_replace);
3302         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3303                 BUG_ON(num_devices < 1);
3304                 num_devices--;
3305         }
3306         btrfs_dev_replace_unlock(&fs_info->dev_replace);
3307         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3308         if (num_devices == 1)
3309                 allowed |= BTRFS_BLOCK_GROUP_DUP;
3310         else if (num_devices > 1)
3311                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3312         if (num_devices > 2)
3313                 allowed |= BTRFS_BLOCK_GROUP_RAID5;
3314         if (num_devices > 3)
3315                 allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3316                             BTRFS_BLOCK_GROUP_RAID6);
3317         if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3318             (!alloc_profile_is_valid(bctl->data.target, 1) ||
3319              (bctl->data.target & ~allowed))) {
3320                 btrfs_err(fs_info, "unable to start balance with target "
3321                            "data profile %llu",
3322                        bctl->data.target);
3323                 ret = -EINVAL;
3324                 goto out;
3325         }
3326         if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3327             (!alloc_profile_is_valid(bctl->meta.target, 1) ||
3328              (bctl->meta.target & ~allowed))) {
3329                 btrfs_err(fs_info,
3330                            "unable to start balance with target metadata profile %llu",
3331                        bctl->meta.target);
3332                 ret = -EINVAL;
3333                 goto out;
3334         }
3335         if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3336             (!alloc_profile_is_valid(bctl->sys.target, 1) ||
3337              (bctl->sys.target & ~allowed))) {
3338                 btrfs_err(fs_info,
3339                            "unable to start balance with target system profile %llu",
3340                        bctl->sys.target);
3341                 ret = -EINVAL;
3342                 goto out;
3343         }
3344
3345         /* allow dup'ed data chunks only in mixed mode */
3346         if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3347             (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
3348                 btrfs_err(fs_info, "dup for data is not allowed");
3349                 ret = -EINVAL;
3350                 goto out;
3351         }
3352
3353         /* allow to reduce meta or sys integrity only if force set */
3354         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3355                         BTRFS_BLOCK_GROUP_RAID10 |
3356                         BTRFS_BLOCK_GROUP_RAID5 |
3357                         BTRFS_BLOCK_GROUP_RAID6;
3358         do {
3359                 seq = read_seqbegin(&fs_info->profiles_lock);
3360
3361                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3362                      (fs_info->avail_system_alloc_bits & allowed) &&
3363                      !(bctl->sys.target & allowed)) ||
3364                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3365                      (fs_info->avail_metadata_alloc_bits & allowed) &&
3366                      !(bctl->meta.target & allowed))) {
3367                         if (bctl->flags & BTRFS_BALANCE_FORCE) {
3368                                 btrfs_info(fs_info, "force reducing metadata integrity");
3369                         } else {
3370                                 btrfs_err(fs_info, "balance will reduce metadata "
3371                                            "integrity, use force if you want this");
3372                                 ret = -EINVAL;
3373                                 goto out;
3374                         }
3375                 }
3376         } while (read_seqretry(&fs_info->profiles_lock, seq));
3377
3378         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3379                 int num_tolerated_disk_barrier_failures;
3380                 u64 target = bctl->sys.target;
3381
3382                 num_tolerated_disk_barrier_failures =
3383                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3384                 if (num_tolerated_disk_barrier_failures > 0 &&
3385                     (target &
3386                      (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3387                       BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
3388                         num_tolerated_disk_barrier_failures = 0;
3389                 else if (num_tolerated_disk_barrier_failures > 1 &&
3390                          (target &
3391                           (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
3392                         num_tolerated_disk_barrier_failures = 1;
3393
3394                 fs_info->num_tolerated_disk_barrier_failures =
3395                         num_tolerated_disk_barrier_failures;
3396         }
3397
3398         ret = insert_balance_item(fs_info->tree_root, bctl);
3399         if (ret && ret != -EEXIST)
3400                 goto out;
3401
3402         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
3403                 BUG_ON(ret == -EEXIST);
3404                 set_balance_control(bctl);
3405         } else {
3406                 BUG_ON(ret != -EEXIST);
3407                 spin_lock(&fs_info->balance_lock);
3408                 update_balance_args(bctl);
3409                 spin_unlock(&fs_info->balance_lock);
3410         }
3411
3412         atomic_inc(&fs_info->balance_running);
3413         mutex_unlock(&fs_info->balance_mutex);
3414
3415         ret = __btrfs_balance(fs_info);
3416
3417         mutex_lock(&fs_info->balance_mutex);
3418         atomic_dec(&fs_info->balance_running);
3419
3420         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3421                 fs_info->num_tolerated_disk_barrier_failures =
3422                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3423         }
3424
3425         if (bargs) {
3426                 memset(bargs, 0, sizeof(*bargs));
3427                 update_ioctl_balance_args(fs_info, 0, bargs);
3428         }
3429
3430         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3431             balance_need_close(fs_info)) {
3432                 __cancel_balance(fs_info);
3433         }
3434
3435         wake_up(&fs_info->balance_wait_q);
3436
3437         return ret;
3438 out:
3439         if (bctl->flags & BTRFS_BALANCE_RESUME)
3440                 __cancel_balance(fs_info);
3441         else {
3442                 kfree(bctl);
3443                 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3444         }
3445         return ret;
3446 }
3447
3448 static int balance_kthread(void *data)
3449 {
3450         struct btrfs_fs_info *fs_info = data;
3451         int ret = 0;
3452
3453         mutex_lock(&fs_info->volume_mutex);
3454         mutex_lock(&fs_info->balance_mutex);
3455
3456         if (fs_info->balance_ctl) {
3457                 btrfs_info(fs_info, "continuing balance");
3458                 ret = btrfs_balance(fs_info->balance_ctl, NULL);
3459         }
3460
3461         mutex_unlock(&fs_info->balance_mutex);
3462         mutex_unlock(&fs_info->volume_mutex);
3463
3464         return ret;
3465 }
3466
3467 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3468 {
3469         struct task_struct *tsk;
3470
3471         spin_lock(&fs_info->balance_lock);
3472         if (!fs_info->balance_ctl) {
3473                 spin_unlock(&fs_info->balance_lock);
3474                 return 0;
3475         }
3476         spin_unlock(&fs_info->balance_lock);
3477
3478         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
3479                 btrfs_info(fs_info, "force skipping balance");
3480                 return 0;
3481         }
3482
3483         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3484         return PTR_ERR_OR_ZERO(tsk);
3485 }
3486
3487 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3488 {
3489         struct btrfs_balance_control *bctl;
3490         struct btrfs_balance_item *item;
3491         struct btrfs_disk_balance_args disk_bargs;
3492         struct btrfs_path *path;
3493         struct extent_buffer *leaf;
3494         struct btrfs_key key;
3495         int ret;
3496
3497         path = btrfs_alloc_path();
3498         if (!path)
3499                 return -ENOMEM;
3500
3501         key.objectid = BTRFS_BALANCE_OBJECTID;
3502         key.type = BTRFS_BALANCE_ITEM_KEY;
3503         key.offset = 0;
3504
3505         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
3506         if (ret < 0)
3507                 goto out;
3508         if (ret > 0) { /* ret = -ENOENT; */
3509                 ret = 0;
3510                 goto out;
3511         }
3512
3513         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3514         if (!bctl) {
3515                 ret = -ENOMEM;
3516                 goto out;
3517         }
3518
3519         leaf = path->nodes[0];
3520         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3521
3522         bctl->fs_info = fs_info;
3523         bctl->flags = btrfs_balance_flags(leaf, item);
3524         bctl->flags |= BTRFS_BALANCE_RESUME;
3525
3526         btrfs_balance_data(leaf, item, &disk_bargs);
3527         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
3528         btrfs_balance_meta(leaf, item, &disk_bargs);
3529         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
3530         btrfs_balance_sys(leaf, item, &disk_bargs);
3531         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
3532
3533         WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3534
3535         mutex_lock(&fs_info->volume_mutex);
3536         mutex_lock(&fs_info->balance_mutex);
3537
3538         set_balance_control(bctl);
3539
3540         mutex_unlock(&fs_info->balance_mutex);
3541         mutex_unlock(&fs_info->volume_mutex);
3542 out:
3543         btrfs_free_path(path);
3544         return ret;
3545 }
3546
3547 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
3548 {
3549         int ret = 0;
3550
3551         mutex_lock(&fs_info->balance_mutex);
3552         if (!fs_info->balance_ctl) {
3553                 mutex_unlock(&fs_info->balance_mutex);
3554                 return -ENOTCONN;
3555         }
3556
3557         if (atomic_read(&fs_info->balance_running)) {
3558                 atomic_inc(&fs_info->balance_pause_req);
3559                 mutex_unlock(&fs_info->balance_mutex);
3560
3561                 wait_event(fs_info->balance_wait_q,
3562                            atomic_read(&fs_info->balance_running) == 0);
3563
3564                 mutex_lock(&fs_info->balance_mutex);
3565                 /* we are good with balance_ctl ripped off from under us */
3566                 BUG_ON(atomic_read(&fs_info->balance_running));
3567                 atomic_dec(&fs_info->balance_pause_req);
3568         } else {
3569                 ret = -ENOTCONN;
3570         }
3571
3572         mutex_unlock(&fs_info->balance_mutex);
3573         return ret;
3574 }
3575
3576 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
3577 {
3578         if (fs_info->sb->s_flags & MS_RDONLY)
3579                 return -EROFS;
3580
3581         mutex_lock(&fs_info->balance_mutex);
3582         if (!fs_info->balance_ctl) {
3583                 mutex_unlock(&fs_info->balance_mutex);
3584                 return -ENOTCONN;
3585         }
3586
3587         atomic_inc(&fs_info->balance_cancel_req);
3588         /*
3589          * if we are running just wait and return, balance item is
3590          * deleted in btrfs_balance in this case
3591          */
3592         if (atomic_read(&fs_info->balance_running)) {
3593                 mutex_unlock(&fs_info->balance_mutex);
3594                 wait_event(fs_info->balance_wait_q,
3595                            atomic_read(&fs_info->balance_running) == 0);
3596                 mutex_lock(&fs_info->balance_mutex);
3597         } else {
3598                 /* __cancel_balance needs volume_mutex */
3599                 mutex_unlock(&fs_info->balance_mutex);
3600                 mutex_lock(&fs_info->volume_mutex);
3601                 mutex_lock(&fs_info->balance_mutex);
3602
3603                 if (fs_info->balance_ctl)
3604                         __cancel_balance(fs_info);
3605
3606                 mutex_unlock(&fs_info->volume_mutex);
3607         }
3608
3609         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
3610         atomic_dec(&fs_info->balance_cancel_req);
3611         mutex_unlock(&fs_info->balance_mutex);
3612         return 0;
3613 }
3614
3615 static int btrfs_uuid_scan_kthread(void *data)
3616 {
3617         struct btrfs_fs_info *fs_info = data;
3618         struct btrfs_root *root = fs_info->tree_root;
3619         struct btrfs_key key;
3620         struct btrfs_key max_key;
3621         struct btrfs_path *path = NULL;
3622         int ret = 0;
3623         struct extent_buffer *eb;
3624         int slot;
3625         struct btrfs_root_item root_item;
3626         u32 item_size;
3627         struct btrfs_trans_handle *trans = NULL;
3628
3629         path = btrfs_alloc_path();
3630         if (!path) {
3631                 ret = -ENOMEM;
3632                 goto out;
3633         }
3634
3635         key.objectid = 0;
3636         key.type = BTRFS_ROOT_ITEM_KEY;
3637         key.offset = 0;
3638
3639         max_key.objectid = (u64)-1;
3640         max_key.type = BTRFS_ROOT_ITEM_KEY;
3641         max_key.offset = (u64)-1;
3642
3643         path->keep_locks = 1;
3644
3645         while (1) {
3646                 ret = btrfs_search_forward(root, &key, path, 0);
3647                 if (ret) {
3648                         if (ret > 0)
3649                                 ret = 0;
3650                         break;
3651                 }
3652
3653                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
3654                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
3655                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
3656                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
3657                         goto skip;
3658
3659                 eb = path->nodes[0];
3660                 slot = path->slots[0];
3661                 item_size = btrfs_item_size_nr(eb, slot);
3662                 if (item_size < sizeof(root_item))
3663                         goto skip;
3664
3665                 read_extent_buffer(eb, &root_item,
3666                                    btrfs_item_ptr_offset(eb, slot),
3667                                    (int)sizeof(root_item));
3668                 if (btrfs_root_refs(&root_item) == 0)
3669                         goto skip;
3670
3671                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
3672                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
3673                         if (trans)
3674                                 goto update_tree;
3675
3676                         btrfs_release_path(path);
3677                         /*
3678                          * 1 - subvol uuid item
3679                          * 1 - received_subvol uuid item
3680                          */
3681                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
3682                         if (IS_ERR(trans)) {
3683                                 ret = PTR_ERR(trans);
3684                                 break;
3685                         }
3686                         continue;
3687                 } else {
3688                         goto skip;
3689                 }
3690 update_tree:
3691                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
3692                         ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
3693                                                   root_item.uuid,
3694                                                   BTRFS_UUID_KEY_SUBVOL,
3695                                                   key.objectid);
3696                         if (ret < 0) {
3697                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
3698                                         ret);
3699                                 break;
3700                         }
3701                 }
3702
3703                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
3704                         ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
3705                                                   root_item.received_uuid,
3706                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
3707                                                   key.objectid);
3708                         if (ret < 0) {
3709                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
3710                                         ret);
3711                                 break;
3712                         }
3713                 }
3714
3715 skip:
3716                 if (trans) {
3717                         ret = btrfs_end_transaction(trans, fs_info->uuid_root);
3718                         trans = NULL;
3719                         if (ret)
3720                                 break;
3721                 }
3722
3723                 btrfs_release_path(path);
3724                 if (key.offset < (u64)-1) {
3725                         key.offset++;
3726                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
3727                         key.offset = 0;
3728                         key.type = BTRFS_ROOT_ITEM_KEY;
3729                 } else if (key.objectid < (u64)-1) {
3730                         key.offset = 0;
3731                         key.type = BTRFS_ROOT_ITEM_KEY;
3732                         key.objectid++;
3733                 } else {
3734                         break;
3735                 }
3736                 cond_resched();
3737         }
3738
3739 out:
3740         btrfs_free_path(path);
3741         if (trans && !IS_ERR(trans))
3742                 btrfs_end_transaction(trans, fs_info->uuid_root);
3743         if (ret)
3744                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
3745         else
3746                 fs_info->update_uuid_tree_gen = 1;
3747         up(&fs_info->uuid_tree_rescan_sem);
3748         return 0;
3749 }
3750
3751 /*
3752  * Callback for btrfs_uuid_tree_iterate().
3753  * returns:
3754  * 0    check succeeded, the entry is not outdated.
3755  * < 0  if an error occured.
3756  * > 0  if the check failed, which means the caller shall remove the entry.
3757  */
3758 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
3759                                        u8 *uuid, u8 type, u64 subid)
3760 {
3761         struct btrfs_key key;
3762         int ret = 0;
3763         struct btrfs_root *subvol_root;
3764
3765         if (type != BTRFS_UUID_KEY_SUBVOL &&
3766             type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
3767                 goto out;
3768
3769         key.objectid = subid;
3770         key.type = BTRFS_ROOT_ITEM_KEY;
3771         key.offset = (u64)-1;
3772         subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
3773         if (IS_ERR(subvol_root)) {
3774                 ret = PTR_ERR(subvol_root);
3775                 if (ret == -ENOENT)
3776                         ret = 1;
3777                 goto out;
3778         }
3779
3780         switch (type) {
3781         case BTRFS_UUID_KEY_SUBVOL:
3782                 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
3783                         ret = 1;
3784                 break;
3785         case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
3786                 if (memcmp(uuid, subvol_root->root_item.received_uuid,
3787                            BTRFS_UUID_SIZE))
3788                         ret = 1;
3789                 break;
3790         }
3791
3792 out:
3793         return ret;
3794 }
3795
3796 static int btrfs_uuid_rescan_kthread(void *data)
3797 {
3798         struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
3799         int ret;
3800
3801         /*
3802          * 1st step is to iterate through the existing UUID tree and
3803          * to delete all entries that contain outdated data.
3804          * 2nd step is to add all missing entries to the UUID tree.
3805          */
3806         ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
3807         if (ret < 0) {
3808                 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
3809                 up(&fs_info->uuid_tree_rescan_sem);
3810                 return ret;
3811         }
3812         return btrfs_uuid_scan_kthread(data);
3813 }
3814
3815 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
3816 {
3817         struct btrfs_trans_handle *trans;
3818         struct btrfs_root *tree_root = fs_info->tree_root;
3819         struct btrfs_root *uuid_root;
3820         struct task_struct *task;
3821         int ret;
3822
3823         /*
3824          * 1 - root node
3825          * 1 - root item
3826          */
3827         trans = btrfs_start_transaction(tree_root, 2);
3828         if (IS_ERR(trans))
3829                 return PTR_ERR(trans);
3830
3831         uuid_root = btrfs_create_tree(trans, fs_info,
3832                                       BTRFS_UUID_TREE_OBJECTID);
3833         if (IS_ERR(uuid_root)) {
3834                 btrfs_abort_transaction(trans, tree_root,
3835                                         PTR_ERR(uuid_root));
3836                 return PTR_ERR(uuid_root);
3837         }
3838
3839         fs_info->uuid_root = uuid_root;
3840
3841         ret = btrfs_commit_transaction(trans, tree_root);
3842         if (ret)
3843                 return ret;
3844
3845         down(&fs_info->uuid_tree_rescan_sem);
3846         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
3847         if (IS_ERR(task)) {
3848                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
3849                 btrfs_warn(fs_info, "failed to start uuid_scan task");
3850                 up(&fs_info->uuid_tree_rescan_sem);
3851                 return PTR_ERR(task);
3852         }
3853
3854         return 0;
3855 }
3856
3857 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3858 {
3859         struct task_struct *task;
3860
3861         down(&fs_info->uuid_tree_rescan_sem);
3862         task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3863         if (IS_ERR(task)) {
3864                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
3865                 btrfs_warn(fs_info, "failed to start uuid_rescan task");
3866                 up(&fs_info->uuid_tree_rescan_sem);
3867                 return PTR_ERR(task);
3868         }
3869
3870         return 0;
3871 }
3872
3873 /*
3874  * shrinking a device means finding all of the device extents past
3875  * the new size, and then following the back refs to the chunks.
3876  * The chunk relocation code actually frees the device extent
3877  */
3878 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3879 {
3880         struct btrfs_trans_handle *trans;
3881         struct btrfs_root *root = device->dev_root;
3882         struct btrfs_dev_extent *dev_extent = NULL;
3883         struct btrfs_path *path;
3884         u64 length;
3885         u64 chunk_tree;
3886         u64 chunk_objectid;
3887         u64 chunk_offset;
3888         int ret;
3889         int slot;
3890         int failed = 0;
3891         bool retried = false;
3892         struct extent_buffer *l;
3893         struct btrfs_key key;
3894         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3895         u64 old_total = btrfs_super_total_bytes(super_copy);
3896         u64 old_size = device->total_bytes;
3897         u64 diff = device->total_bytes - new_size;
3898
3899         if (device->is_tgtdev_for_dev_replace)
3900                 return -EINVAL;
3901
3902         path = btrfs_alloc_path();
3903         if (!path)
3904                 return -ENOMEM;
3905
3906         path->reada = 2;
3907
3908         lock_chunks(root);
3909
3910         device->total_bytes = new_size;
3911         if (device->writeable) {
3912                 device->fs_devices->total_rw_bytes -= diff;
3913                 spin_lock(&root->fs_info->free_chunk_lock);
3914                 root->fs_info->free_chunk_space -= diff;
3915                 spin_unlock(&root->fs_info->free_chunk_lock);
3916         }
3917         unlock_chunks(root);
3918
3919 again:
3920         key.objectid = device->devid;
3921         key.offset = (u64)-1;
3922         key.type = BTRFS_DEV_EXTENT_KEY;
3923
3924         do {
3925                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3926                 if (ret < 0)
3927                         goto done;
3928
3929                 ret = btrfs_previous_item(root, path, 0, key.type);
3930                 if (ret < 0)
3931                         goto done;
3932                 if (ret) {
3933                         ret = 0;
3934                         btrfs_release_path(path);
3935                         break;
3936                 }
3937
3938                 l = path->nodes[0];
3939                 slot = path->slots[0];
3940                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
3941
3942                 if (key.objectid != device->devid) {
3943                         btrfs_release_path(path);
3944                         break;
3945                 }
3946
3947                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3948                 length = btrfs_dev_extent_length(l, dev_extent);
3949
3950                 if (key.offset + length <= new_size) {
3951                         btrfs_release_path(path);
3952                         break;
3953                 }
3954
3955                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3956                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3957                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3958                 btrfs_release_path(path);
3959
3960                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
3961                                            chunk_offset);
3962                 if (ret && ret != -ENOSPC)
3963                         goto done;
3964                 if (ret == -ENOSPC)
3965                         failed++;
3966         } while (key.offset-- > 0);
3967
3968         if (failed && !retried) {
3969                 failed = 0;
3970                 retried = true;
3971                 goto again;
3972         } else if (failed && retried) {
3973                 ret = -ENOSPC;
3974                 lock_chunks(root);
3975
3976                 device->total_bytes = old_size;
3977                 if (device->writeable)
3978                         device->fs_devices->total_rw_bytes += diff;
3979                 spin_lock(&root->fs_info->free_chunk_lock);
3980                 root->fs_info->free_chunk_space += diff;
3981                 spin_unlock(&root->fs_info->free_chunk_lock);
3982                 unlock_chunks(root);
3983                 goto done;
3984         }
3985
3986         /* Shrinking succeeded, else we would be at "done". */
3987         trans = btrfs_start_transaction(root, 0);
3988         if (IS_ERR(trans)) {
3989                 ret = PTR_ERR(trans);
3990                 goto done;
3991         }
3992
3993         lock_chunks(root);
3994
3995         device->disk_total_bytes = new_size;
3996         /* Now btrfs_update_device() will change the on-disk size. */
3997         ret = btrfs_update_device(trans, device);
3998         if (ret) {
3999                 unlock_chunks(root);
4000                 btrfs_end_transaction(trans, root);
4001                 goto done;
4002         }
4003         WARN_ON(diff > old_total);
4004         btrfs_set_super_total_bytes(super_copy, old_total - diff);
4005         unlock_chunks(root);
4006         btrfs_end_transaction(trans, root);
4007 done:
4008         btrfs_free_path(path);
4009         return ret;
4010 }
4011
4012 static int btrfs_add_system_chunk(struct btrfs_root *root,
4013                            struct btrfs_key *key,
4014                            struct btrfs_chunk *chunk, int item_size)
4015 {
4016         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4017         struct btrfs_disk_key disk_key;
4018         u32 array_size;
4019         u8 *ptr;
4020
4021         array_size = btrfs_super_sys_array_size(super_copy);
4022         if (array_size + item_size + sizeof(disk_key)
4023                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
4024                 return -EFBIG;
4025
4026         ptr = super_copy->sys_chunk_array + array_size;
4027         btrfs_cpu_key_to_disk(&disk_key, key);
4028         memcpy(ptr, &disk_key, sizeof(disk_key));
4029         ptr += sizeof(disk_key);
4030         memcpy(ptr, chunk, item_size);
4031         item_size += sizeof(disk_key);
4032         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4033         return 0;
4034 }
4035
4036 /*
4037  * sort the devices in descending order by max_avail, total_avail
4038  */
4039 static int btrfs_cmp_device_info(const void *a, const void *b)
4040 {
4041         const struct btrfs_device_info *di_a = a;
4042         const struct btrfs_device_info *di_b = b;
4043
4044         if (di_a->max_avail > di_b->max_avail)
4045                 return -1;
4046         if (di_a->max_avail < di_b->max_avail)
4047                 return 1;
4048         if (di_a->total_avail > di_b->total_avail)
4049                 return -1;
4050         if (di_a->total_avail < di_b->total_avail)
4051                 return 1;
4052         return 0;
4053 }
4054
4055 static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
4056         [BTRFS_RAID_RAID10] = {
4057                 .sub_stripes    = 2,
4058                 .dev_stripes    = 1,
4059                 .devs_max       = 0,    /* 0 == as many as possible */
4060                 .devs_min       = 4,
4061                 .devs_increment = 2,
4062                 .ncopies        = 2,
4063         },
4064         [BTRFS_RAID_RAID1] = {
4065                 .sub_stripes    = 1,
4066                 .dev_stripes    = 1,
4067                 .devs_max       = 2,
4068                 .devs_min       = 2,
4069                 .devs_increment = 2,
4070                 .ncopies        = 2,
4071         },
4072         [BTRFS_RAID_DUP] = {
4073                 .sub_stripes    = 1,
4074                 .dev_stripes    = 2,
4075                 .devs_max       = 1,
4076                 .devs_min       = 1,
4077                 .devs_increment = 1,
4078                 .ncopies        = 2,
4079         },
4080         [BTRFS_RAID_RAID0] = {
4081                 .sub_stripes    = 1,
4082                 .dev_stripes    = 1,
4083                 .devs_max       = 0,
4084                 .devs_min       = 2,
4085                 .devs_increment = 1,
4086                 .ncopies        = 1,
4087         },
4088         [BTRFS_RAID_SINGLE] = {
4089                 .sub_stripes    = 1,
4090                 .dev_stripes    = 1,
4091                 .devs_max       = 1,
4092                 .devs_min       = 1,
4093                 .devs_increment = 1,
4094                 .ncopies        = 1,
4095         },
4096         [BTRFS_RAID_RAID5] = {
4097                 .sub_stripes    = 1,
4098                 .dev_stripes    = 1,
4099                 .devs_max       = 0,
4100                 .devs_min       = 2,
4101                 .devs_increment = 1,
4102                 .ncopies        = 2,
4103         },
4104         [BTRFS_RAID_RAID6] = {
4105                 .sub_stripes    = 1,
4106                 .dev_stripes    = 1,
4107                 .devs_max       = 0,
4108                 .devs_min       = 3,
4109                 .devs_increment = 1,
4110                 .ncopies        = 3,
4111         },
4112 };
4113
4114 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
4115 {
4116         /* TODO allow them to set a preferred stripe size */
4117         return 64 * 1024;
4118 }
4119
4120 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4121 {
4122         if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
4123                 return;
4124
4125         btrfs_set_fs_incompat(info, RAID56);
4126 }
4127
4128 #define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r)             \
4129                         - sizeof(struct btrfs_item)             \
4130                         - sizeof(struct btrfs_chunk))           \
4131                         / sizeof(struct btrfs_stripe) + 1)
4132
4133 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE        \
4134                                 - 2 * sizeof(struct btrfs_disk_key)     \
4135                                 - 2 * sizeof(struct btrfs_chunk))       \
4136                                 / sizeof(struct btrfs_stripe) + 1)
4137
4138 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4139                                struct btrfs_root *extent_root, u64 start,
4140                                u64 type)
4141 {
4142         struct btrfs_fs_info *info = extent_root->fs_info;
4143         struct btrfs_fs_devices *fs_devices = info->fs_devices;
4144         struct list_head *cur;
4145         struct map_lookup *map = NULL;
4146         struct extent_map_tree *em_tree;
4147         struct extent_map *em;
4148         struct btrfs_device_info *devices_info = NULL;
4149         u64 total_avail;
4150         int num_stripes;        /* total number of stripes to allocate */
4151         int data_stripes;       /* number of stripes that count for
4152                                    block group size */
4153         int sub_stripes;        /* sub_stripes info for map */
4154         int dev_stripes;        /* stripes per dev */
4155         int devs_max;           /* max devs to use */
4156         int devs_min;           /* min devs needed */
4157         int devs_increment;     /* ndevs has to be a multiple of this */
4158         int ncopies;            /* how many copies to data has */
4159         int ret;
4160         u64 max_stripe_size;
4161         u64 max_chunk_size;
4162         u64 stripe_size;
4163         u64 num_bytes;
4164         u64 raid_stripe_len = BTRFS_STRIPE_LEN;
4165         int ndevs;
4166         int i;
4167         int j;
4168         int index;
4169
4170         BUG_ON(!alloc_profile_is_valid(type, 0));
4171
4172         if (list_empty(&fs_devices->alloc_list))
4173                 return -ENOSPC;
4174
4175         index = __get_raid_index(type);
4176
4177         sub_stripes = btrfs_raid_array[index].sub_stripes;
4178         dev_stripes = btrfs_raid_array[index].dev_stripes;
4179         devs_max = btrfs_raid_array[index].devs_max;
4180         devs_min = btrfs_raid_array[index].devs_min;
4181         devs_increment = btrfs_raid_array[index].devs_increment;
4182         ncopies = btrfs_raid_array[index].ncopies;
4183
4184         if (type & BTRFS_BLOCK_GROUP_DATA) {
4185                 max_stripe_size = 1024 * 1024 * 1024;
4186                 max_chunk_size = 10 * max_stripe_size;
4187                 if (!devs_max)
4188                         devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4189         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4190                 /* for larger filesystems, use larger metadata chunks */
4191                 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
4192                         max_stripe_size = 1024 * 1024 * 1024;
4193                 else
4194                         max_stripe_size = 256 * 1024 * 1024;
4195                 max_chunk_size = max_stripe_size;
4196                 if (!devs_max)
4197                         devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4198         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4199                 max_stripe_size = 32 * 1024 * 1024;
4200                 max_chunk_size = 2 * max_stripe_size;
4201                 if (!devs_max)
4202                         devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4203         } else {
4204                 btrfs_err(info, "invalid chunk type 0x%llx requested",
4205                        type);
4206                 BUG_ON(1);
4207         }
4208
4209         /* we don't want a chunk larger than 10% of writeable space */
4210         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4211                              max_chunk_size);
4212
4213         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
4214                                GFP_NOFS);
4215         if (!devices_info)
4216                 return -ENOMEM;
4217
4218         cur = fs_devices->alloc_list.next;
4219
4220         /*
4221          * in the first pass through the devices list, we gather information
4222          * about the available holes on each device.
4223          */
4224         ndevs = 0;
4225         while (cur != &fs_devices->alloc_list) {
4226                 struct btrfs_device *device;
4227                 u64 max_avail;
4228                 u64 dev_offset;
4229
4230                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
4231
4232                 cur = cur->next;
4233
4234                 if (!device->writeable) {
4235                         WARN(1, KERN_ERR
4236                                "BTRFS: read-only device in alloc_list\n");
4237                         continue;
4238                 }
4239
4240                 if (!device->in_fs_metadata ||
4241                     device->is_tgtdev_for_dev_replace)
4242                         continue;
4243
4244                 if (device->total_bytes > device->bytes_used)
4245                         total_avail = device->total_bytes - device->bytes_used;
4246                 else
4247                         total_avail = 0;
4248
4249                 /* If there is no space on this device, skip it. */
4250                 if (total_avail == 0)
4251                         continue;
4252
4253                 ret = find_free_dev_extent(trans, device,
4254                                            max_stripe_size * dev_stripes,
4255                                            &dev_offset, &max_avail);
4256                 if (ret && ret != -ENOSPC)
4257                         goto error;
4258
4259                 if (ret == 0)
4260                         max_avail = max_stripe_size * dev_stripes;
4261
4262                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
4263                         continue;
4264
4265                 if (ndevs == fs_devices->rw_devices) {
4266                         WARN(1, "%s: found more than %llu devices\n",
4267                              __func__, fs_devices->rw_devices);
4268                         break;
4269                 }
4270                 devices_info[ndevs].dev_offset = dev_offset;
4271                 devices_info[ndevs].max_avail = max_avail;
4272                 devices_info[ndevs].total_avail = total_avail;
4273                 devices_info[ndevs].dev = device;
4274                 ++ndevs;
4275         }
4276
4277         /*
4278          * now sort the devices by hole size / available space
4279          */
4280         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
4281              btrfs_cmp_device_info, NULL);
4282
4283         /* round down to number of usable stripes */
4284         ndevs -= ndevs % devs_increment;
4285
4286         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
4287                 ret = -ENOSPC;
4288                 goto error;
4289         }
4290
4291         if (devs_max && ndevs > devs_max)
4292                 ndevs = devs_max;
4293         /*
4294          * the primary goal is to maximize the number of stripes, so use as many
4295          * devices as possible, even if the stripes are not maximum sized.
4296          */
4297         stripe_size = devices_info[ndevs-1].max_avail;
4298         num_stripes = ndevs * dev_stripes;
4299
4300         /*
4301          * this will have to be fixed for RAID1 and RAID10 over
4302          * more drives
4303          */
4304         data_stripes = num_stripes / ncopies;
4305
4306         if (type & BTRFS_BLOCK_GROUP_RAID5) {
4307                 raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
4308                                  btrfs_super_stripesize(info->super_copy));
4309                 data_stripes = num_stripes - 1;
4310         }
4311         if (type & BTRFS_BLOCK_GROUP_RAID6) {
4312                 raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
4313                                  btrfs_super_stripesize(info->super_copy));
4314                 data_stripes = num_stripes - 2;
4315         }
4316
4317         /*
4318          * Use the number of data stripes to figure out how big this chunk
4319          * is really going to be in terms of logical address space,
4320          * and compare that answer with the max chunk size
4321          */
4322         if (stripe_size * data_stripes > max_chunk_size) {
4323                 u64 mask = (1ULL << 24) - 1;
4324                 stripe_size = max_chunk_size;
4325                 do_div(stripe_size, data_stripes);
4326
4327                 /* bump the answer up to a 16MB boundary */
4328                 stripe_size = (stripe_size + mask) & ~mask;
4329
4330                 /* but don't go higher than the limits we found
4331                  * while searching for free extents
4332                  */
4333                 if (stripe_size > devices_info[ndevs-1].max_avail)
4334                         stripe_size = devices_info[ndevs-1].max_avail;
4335         }
4336
4337         do_div(stripe_size, dev_stripes);
4338
4339         /* align to BTRFS_STRIPE_LEN */
4340         do_div(stripe_size, raid_stripe_len);
4341         stripe_size *= raid_stripe_len;
4342
4343         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4344         if (!map) {
4345                 ret = -ENOMEM;
4346                 goto error;
4347         }
4348         map->num_stripes = num_stripes;
4349
4350         for (i = 0; i < ndevs; ++i) {
4351                 for (j = 0; j < dev_stripes; ++j) {
4352                         int s = i * dev_stripes + j;
4353                         map->stripes[s].dev = devices_info[i].dev;
4354                         map->stripes[s].physical = devices_info[i].dev_offset +
4355                                                    j * stripe_size;
4356                 }
4357         }
4358         map->sector_size = extent_root->sectorsize;
4359         map->stripe_len = raid_stripe_len;
4360         map->io_align = raid_stripe_len;
4361         map->io_width = raid_stripe_len;
4362         map->type = type;
4363         map->sub_stripes = sub_stripes;
4364
4365         num_bytes = stripe_size * data_stripes;
4366
4367         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
4368
4369         em = alloc_extent_map();
4370         if (!em) {
4371                 kfree(map);
4372                 ret = -ENOMEM;
4373                 goto error;
4374         }
4375         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
4376         em->bdev = (struct block_device *)map;
4377         em->start = start;
4378         em->len = num_bytes;
4379         em->block_start = 0;
4380         em->block_len = em->len;
4381         em->orig_block_len = stripe_size;
4382
4383         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
4384         write_lock(&em_tree->lock);
4385         ret = add_extent_mapping(em_tree, em, 0);
4386         if (!ret) {
4387                 list_add_tail(&em->list, &trans->transaction->pending_chunks);
4388                 atomic_inc(&em->refs);
4389         }
4390         write_unlock(&em_tree->lock);
4391         if (ret) {
4392                 free_extent_map(em);
4393                 goto error;
4394         }
4395
4396         ret = btrfs_make_block_group(trans, extent_root, 0, type,
4397                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4398                                      start, num_bytes);
4399         if (ret)
4400                 goto error_del_extent;
4401
4402         free_extent_map(em);
4403         check_raid56_incompat_flag(extent_root->fs_info, type);
4404
4405         kfree(devices_info);
4406         return 0;
4407
4408 error_del_extent:
4409         write_lock(&em_tree->lock);
4410         remove_extent_mapping(em_tree, em);
4411         write_unlock(&em_tree->lock);
4412
4413         /* One for our allocation */
4414         free_extent_map(em);
4415         /* One for the tree reference */
4416         free_extent_map(em);
4417 error:
4418         kfree(devices_info);
4419         return ret;
4420 }
4421
4422 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4423                                 struct btrfs_root *extent_root,
4424                                 u64 chunk_offset, u64 chunk_size)
4425 {
4426         struct btrfs_key key;
4427         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4428         struct btrfs_device *device;
4429         struct btrfs_chunk *chunk;
4430         struct btrfs_stripe *stripe;
4431         struct extent_map_tree *em_tree;
4432         struct extent_map *em;
4433         struct map_lookup *map;
4434         size_t item_size;
4435         u64 dev_offset;
4436         u64 stripe_size;
4437         int i = 0;
4438         int ret;
4439
4440         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
4441         read_lock(&em_tree->lock);
4442         em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
4443         read_unlock(&em_tree->lock);
4444
4445         if (!em) {
4446                 btrfs_crit(extent_root->fs_info, "unable to find logical "
4447                            "%Lu len %Lu", chunk_offset, chunk_size);
4448                 return -EINVAL;
4449         }
4450
4451         if (em->start != chunk_offset || em->len != chunk_size) {
4452                 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
4453                           " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
4454                           chunk_size, em->start, em->len);
4455                 free_extent_map(em);
4456                 return -EINVAL;
4457         }
4458
4459         map = (struct map_lookup *)em->bdev;
4460         item_size = btrfs_chunk_item_size(map->num_stripes);
4461         stripe_size = em->orig_block_len;
4462
4463         chunk = kzalloc(item_size, GFP_NOFS);
4464         if (!chunk) {
4465                 ret = -ENOMEM;
4466                 goto out;
4467         }
4468
4469         for (i = 0; i < map->num_stripes; i++) {
4470                 device = map->stripes[i].dev;
4471                 dev_offset = map->stripes[i].physical;
4472
4473                 device->bytes_used += stripe_size;
4474                 ret = btrfs_update_device(trans, device);
4475                 if (ret)
4476                         goto out;
4477                 ret = btrfs_alloc_dev_extent(trans, device,
4478                                              chunk_root->root_key.objectid,
4479                                              BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4480                                              chunk_offset, dev_offset,
4481                                              stripe_size);
4482                 if (ret)
4483                         goto out;
4484         }
4485
4486         spin_lock(&extent_root->fs_info->free_chunk_lock);
4487         extent_root->fs_info->free_chunk_space -= (stripe_size *
4488                                                    map->num_stripes);
4489         spin_unlock(&extent_root->fs_info->free_chunk_lock);
4490
4491         stripe = &chunk->stripe;
4492         for (i = 0; i < map->num_stripes; i++) {
4493                 device = map->stripes[i].dev;
4494                 dev_offset = map->stripes[i].physical;
4495
4496                 btrfs_set_stack_stripe_devid(stripe, device->devid);
4497                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4498                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4499                 stripe++;
4500         }
4501
4502         btrfs_set_stack_chunk_length(chunk, chunk_size);
4503         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
4504         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
4505         btrfs_set_stack_chunk_type(chunk, map->type);
4506         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
4507         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
4508         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
4509         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
4510         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
4511
4512         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4513         key.type = BTRFS_CHUNK_ITEM_KEY;
4514         key.offset = chunk_offset;
4515
4516         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4517         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4518                 /*
4519                  * TODO: Cleanup of inserted chunk root in case of
4520                  * failure.
4521                  */
4522                 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
4523                                              item_size);
4524         }
4525
4526 out:
4527         kfree(chunk);
4528         free_extent_map(em);
4529         return ret;
4530 }
4531
4532 /*
4533  * Chunk allocation falls into two parts. The first part does works
4534  * that make the new allocated chunk useable, but not do any operation
4535  * that modifies the chunk tree. The second part does the works that
4536  * require modifying the chunk tree. This division is important for the
4537  * bootstrap process of adding storage to a seed btrfs.
4538  */
4539 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4540                       struct btrfs_root *extent_root, u64 type)
4541 {
4542         u64 chunk_offset;
4543
4544         chunk_offset = find_next_chunk(extent_root->fs_info);
4545         return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4546 }
4547
4548 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4549                                          struct btrfs_root *root,
4550                                          struct btrfs_device *device)
4551 {
4552         u64 chunk_offset;
4553         u64 sys_chunk_offset;
4554         u64 alloc_profile;
4555         struct btrfs_fs_info *fs_info = root->fs_info;
4556         struct btrfs_root *extent_root = fs_info->extent_root;
4557         int ret;
4558
4559         chunk_offset = find_next_chunk(fs_info);
4560         alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4561         ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4562                                   alloc_profile);
4563         if (ret)
4564                 return ret;
4565
4566         sys_chunk_offset = find_next_chunk(root->fs_info);
4567         alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4568         ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4569                                   alloc_profile);
4570         if (ret) {
4571                 btrfs_abort_transaction(trans, root, ret);
4572                 goto out;
4573         }
4574
4575         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
4576         if (ret)
4577                 btrfs_abort_transaction(trans, root, ret);
4578 out:
4579         return ret;
4580 }
4581
4582 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4583 {
4584         struct extent_map *em;
4585         struct map_lookup *map;
4586         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4587         int readonly = 0;
4588         int i;
4589
4590         read_lock(&map_tree->map_tree.lock);
4591         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
4592         read_unlock(&map_tree->map_tree.lock);
4593         if (!em)
4594                 return 1;
4595
4596         if (btrfs_test_opt(root, DEGRADED)) {
4597                 free_extent_map(em);
4598                 return 0;
4599         }
4600
4601         map = (struct map_lookup *)em->bdev;
4602         for (i = 0; i < map->num_stripes; i++) {
4603                 if (!map->stripes[i].dev->writeable) {
4604                         readonly = 1;
4605                         break;
4606                 }
4607         }
4608         free_extent_map(em);
4609         return readonly;
4610 }
4611
4612 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
4613 {
4614         extent_map_tree_init(&tree->map_tree);
4615 }
4616
4617 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
4618 {
4619         struct extent_map *em;
4620
4621         while (1) {
4622                 write_lock(&tree->map_tree.lock);
4623                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
4624                 if (em)
4625                         remove_extent_mapping(&tree->map_tree, em);
4626                 write_unlock(&tree->map_tree.lock);
4627                 if (!em)
4628                         break;
4629                 /* once for us */
4630                 free_extent_map(em);
4631                 /* once for the tree */
4632                 free_extent_map(em);
4633         }
4634 }
4635
4636 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4637 {
4638         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4639         struct extent_map *em;
4640         struct map_lookup *map;
4641         struct extent_map_tree *em_tree = &map_tree->map_tree;
4642         int ret;
4643
4644         read_lock(&em_tree->lock);
4645         em = lookup_extent_mapping(em_tree, logical, len);
4646         read_unlock(&em_tree->lock);
4647
4648         /*
4649          * We could return errors for these cases, but that could get ugly and
4650          * we'd probably do the same thing which is just not do anything else
4651          * and exit, so return 1 so the callers don't try to use other copies.
4652          */
4653         if (!em) {
4654                 btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
4655                             logical+len);
4656                 return 1;
4657         }
4658
4659         if (em->start > logical || em->start + em->len < logical) {
4660                 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
4661                             "%Lu-%Lu", logical, logical+len, em->start,
4662                             em->start + em->len);
4663                 free_extent_map(em);
4664                 return 1;
4665         }
4666
4667         map = (struct map_lookup *)em->bdev;
4668         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
4669                 ret = map->num_stripes;
4670         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4671                 ret = map->sub_stripes;
4672         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4673                 ret = 2;
4674         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4675                 ret = 3;
4676         else
4677                 ret = 1;
4678         free_extent_map(em);
4679
4680         btrfs_dev_replace_lock(&fs_info->dev_replace);
4681         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4682                 ret++;
4683         btrfs_dev_replace_unlock(&fs_info->dev_replace);
4684
4685         return ret;
4686 }
4687
4688 unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4689                                     struct btrfs_mapping_tree *map_tree,
4690                                     u64 logical)
4691 {
4692         struct extent_map *em;
4693         struct map_lookup *map;
4694         struct extent_map_tree *em_tree = &map_tree->map_tree;
4695         unsigned long len = root->sectorsize;
4696
4697         read_lock(&em_tree->lock);
4698         em = lookup_extent_mapping(em_tree, logical, len);
4699         read_unlock(&em_tree->lock);
4700         BUG_ON(!em);
4701
4702         BUG_ON(em->start > logical || em->start + em->len < logical);
4703         map = (struct map_lookup *)em->bdev;
4704         if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4705                          BTRFS_BLOCK_GROUP_RAID6)) {
4706                 len = map->stripe_len * nr_data_stripes(map);
4707         }
4708         free_extent_map(em);
4709         return len;
4710 }
4711
4712 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4713                            u64 logical, u64 len, int mirror_num)
4714 {
4715         struct extent_map *em;
4716         struct map_lookup *map;
4717         struct extent_map_tree *em_tree = &map_tree->map_tree;
4718         int ret = 0;
4719
4720         read_lock(&em_tree->lock);
4721         em = lookup_extent_mapping(em_tree, logical, len);
4722         read_unlock(&em_tree->lock);
4723         BUG_ON(!em);
4724
4725         BUG_ON(em->start > logical || em->start + em->len < logical);
4726         map = (struct map_lookup *)em->bdev;
4727         if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4728                          BTRFS_BLOCK_GROUP_RAID6))
4729                 ret = 1;
4730         free_extent_map(em);
4731         return ret;
4732 }
4733
4734 static int find_live_mirror(struct btrfs_fs_info *fs_info,
4735                             struct map_lookup *map, int first, int num,
4736                             int optimal, int dev_replace_is_ongoing)
4737 {
4738         int i;
4739         int tolerance;
4740         struct btrfs_device *srcdev;
4741
4742         if (dev_replace_is_ongoing &&
4743             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4744              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4745                 srcdev = fs_info->dev_replace.srcdev;
4746         else
4747                 srcdev = NULL;
4748
4749         /*
4750          * try to avoid the drive that is the source drive for a
4751          * dev-replace procedure, only choose it if no other non-missing
4752          * mirror is available
4753          */
4754         for (tolerance = 0; tolerance < 2; tolerance++) {
4755                 if (map->stripes[optimal].dev->bdev &&
4756                     (tolerance || map->stripes[optimal].dev != srcdev))
4757                         return optimal;
4758                 for (i = first; i < first + num; i++) {
4759                         if (map->stripes[i].dev->bdev &&
4760                             (tolerance || map->stripes[i].dev != srcdev))
4761                                 return i;
4762                 }
4763         }
4764
4765         /* we couldn't find one that doesn't fail.  Just return something
4766          * and the io error handling code will clean up eventually
4767          */
4768         return optimal;
4769 }
4770
4771 static inline int parity_smaller(u64 a, u64 b)
4772 {
4773         return a > b;
4774 }
4775
4776 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4777 static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4778 {
4779         struct btrfs_bio_stripe s;
4780         int i;
4781         u64 l;
4782         int again = 1;
4783
4784         while (again) {
4785                 again = 0;
4786                 for (i = 0; i < bbio->num_stripes - 1; i++) {
4787                         if (parity_smaller(raid_map[i], raid_map[i+1])) {
4788                                 s = bbio->stripes[i];
4789                                 l = raid_map[i];
4790                                 bbio->stripes[i] = bbio->stripes[i+1];
4791                                 raid_map[i] = raid_map[i+1];
4792                                 bbio->stripes[i+1] = s;
4793                                 raid_map[i+1] = l;
4794                                 again = 1;
4795                         }
4796                 }
4797         }
4798 }
4799
4800 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4801                              u64 logical, u64 *length,
4802                              struct btrfs_bio **bbio_ret,
4803                              int mirror_num, u64 **raid_map_ret)
4804 {
4805         struct extent_map *em;
4806         struct map_lookup *map;
4807         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4808         struct extent_map_tree *em_tree = &map_tree->map_tree;
4809         u64 offset;
4810         u64 stripe_offset;
4811         u64 stripe_end_offset;
4812         u64 stripe_nr;
4813         u64 stripe_nr_orig;
4814         u64 stripe_nr_end;
4815         u64 stripe_len;
4816         u64 *raid_map = NULL;
4817         int stripe_index;
4818         int i;
4819         int ret = 0;
4820         int num_stripes;
4821         int max_errors = 0;
4822         struct btrfs_bio *bbio = NULL;
4823         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4824         int dev_replace_is_ongoing = 0;
4825         int num_alloc_stripes;
4826         int patch_the_first_stripe_for_dev_replace = 0;
4827         u64 physical_to_patch_in_first_stripe = 0;
4828         u64 raid56_full_stripe_start = (u64)-1;
4829
4830         read_lock(&em_tree->lock);
4831         em = lookup_extent_mapping(em_tree, logical, *length);
4832         read_unlock(&em_tree->lock);
4833
4834         if (!em) {
4835                 btrfs_crit(fs_info, "unable to find logical %llu len %llu",
4836                         logical, *length);
4837                 return -EINVAL;
4838         }
4839
4840         if (em->start > logical || em->start + em->len < logical) {
4841                 btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
4842                            "found %Lu-%Lu", logical, em->start,
4843                            em->start + em->len);
4844                 free_extent_map(em);
4845                 return -EINVAL;
4846         }
4847
4848         map = (struct map_lookup *)em->bdev;
4849         offset = logical - em->start;
4850
4851         stripe_len = map->stripe_len;
4852         stripe_nr = offset;
4853         /*
4854          * stripe_nr counts the total number of stripes we have to stride
4855          * to get to this block
4856          */
4857         do_div(stripe_nr, stripe_len);
4858
4859         stripe_offset = stripe_nr * stripe_len;
4860         BUG_ON(offset < stripe_offset);
4861
4862         /* stripe_offset is the offset of this block in its stripe*/
4863         stripe_offset = offset - stripe_offset;
4864
4865         /* if we're here for raid56, we need to know the stripe aligned start */
4866         if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4867                 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4868                 raid56_full_stripe_start = offset;
4869
4870                 /* allow a write of a full stripe, but make sure we don't
4871                  * allow straddling of stripes
4872                  */
4873                 do_div(raid56_full_stripe_start, full_stripe_len);
4874                 raid56_full_stripe_start *= full_stripe_len;
4875         }
4876
4877         if (rw & REQ_DISCARD) {
4878                 /* we don't discard raid56 yet */
4879                 if (map->type &
4880                     (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4881                         ret = -EOPNOTSUPP;
4882                         goto out;
4883                 }
4884                 *length = min_t(u64, em->len - offset, *length);
4885         } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4886                 u64 max_len;
4887                 /* For writes to RAID[56], allow a full stripeset across all disks.
4888                    For other RAID types and for RAID[56] reads, just allow a single
4889                    stripe (on a single disk). */
4890                 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
4891                     (rw & REQ_WRITE)) {
4892                         max_len = stripe_len * nr_data_stripes(map) -
4893                                 (offset - raid56_full_stripe_start);
4894                 } else {
4895                         /* we limit the length of each bio to what fits in a stripe */
4896                         max_len = stripe_len - stripe_offset;
4897                 }
4898                 *length = min_t(u64, em->len - offset, max_len);
4899         } else {
4900                 *length = em->len - offset;
4901         }
4902
4903         /* This is for when we're called from btrfs_merge_bio_hook() and all
4904            it cares about is the length */
4905         if (!bbio_ret)
4906                 goto out;
4907
4908         btrfs_dev_replace_lock(dev_replace);
4909         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4910         if (!dev_replace_is_ongoing)
4911                 btrfs_dev_replace_unlock(dev_replace);
4912
4913         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4914             !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4915             dev_replace->tgtdev != NULL) {
4916                 /*
4917                  * in dev-replace case, for repair case (that's the only
4918                  * case where the mirror is selected explicitly when
4919                  * calling btrfs_map_block), blocks left of the left cursor
4920                  * can also be read from the target drive.
4921                  * For REQ_GET_READ_MIRRORS, the target drive is added as
4922                  * the last one to the array of stripes. For READ, it also
4923                  * needs to be supported using the same mirror number.
4924                  * If the requested block is not left of the left cursor,
4925                  * EIO is returned. This can happen because btrfs_num_copies()
4926                  * returns one more in the dev-replace case.
4927                  */
4928                 u64 tmp_length = *length;
4929                 struct btrfs_bio *tmp_bbio = NULL;
4930                 int tmp_num_stripes;
4931                 u64 srcdev_devid = dev_replace->srcdev->devid;
4932                 int index_srcdev = 0;
4933                 int found = 0;
4934                 u64 physical_of_found = 0;
4935
4936                 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4937                              logical, &tmp_length, &tmp_bbio, 0, NULL);
4938                 if (ret) {
4939                         WARN_ON(tmp_bbio != NULL);
4940                         goto out;
4941                 }
4942
4943                 tmp_num_stripes = tmp_bbio->num_stripes;
4944                 if (mirror_num > tmp_num_stripes) {
4945                         /*
4946                          * REQ_GET_READ_MIRRORS does not contain this
4947                          * mirror, that means that the requested area
4948                          * is not left of the left cursor
4949                          */
4950                         ret = -EIO;
4951                         kfree(tmp_bbio);
4952                         goto out;
4953                 }
4954
4955                 /*
4956                  * process the rest of the function using the mirror_num
4957                  * of the source drive. Therefore look it up first.
4958                  * At the end, patch the device pointer to the one of the
4959                  * target drive.
4960                  */
4961                 for (i = 0; i < tmp_num_stripes; i++) {
4962                         if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4963                                 /*
4964                                  * In case of DUP, in order to keep it
4965                                  * simple, only add the mirror with the
4966                                  * lowest physical address
4967                                  */
4968                                 if (found &&
4969                                     physical_of_found <=
4970                                      tmp_bbio->stripes[i].physical)
4971                                         continue;
4972                                 index_srcdev = i;
4973                                 found = 1;
4974                                 physical_of_found =
4975                                         tmp_bbio->stripes[i].physical;
4976                         }
4977                 }
4978
4979                 if (found) {
4980                         mirror_num = index_srcdev + 1;
4981                         patch_the_first_stripe_for_dev_replace = 1;
4982                         physical_to_patch_in_first_stripe = physical_of_found;
4983                 } else {
4984                         WARN_ON(1);
4985                         ret = -EIO;
4986                         kfree(tmp_bbio);
4987                         goto out;
4988                 }
4989
4990                 kfree(tmp_bbio);
4991         } else if (mirror_num > map->num_stripes) {
4992                 mirror_num = 0;
4993         }
4994
4995         num_stripes = 1;
4996         stripe_index = 0;
4997         stripe_nr_orig = stripe_nr;
4998         stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
4999         do_div(stripe_nr_end, map->stripe_len);
5000         stripe_end_offset = stripe_nr_end * map->stripe_len -
5001                             (offset + *length);
5002
5003         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5004                 if (rw & REQ_DISCARD)
5005                         num_stripes = min_t(u64, map->num_stripes,
5006                                             stripe_nr_end - stripe_nr_orig);
5007                 stripe_index = do_div(stripe_nr, map->num_stripes);
5008         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
5009                 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
5010                         num_stripes = map->num_stripes;
5011                 else if (mirror_num)
5012                         stripe_index = mirror_num - 1;
5013                 else {
5014                         stripe_index = find_live_mirror(fs_info, map, 0,
5015                                             map->num_stripes,
5016                                             current->pid % map->num_stripes,
5017                                             dev_replace_is_ongoing);
5018                         mirror_num = stripe_index + 1;
5019                 }
5020
5021         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
5022                 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
5023                         num_stripes = map->num_stripes;
5024                 } else if (mirror_num) {
5025                         stripe_index = mirror_num - 1;
5026                 } else {
5027                         mirror_num = 1;
5028                 }
5029
5030         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5031                 int factor = map->num_stripes / map->sub_stripes;
5032
5033                 stripe_index = do_div(stripe_nr, factor);
5034                 stripe_index *= map->sub_stripes;
5035
5036                 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5037                         num_stripes = map->sub_stripes;
5038                 else if (rw & REQ_DISCARD)
5039                         num_stripes = min_t(u64, map->sub_stripes *
5040                                             (stripe_nr_end - stripe_nr_orig),
5041                                             map->num_stripes);
5042                 else if (mirror_num)
5043                         stripe_index += mirror_num - 1;
5044                 else {
5045                         int old_stripe_index = stripe_index;
5046                         stripe_index = find_live_mirror(fs_info, map,
5047                                               stripe_index,
5048                                               map->sub_stripes, stripe_index +
5049                                               current->pid % map->sub_stripes,
5050                                               dev_replace_is_ongoing);
5051                         mirror_num = stripe_index - old_stripe_index + 1;
5052                 }
5053
5054         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
5055                                 BTRFS_BLOCK_GROUP_RAID6)) {
5056                 u64 tmp;
5057
5058                 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
5059                     && raid_map_ret) {
5060                         int i, rot;
5061
5062                         /* push stripe_nr back to the start of the full stripe */
5063                         stripe_nr = raid56_full_stripe_start;
5064                         do_div(stripe_nr, stripe_len);
5065
5066                         stripe_index = do_div(stripe_nr, nr_data_stripes(map));
5067
5068                         /* RAID[56] write or recovery. Return all stripes */
5069                         num_stripes = map->num_stripes;
5070                         max_errors = nr_parity_stripes(map);
5071
5072                         raid_map = kmalloc_array(num_stripes, sizeof(u64),
5073                                            GFP_NOFS);
5074                         if (!raid_map) {
5075                                 ret = -ENOMEM;
5076                                 goto out;
5077                         }
5078
5079                         /* Work out the disk rotation on this stripe-set */
5080                         tmp = stripe_nr;
5081                         rot = do_div(tmp, num_stripes);
5082
5083                         /* Fill in the logical address of each stripe */
5084                         tmp = stripe_nr * nr_data_stripes(map);
5085                         for (i = 0; i < nr_data_stripes(map); i++)
5086                                 raid_map[(i+rot) % num_stripes] =
5087                                         em->start + (tmp + i) * map->stripe_len;
5088
5089                         raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5090                         if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5091                                 raid_map[(i+rot+1) % num_stripes] =
5092                                         RAID6_Q_STRIPE;
5093
5094                         *length = map->stripe_len;
5095                         stripe_index = 0;
5096                         stripe_offset = 0;
5097                 } else {
5098                         /*
5099                          * Mirror #0 or #1 means the original data block.
5100                          * Mirror #2 is RAID5 parity block.
5101                          * Mirror #3 is RAID6 Q block.
5102                          */
5103                         stripe_index = do_div(stripe_nr, nr_data_stripes(map));
5104                         if (mirror_num > 1)
5105                                 stripe_index = nr_data_stripes(map) +
5106                                                 mirror_num - 2;
5107
5108                         /* We distribute the parity blocks across stripes */
5109                         tmp = stripe_nr + stripe_index;
5110                         stripe_index = do_div(tmp, map->num_stripes);
5111                 }
5112         } else {
5113                 /*
5114                  * after this do_div call, stripe_nr is the number of stripes
5115                  * on this device we have to walk to find the data, and
5116                  * stripe_index is the number of our device in the stripe array
5117                  */
5118                 stripe_index = do_div(stripe_nr, map->num_stripes);
5119                 mirror_num = stripe_index + 1;
5120         }
5121         BUG_ON(stripe_index >= map->num_stripes);
5122
5123         num_alloc_stripes = num_stripes;
5124         if (dev_replace_is_ongoing) {
5125                 if (rw & (REQ_WRITE | REQ_DISCARD))
5126                         num_alloc_stripes <<= 1;
5127                 if (rw & REQ_GET_READ_MIRRORS)
5128                         num_alloc_stripes++;
5129         }
5130         bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
5131         if (!bbio) {
5132                 kfree(raid_map);
5133                 ret = -ENOMEM;
5134                 goto out;
5135         }
5136         atomic_set(&bbio->error, 0);
5137
5138         if (rw & REQ_DISCARD) {
5139                 int factor = 0;
5140                 int sub_stripes = 0;
5141                 u64 stripes_per_dev = 0;
5142                 u32 remaining_stripes = 0;
5143                 u32 last_stripe = 0;
5144
5145                 if (map->type &
5146                     (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
5147                         if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5148                                 sub_stripes = 1;
5149                         else
5150                                 sub_stripes = map->sub_stripes;
5151
5152                         factor = map->num_stripes / sub_stripes;
5153                         stripes_per_dev = div_u64_rem(stripe_nr_end -
5154                                                       stripe_nr_orig,
5155                                                       factor,
5156                                                       &remaining_stripes);
5157                         div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5158                         last_stripe *= sub_stripes;
5159                 }
5160
5161                 for (i = 0; i < num_stripes; i++) {
5162                         bbio->stripes[i].physical =
5163                                 map->stripes[stripe_index].physical +
5164                                 stripe_offset + stripe_nr * map->stripe_len;
5165                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5166
5167                         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5168                                          BTRFS_BLOCK_GROUP_RAID10)) {
5169                                 bbio->stripes[i].length = stripes_per_dev *
5170                                                           map->stripe_len;
5171
5172                                 if (i / sub_stripes < remaining_stripes)
5173                                         bbio->stripes[i].length +=
5174                                                 map->stripe_len;
5175
5176                                 /*
5177                                  * Special for the first stripe and
5178                                  * the last stripe:
5179                                  *
5180                                  * |-------|...|-------|
5181                                  *     |----------|
5182                                  *    off     end_off
5183                                  */
5184                                 if (i < sub_stripes)
5185                                         bbio->stripes[i].length -=
5186                                                 stripe_offset;
5187
5188                                 if (stripe_index >= last_stripe &&
5189                                     stripe_index <= (last_stripe +
5190                                                      sub_stripes - 1))
5191                                         bbio->stripes[i].length -=
5192                                                 stripe_end_offset;
5193
5194                                 if (i == sub_stripes - 1)
5195                                         stripe_offset = 0;
5196                         } else
5197                                 bbio->stripes[i].length = *length;
5198
5199                         stripe_index++;
5200                         if (stripe_index == map->num_stripes) {
5201                                 /* This could only happen for RAID0/10 */
5202                                 stripe_index = 0;
5203                                 stripe_nr++;
5204                         }
5205                 }
5206         } else {
5207                 for (i = 0; i < num_stripes; i++) {
5208                         bbio->stripes[i].physical =
5209                                 map->stripes[stripe_index].physical +
5210                                 stripe_offset +
5211                                 stripe_nr * map->stripe_len;
5212                         bbio->stripes[i].dev =
5213                                 map->stripes[stripe_index].dev;
5214                         stripe_index++;
5215                 }
5216         }
5217
5218         if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
5219                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5220                                  BTRFS_BLOCK_GROUP_RAID10 |
5221                                  BTRFS_BLOCK_GROUP_RAID5 |
5222                                  BTRFS_BLOCK_GROUP_DUP)) {
5223                         max_errors = 1;
5224                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5225                         max_errors = 2;
5226                 }
5227         }
5228
5229         if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5230             dev_replace->tgtdev != NULL) {
5231                 int index_where_to_add;
5232                 u64 srcdev_devid = dev_replace->srcdev->devid;
5233
5234                 /*
5235                  * duplicate the write operations while the dev replace
5236                  * procedure is running. Since the copying of the old disk
5237                  * to the new disk takes place at run time while the
5238                  * filesystem is mounted writable, the regular write
5239                  * operations to the old disk have to be duplicated to go
5240                  * to the new disk as well.
5241                  * Note that device->missing is handled by the caller, and
5242                  * that the write to the old disk is already set up in the
5243                  * stripes array.
5244                  */
5245                 index_where_to_add = num_stripes;
5246                 for (i = 0; i < num_stripes; i++) {
5247                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5248                                 /* write to new disk, too */
5249                                 struct btrfs_bio_stripe *new =
5250                                         bbio->stripes + index_where_to_add;
5251                                 struct btrfs_bio_stripe *old =
5252                                         bbio->stripes + i;
5253
5254                                 new->physical = old->physical;
5255                                 new->length = old->length;
5256                                 new->dev = dev_replace->tgtdev;
5257                                 index_where_to_add++;
5258                                 max_errors++;
5259                         }
5260                 }
5261                 num_stripes = index_where_to_add;
5262         } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
5263                    dev_replace->tgtdev != NULL) {
5264                 u64 srcdev_devid = dev_replace->srcdev->devid;
5265                 int index_srcdev = 0;
5266                 int found = 0;
5267                 u64 physical_of_found = 0;
5268
5269                 /*
5270                  * During the dev-replace procedure, the target drive can
5271                  * also be used to read data in case it is needed to repair
5272                  * a corrupt block elsewhere. This is possible if the
5273                  * requested area is left of the left cursor. In this area,
5274                  * the target drive is a full copy of the source drive.
5275                  */
5276                 for (i = 0; i < num_stripes; i++) {
5277                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5278                                 /*
5279                                  * In case of DUP, in order to keep it
5280                                  * simple, only add the mirror with the
5281                                  * lowest physical address
5282                                  */
5283                                 if (found &&
5284                                     physical_of_found <=
5285                                      bbio->stripes[i].physical)
5286                                         continue;
5287                                 index_srcdev = i;
5288                                 found = 1;
5289                                 physical_of_found = bbio->stripes[i].physical;
5290                         }
5291                 }
5292                 if (found) {
5293                         u64 length = map->stripe_len;
5294
5295                         if (physical_of_found + length <=
5296                             dev_replace->cursor_left) {
5297                                 struct btrfs_bio_stripe *tgtdev_stripe =
5298                                         bbio->stripes + num_stripes;
5299
5300                                 tgtdev_stripe->physical = physical_of_found;
5301                                 tgtdev_stripe->length =
5302                                         bbio->stripes[index_srcdev].length;
5303                                 tgtdev_stripe->dev = dev_replace->tgtdev;
5304
5305                                 num_stripes++;
5306                         }
5307                 }
5308         }
5309
5310         *bbio_ret = bbio;
5311         bbio->num_stripes = num_stripes;
5312         bbio->max_errors = max_errors;
5313         bbio->mirror_num = mirror_num;
5314
5315         /*
5316          * this is the case that REQ_READ && dev_replace_is_ongoing &&
5317          * mirror_num == num_stripes + 1 && dev_replace target drive is
5318          * available as a mirror
5319          */
5320         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
5321                 WARN_ON(num_stripes > 1);
5322                 bbio->stripes[0].dev = dev_replace->tgtdev;
5323                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
5324                 bbio->mirror_num = map->num_stripes + 1;
5325         }
5326         if (raid_map) {
5327                 sort_parity_stripes(bbio, raid_map);
5328                 *raid_map_ret = raid_map;
5329         }
5330 out:
5331         if (dev_replace_is_ongoing)
5332                 btrfs_dev_replace_unlock(dev_replace);
5333         free_extent_map(em);
5334         return ret;
5335 }
5336
5337 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5338                       u64 logical, u64 *length,
5339                       struct btrfs_bio **bbio_ret, int mirror_num)
5340 {
5341         return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5342                                  mirror_num, NULL);
5343 }
5344
5345 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5346                      u64 chunk_start, u64 physical, u64 devid,
5347                      u64 **logical, int *naddrs, int *stripe_len)
5348 {
5349         struct extent_map_tree *em_tree = &map_tree->map_tree;
5350         struct extent_map *em;
5351         struct map_lookup *map;
5352         u64 *buf;
5353         u64 bytenr;
5354         u64 length;
5355         u64 stripe_nr;
5356         u64 rmap_len;
5357         int i, j, nr = 0;
5358
5359         read_lock(&em_tree->lock);
5360         em = lookup_extent_mapping(em_tree, chunk_start, 1);
5361         read_unlock(&em_tree->lock);
5362
5363         if (!em) {
5364                 printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
5365                        chunk_start);
5366                 return -EIO;
5367         }
5368
5369         if (em->start != chunk_start) {
5370                 printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
5371                        em->start, chunk_start);
5372                 free_extent_map(em);
5373                 return -EIO;
5374         }
5375         map = (struct map_lookup *)em->bdev;
5376
5377         length = em->len;
5378         rmap_len = map->stripe_len;
5379
5380         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5381                 do_div(length, map->num_stripes / map->sub_stripes);
5382         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5383                 do_div(length, map->num_stripes);
5384         else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
5385                               BTRFS_BLOCK_GROUP_RAID6)) {
5386                 do_div(length, nr_data_stripes(map));
5387                 rmap_len = map->stripe_len * nr_data_stripes(map);
5388         }
5389
5390         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
5391         BUG_ON(!buf); /* -ENOMEM */
5392
5393         for (i = 0; i < map->num_stripes; i++) {
5394                 if (devid && map->stripes[i].dev->devid != devid)
5395                         continue;
5396                 if (map->stripes[i].physical > physical ||
5397                     map->stripes[i].physical + length <= physical)
5398                         continue;
5399
5400                 stripe_nr = physical - map->stripes[i].physical;
5401                 do_div(stripe_nr, map->stripe_len);
5402
5403                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5404                         stripe_nr = stripe_nr * map->num_stripes + i;
5405                         do_div(stripe_nr, map->sub_stripes);
5406                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5407                         stripe_nr = stripe_nr * map->num_stripes + i;
5408                 } /* else if RAID[56], multiply by nr_data_stripes().
5409                    * Alternatively, just use rmap_len below instead of
5410                    * map->stripe_len */
5411
5412                 bytenr = chunk_start + stripe_nr * rmap_len;
5413                 WARN_ON(nr >= map->num_stripes);
5414                 for (j = 0; j < nr; j++) {
5415                         if (buf[j] == bytenr)
5416                                 break;
5417                 }
5418                 if (j == nr) {
5419                         WARN_ON(nr >= map->num_stripes);
5420                         buf[nr++] = bytenr;
5421                 }
5422         }
5423
5424         *logical = buf;
5425         *naddrs = nr;
5426         *stripe_len = rmap_len;
5427
5428         free_extent_map(em);
5429         return 0;
5430 }
5431
5432 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
5433 {
5434         if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
5435                 bio_endio_nodec(bio, err);
5436         else
5437                 bio_endio(bio, err);
5438         kfree(bbio);
5439 }
5440
5441 static void btrfs_end_bio(struct bio *bio, int err)
5442 {
5443         struct btrfs_bio *bbio = bio->bi_private;
5444         struct btrfs_device *dev = bbio->stripes[0].dev;
5445         int is_orig_bio = 0;
5446
5447         if (err) {
5448                 atomic_inc(&bbio->error);
5449                 if (err == -EIO || err == -EREMOTEIO) {
5450                         unsigned int stripe_index =
5451                                 btrfs_io_bio(bio)->stripe_index;
5452
5453                         BUG_ON(stripe_index >= bbio->num_stripes);
5454                         dev = bbio->stripes[stripe_index].dev;
5455                         if (dev->bdev) {
5456                                 if (bio->bi_rw & WRITE)
5457                                         btrfs_dev_stat_inc(dev,
5458                                                 BTRFS_DEV_STAT_WRITE_ERRS);
5459                                 else
5460                                         btrfs_dev_stat_inc(dev,
5461                                                 BTRFS_DEV_STAT_READ_ERRS);
5462                                 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
5463                                         btrfs_dev_stat_inc(dev,
5464                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
5465                                 btrfs_dev_stat_print_on_error(dev);
5466                         }
5467                 }
5468         }
5469
5470         if (bio == bbio->orig_bio)
5471                 is_orig_bio = 1;
5472
5473         btrfs_bio_counter_dec(bbio->fs_info);
5474
5475         if (atomic_dec_and_test(&bbio->stripes_pending)) {
5476                 if (!is_orig_bio) {
5477                         bio_put(bio);
5478                         bio = bbio->orig_bio;
5479                 }
5480
5481                 bio->bi_private = bbio->private;
5482                 bio->bi_end_io = bbio->end_io;
5483                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5484                 /* only send an error to the higher layers if it is
5485                  * beyond the tolerance of the btrfs bio
5486                  */
5487                 if (atomic_read(&bbio->error) > bbio->max_errors) {
5488                         err = -EIO;
5489                 } else {
5490                         /*
5491                          * this bio is actually up to date, we didn't
5492                          * go over the max number of errors
5493                          */
5494                         set_bit(BIO_UPTODATE, &bio->bi_flags);
5495                         err = 0;
5496                 }
5497
5498                 btrfs_end_bbio(bbio, bio, err);
5499         } else if (!is_orig_bio) {
5500                 bio_put(bio);
5501         }
5502 }
5503
5504 /*
5505  * see run_scheduled_bios for a description of why bios are collected for
5506  * async submit.
5507  *
5508  * This will add one bio to the pending list for a device and make sure
5509  * the work struct is scheduled.
5510  */
5511 static noinline void btrfs_schedule_bio(struct btrfs_root *root,
5512                                         struct btrfs_device *device,
5513                                         int rw, struct bio *bio)
5514 {
5515         int should_queue = 1;
5516         struct btrfs_pending_bios *pending_bios;
5517
5518         if (device->missing || !device->bdev) {
5519                 bio_endio(bio, -EIO);
5520                 return;
5521         }
5522
5523         /* don't bother with additional async steps for reads, right now */
5524         if (!(rw & REQ_WRITE)) {
5525                 bio_get(bio);
5526                 btrfsic_submit_bio(rw, bio);
5527                 bio_put(bio);
5528                 return;
5529         }
5530
5531         /*
5532          * nr_async_bios allows us to reliably return congestion to the
5533          * higher layers.  Otherwise, the async bio makes it appear we have
5534          * made progress against dirty pages when we've really just put it
5535          * on a queue for later
5536          */
5537         atomic_inc(&root->fs_info->nr_async_bios);
5538         WARN_ON(bio->bi_next);
5539         bio->bi_next = NULL;
5540         bio->bi_rw |= rw;
5541
5542         spin_lock(&device->io_lock);
5543         if (bio->bi_rw & REQ_SYNC)
5544                 pending_bios = &device->pending_sync_bios;
5545         else
5546                 pending_bios = &device->pending_bios;
5547
5548         if (pending_bios->tail)
5549                 pending_bios->tail->bi_next = bio;
5550
5551         pending_bios->tail = bio;
5552         if (!pending_bios->head)
5553                 pending_bios->head = bio;
5554         if (device->running_pending)
5555                 should_queue = 0;
5556
5557         spin_unlock(&device->io_lock);
5558
5559         if (should_queue)
5560                 btrfs_queue_work(root->fs_info->submit_workers,
5561                                  &device->work);
5562 }
5563
5564 static int bio_size_ok(struct block_device *bdev, struct bio *bio,
5565                        sector_t sector)
5566 {
5567         struct bio_vec *prev;
5568         struct request_queue *q = bdev_get_queue(bdev);
5569         unsigned int max_sectors = queue_max_sectors(q);
5570         struct bvec_merge_data bvm = {
5571                 .bi_bdev = bdev,
5572                 .bi_sector = sector,
5573                 .bi_rw = bio->bi_rw,
5574         };
5575
5576         if (WARN_ON(bio->bi_vcnt == 0))
5577                 return 1;
5578
5579         prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
5580         if (bio_sectors(bio) > max_sectors)
5581                 return 0;
5582
5583         if (!q->merge_bvec_fn)
5584                 return 1;
5585
5586         bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
5587         if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
5588                 return 0;
5589         return 1;
5590 }
5591
5592 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5593                               struct bio *bio, u64 physical, int dev_nr,
5594                               int rw, int async)
5595 {
5596         struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
5597
5598         bio->bi_private = bbio;
5599         btrfs_io_bio(bio)->stripe_index = dev_nr;
5600         bio->bi_end_io = btrfs_end_bio;
5601         bio->bi_iter.bi_sector = physical >> 9;
5602 #ifdef DEBUG
5603         {
5604                 struct rcu_string *name;
5605
5606                 rcu_read_lock();
5607                 name = rcu_dereference(dev->name);
5608                 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
5609                          "(%s id %llu), size=%u\n", rw,
5610                          (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
5611                          name->str, dev->devid, bio->bi_size);
5612                 rcu_read_unlock();
5613         }
5614 #endif
5615         bio->bi_bdev = dev->bdev;
5616
5617         btrfs_bio_counter_inc_noblocked(root->fs_info);
5618
5619         if (async)
5620                 btrfs_schedule_bio(root, dev, rw, bio);
5621         else
5622                 btrfsic_submit_bio(rw, bio);
5623 }
5624
5625 static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5626                               struct bio *first_bio, struct btrfs_device *dev,
5627                               int dev_nr, int rw, int async)
5628 {
5629         struct bio_vec *bvec = first_bio->bi_io_vec;
5630         struct bio *bio;
5631         int nr_vecs = bio_get_nr_vecs(dev->bdev);
5632         u64 physical = bbio->stripes[dev_nr].physical;
5633
5634 again:
5635         bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
5636         if (!bio)
5637                 return -ENOMEM;
5638
5639         while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
5640                 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5641                                  bvec->bv_offset) < bvec->bv_len) {
5642                         u64 len = bio->bi_iter.bi_size;
5643
5644                         atomic_inc(&bbio->stripes_pending);
5645                         submit_stripe_bio(root, bbio, bio, physical, dev_nr,
5646                                           rw, async);
5647                         physical += len;
5648                         goto again;
5649                 }
5650                 bvec++;
5651         }
5652
5653         submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
5654         return 0;
5655 }
5656
5657 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
5658 {
5659         atomic_inc(&bbio->error);
5660         if (atomic_dec_and_test(&bbio->stripes_pending)) {
5661                 /* Shoud be the original bio. */
5662                 WARN_ON(bio != bbio->orig_bio);
5663
5664                 bio->bi_private = bbio->private;
5665                 bio->bi_end_io = bbio->end_io;
5666                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5667                 bio->bi_iter.bi_sector = logical >> 9;
5668
5669                 btrfs_end_bbio(bbio, bio, -EIO);
5670         }
5671 }
5672
5673 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5674                   int mirror_num, int async_submit)
5675 {
5676         struct btrfs_device *dev;
5677         struct bio *first_bio = bio;
5678         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
5679         u64 length = 0;
5680         u64 map_length;
5681         u64 *raid_map = NULL;
5682         int ret;
5683         int dev_nr = 0;
5684         int total_devs = 1;
5685         struct btrfs_bio *bbio = NULL;
5686
5687         length = bio->bi_iter.bi_size;
5688         map_length = length;
5689
5690         btrfs_bio_counter_inc_blocked(root->fs_info);
5691         ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5692                               mirror_num, &raid_map);
5693         if (ret) {
5694                 btrfs_bio_counter_dec(root->fs_info);
5695                 return ret;
5696         }
5697
5698         total_devs = bbio->num_stripes;
5699         bbio->orig_bio = first_bio;
5700         bbio->private = first_bio->bi_private;
5701         bbio->end_io = first_bio->bi_end_io;
5702         bbio->fs_info = root->fs_info;
5703         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5704
5705         if (raid_map) {
5706                 /* In this case, map_length has been set to the length of
5707                    a single stripe; not the whole write */
5708                 if (rw & WRITE) {
5709                         ret = raid56_parity_write(root, bio, bbio,
5710                                                   raid_map, map_length);
5711                 } else {
5712                         ret = raid56_parity_recover(root, bio, bbio,
5713                                                     raid_map, map_length,
5714                                                     mirror_num);
5715                 }
5716                 /*
5717                  * FIXME, replace dosen't support raid56 yet, please fix
5718                  * it in the future.
5719                  */
5720                 btrfs_bio_counter_dec(root->fs_info);
5721                 return ret;
5722         }
5723
5724         if (map_length < length) {
5725                 btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
5726                         logical, length, map_length);
5727                 BUG();
5728         }
5729
5730         while (dev_nr < total_devs) {
5731                 dev = bbio->stripes[dev_nr].dev;
5732                 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
5733                         bbio_error(bbio, first_bio, logical);
5734                         dev_nr++;
5735                         continue;
5736                 }
5737
5738                 /*
5739                  * Check and see if we're ok with this bio based on it's size
5740                  * and offset with the given device.
5741                  */
5742                 if (!bio_size_ok(dev->bdev, first_bio,
5743                                  bbio->stripes[dev_nr].physical >> 9)) {
5744                         ret = breakup_stripe_bio(root, bbio, first_bio, dev,
5745                                                  dev_nr, rw, async_submit);
5746                         BUG_ON(ret);
5747                         dev_nr++;
5748                         continue;
5749                 }
5750
5751                 if (dev_nr < total_devs - 1) {
5752                         bio = btrfs_bio_clone(first_bio, GFP_NOFS);
5753                         BUG_ON(!bio); /* -ENOMEM */
5754                 } else {
5755                         bio = first_bio;
5756                         bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
5757                 }
5758
5759                 submit_stripe_bio(root, bbio, bio,
5760                                   bbio->stripes[dev_nr].physical, dev_nr, rw,
5761                                   async_submit);
5762                 dev_nr++;
5763         }
5764         btrfs_bio_counter_dec(root->fs_info);
5765         return 0;
5766 }
5767
5768 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
5769                                        u8 *uuid, u8 *fsid)
5770 {
5771         struct btrfs_device *device;
5772         struct btrfs_fs_devices *cur_devices;
5773
5774         cur_devices = fs_info->fs_devices;
5775         while (cur_devices) {
5776                 if (!fsid ||
5777                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5778                         device = __find_device(&cur_devices->devices,
5779                                                devid, uuid);
5780                         if (device)
5781                                 return device;
5782                 }
5783                 cur_devices = cur_devices->seed;
5784         }
5785         return NULL;
5786 }
5787
5788 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5789                                             u64 devid, u8 *dev_uuid)
5790 {
5791         struct btrfs_device *device;
5792         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5793
5794         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
5795         if (IS_ERR(device))
5796                 return NULL;
5797
5798         list_add(&device->dev_list, &fs_devices->devices);
5799         device->fs_devices = fs_devices;
5800         fs_devices->num_devices++;
5801
5802         device->missing = 1;
5803         fs_devices->missing_devices++;
5804
5805         return device;
5806 }
5807
5808 /**
5809  * btrfs_alloc_device - allocate struct btrfs_device
5810  * @fs_info:    used only for generating a new devid, can be NULL if
5811  *              devid is provided (i.e. @devid != NULL).
5812  * @devid:      a pointer to devid for this device.  If NULL a new devid
5813  *              is generated.
5814  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
5815  *              is generated.
5816  *
5817  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
5818  * on error.  Returned struct is not linked onto any lists and can be
5819  * destroyed with kfree() right away.
5820  */
5821 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5822                                         const u64 *devid,
5823                                         const u8 *uuid)
5824 {
5825         struct btrfs_device *dev;
5826         u64 tmp;
5827
5828         if (WARN_ON(!devid && !fs_info))
5829                 return ERR_PTR(-EINVAL);
5830
5831         dev = __alloc_device();
5832         if (IS_ERR(dev))
5833                 return dev;
5834
5835         if (devid)
5836                 tmp = *devid;
5837         else {
5838                 int ret;
5839
5840                 ret = find_next_devid(fs_info, &tmp);
5841                 if (ret) {
5842                         kfree(dev);
5843                         return ERR_PTR(ret);
5844                 }
5845         }
5846         dev->devid = tmp;
5847
5848         if (uuid)
5849                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
5850         else
5851                 generate_random_uuid(dev->uuid);
5852
5853         btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
5854
5855         return dev;
5856 }
5857
5858 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5859                           struct extent_buffer *leaf,
5860                           struct btrfs_chunk *chunk)
5861 {
5862         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5863         struct map_lookup *map;
5864         struct extent_map *em;
5865         u64 logical;
5866         u64 length;
5867         u64 devid;
5868         u8 uuid[BTRFS_UUID_SIZE];
5869         int num_stripes;
5870         int ret;
5871         int i;
5872
5873         logical = key->offset;
5874         length = btrfs_chunk_length(leaf, chunk);
5875
5876         read_lock(&map_tree->map_tree.lock);
5877         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
5878         read_unlock(&map_tree->map_tree.lock);
5879
5880         /* already mapped? */
5881         if (em && em->start <= logical && em->start + em->len > logical) {
5882                 free_extent_map(em);
5883                 return 0;
5884         } else if (em) {
5885                 free_extent_map(em);
5886         }
5887
5888         em = alloc_extent_map();
5889         if (!em)
5890                 return -ENOMEM;
5891         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
5892         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
5893         if (!map) {
5894                 free_extent_map(em);
5895                 return -ENOMEM;
5896         }
5897
5898         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5899         em->bdev = (struct block_device *)map;
5900         em->start = logical;
5901         em->len = length;
5902         em->orig_start = 0;
5903         em->block_start = 0;
5904         em->block_len = em->len;
5905
5906         map->num_stripes = num_stripes;
5907         map->io_width = btrfs_chunk_io_width(leaf, chunk);
5908         map->io_align = btrfs_chunk_io_align(leaf, chunk);
5909         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
5910         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
5911         map->type = btrfs_chunk_type(leaf, chunk);
5912         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
5913         for (i = 0; i < num_stripes; i++) {
5914                 map->stripes[i].physical =
5915                         btrfs_stripe_offset_nr(leaf, chunk, i);
5916                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
5917                 read_extent_buffer(leaf, uuid, (unsigned long)
5918                                    btrfs_stripe_dev_uuid_nr(chunk, i),
5919                                    BTRFS_UUID_SIZE);
5920                 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
5921                                                         uuid, NULL);
5922                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
5923                         free_extent_map(em);
5924                         return -EIO;
5925                 }
5926                 if (!map->stripes[i].dev) {
5927                         map->stripes[i].dev =
5928                                 add_missing_dev(root, devid, uuid);
5929                         if (!map->stripes[i].dev) {
5930                                 free_extent_map(em);
5931                                 return -EIO;
5932                         }
5933                 }
5934                 map->stripes[i].dev->in_fs_metadata = 1;
5935         }
5936
5937         write_lock(&map_tree->map_tree.lock);
5938         ret = add_extent_mapping(&map_tree->map_tree, em, 0);
5939         write_unlock(&map_tree->map_tree.lock);
5940         BUG_ON(ret); /* Tree corruption */
5941         free_extent_map(em);
5942
5943         return 0;
5944 }
5945
5946 static void fill_device_from_item(struct extent_buffer *leaf,
5947                                  struct btrfs_dev_item *dev_item,
5948                                  struct btrfs_device *device)
5949 {
5950         unsigned long ptr;
5951
5952         device->devid = btrfs_device_id(leaf, dev_item);
5953         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
5954         device->total_bytes = device->disk_total_bytes;
5955         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
5956         device->type = btrfs_device_type(leaf, dev_item);
5957         device->io_align = btrfs_device_io_align(leaf, dev_item);
5958         device->io_width = btrfs_device_io_width(leaf, dev_item);
5959         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5960         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5961         device->is_tgtdev_for_dev_replace = 0;
5962
5963         ptr = btrfs_device_uuid(dev_item);
5964         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
5965 }
5966
5967 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
5968 {
5969         struct btrfs_fs_devices *fs_devices;
5970         int ret;
5971
5972         BUG_ON(!mutex_is_locked(&uuid_mutex));
5973
5974         fs_devices = root->fs_info->fs_devices->seed;
5975         while (fs_devices) {
5976                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5977                         ret = 0;
5978                         goto out;
5979                 }
5980                 fs_devices = fs_devices->seed;
5981         }
5982
5983         fs_devices = find_fsid(fsid);
5984         if (!fs_devices) {
5985                 ret = -ENOENT;
5986                 goto out;
5987         }
5988
5989         fs_devices = clone_fs_devices(fs_devices);
5990         if (IS_ERR(fs_devices)) {
5991                 ret = PTR_ERR(fs_devices);
5992                 goto out;
5993         }
5994
5995         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
5996                                    root->fs_info->bdev_holder);
5997         if (ret) {
5998                 free_fs_devices(fs_devices);
5999                 goto out;
6000         }
6001
6002         if (!fs_devices->seeding) {
6003                 __btrfs_close_devices(fs_devices);
6004                 free_fs_devices(fs_devices);
6005                 ret = -EINVAL;
6006                 goto out;
6007         }
6008
6009         fs_devices->seed = root->fs_info->fs_devices->seed;
6010         root->fs_info->fs_devices->seed = fs_devices;
6011 out:
6012         return ret;
6013 }
6014
6015 static int read_one_dev(struct btrfs_root *root,
6016                         struct extent_buffer *leaf,
6017                         struct btrfs_dev_item *dev_item)
6018 {
6019         struct btrfs_device *device;
6020         u64 devid;
6021         int ret;
6022         u8 fs_uuid[BTRFS_UUID_SIZE];
6023         u8 dev_uuid[BTRFS_UUID_SIZE];
6024
6025         devid = btrfs_device_id(leaf, dev_item);
6026         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6027                            BTRFS_UUID_SIZE);
6028         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6029                            BTRFS_UUID_SIZE);
6030
6031         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
6032                 ret = open_seed_devices(root, fs_uuid);
6033                 if (ret && !btrfs_test_opt(root, DEGRADED))
6034                         return ret;
6035         }
6036
6037         device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
6038         if (!device || !device->bdev) {
6039                 if (!btrfs_test_opt(root, DEGRADED))
6040                         return -EIO;
6041
6042                 if (!device) {
6043                         btrfs_warn(root->fs_info, "devid %llu missing", devid);
6044                         device = add_missing_dev(root, devid, dev_uuid);
6045                         if (!device)
6046                                 return -ENOMEM;
6047                 } else if (!device->missing) {
6048                         /*
6049                          * this happens when a device that was properly setup
6050                          * in the device info lists suddenly goes bad.
6051                          * device->bdev is NULL, and so we have to set
6052                          * device->missing to one here
6053                          */
6054                         root->fs_info->fs_devices->missing_devices++;
6055                         device->missing = 1;
6056                 }
6057         }
6058
6059         if (device->fs_devices != root->fs_info->fs_devices) {
6060                 BUG_ON(device->writeable);
6061                 if (device->generation !=
6062                     btrfs_device_generation(leaf, dev_item))
6063                         return -EINVAL;
6064         }
6065
6066         fill_device_from_item(leaf, dev_item, device);
6067         device->in_fs_metadata = 1;
6068         if (device->writeable && !device->is_tgtdev_for_dev_replace) {
6069                 device->fs_devices->total_rw_bytes += device->total_bytes;
6070                 spin_lock(&root->fs_info->free_chunk_lock);
6071                 root->fs_info->free_chunk_space += device->total_bytes -
6072                         device->bytes_used;
6073                 spin_unlock(&root->fs_info->free_chunk_lock);
6074         }
6075         ret = 0;
6076         return ret;
6077 }
6078
6079 int btrfs_read_sys_array(struct btrfs_root *root)
6080 {
6081         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
6082         struct extent_buffer *sb;
6083         struct btrfs_disk_key *disk_key;
6084         struct btrfs_chunk *chunk;
6085         u8 *ptr;
6086         unsigned long sb_ptr;
6087         int ret = 0;
6088         u32 num_stripes;
6089         u32 array_size;
6090         u32 len = 0;
6091         u32 cur;
6092         struct btrfs_key key;
6093
6094         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
6095                                           BTRFS_SUPER_INFO_SIZE);
6096         if (!sb)
6097                 return -ENOMEM;
6098         btrfs_set_buffer_uptodate(sb);
6099         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6100         /*
6101          * The sb extent buffer is artifical and just used to read the system array.
6102          * btrfs_set_buffer_uptodate() call does not properly mark all it's
6103          * pages up-to-date when the page is larger: extent does not cover the
6104          * whole page and consequently check_page_uptodate does not find all
6105          * the page's extents up-to-date (the hole beyond sb),
6106          * write_extent_buffer then triggers a WARN_ON.
6107          *
6108          * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
6109          * but sb spans only this function. Add an explicit SetPageUptodate call
6110          * to silence the warning eg. on PowerPC 64.
6111          */
6112         if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
6113                 SetPageUptodate(sb->pages[0]);
6114
6115         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6116         array_size = btrfs_super_sys_array_size(super_copy);
6117
6118         ptr = super_copy->sys_chunk_array;
6119         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
6120         cur = 0;
6121
6122         while (cur < array_size) {
6123                 disk_key = (struct btrfs_disk_key *)ptr;
6124                 btrfs_disk_key_to_cpu(&key, disk_key);
6125
6126                 len = sizeof(*disk_key); ptr += len;
6127                 sb_ptr += len;
6128                 cur += len;
6129
6130                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6131                         chunk = (struct btrfs_chunk *)sb_ptr;
6132                         ret = read_one_chunk(root, &key, sb, chunk);
6133                         if (ret)
6134                                 break;
6135                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6136                         len = btrfs_chunk_item_size(num_stripes);
6137                 } else {
6138                         ret = -EIO;
6139                         break;
6140                 }
6141                 ptr += len;
6142                 sb_ptr += len;
6143                 cur += len;
6144         }
6145         free_extent_buffer(sb);
6146         return ret;
6147 }
6148
6149 int btrfs_read_chunk_tree(struct btrfs_root *root)
6150 {
6151         struct btrfs_path *path;
6152         struct extent_buffer *leaf;
6153         struct btrfs_key key;
6154         struct btrfs_key found_key;
6155         int ret;
6156         int slot;
6157
6158         root = root->fs_info->chunk_root;
6159
6160         path = btrfs_alloc_path();
6161         if (!path)
6162                 return -ENOMEM;
6163
6164         mutex_lock(&uuid_mutex);
6165         lock_chunks(root);
6166
6167         /*
6168          * Read all device items, and then all the chunk items. All
6169          * device items are found before any chunk item (their object id
6170          * is smaller than the lowest possible object id for a chunk
6171          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
6172          */
6173         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
6174         key.offset = 0;
6175         key.type = 0;
6176         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6177         if (ret < 0)
6178                 goto error;
6179         while (1) {
6180                 leaf = path->nodes[0];
6181                 slot = path->slots[0];
6182                 if (slot >= btrfs_header_nritems(leaf)) {
6183                         ret = btrfs_next_leaf(root, path);
6184                         if (ret == 0)
6185                                 continue;
6186                         if (ret < 0)
6187                                 goto error;
6188                         break;
6189                 }
6190                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6191                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
6192                         struct btrfs_dev_item *dev_item;
6193                         dev_item = btrfs_item_ptr(leaf, slot,
6194                                                   struct btrfs_dev_item);
6195                         ret = read_one_dev(root, leaf, dev_item);
6196                         if (ret)
6197                                 goto error;
6198                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
6199                         struct btrfs_chunk *chunk;
6200                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6201                         ret = read_one_chunk(root, &found_key, leaf, chunk);
6202                         if (ret)
6203                                 goto error;
6204                 }
6205                 path->slots[0]++;
6206         }
6207         ret = 0;
6208 error:
6209         unlock_chunks(root);
6210         mutex_unlock(&uuid_mutex);
6211
6212         btrfs_free_path(path);
6213         return ret;
6214 }
6215
6216 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
6217 {
6218         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6219         struct btrfs_device *device;
6220
6221         while (fs_devices) {
6222                 mutex_lock(&fs_devices->device_list_mutex);
6223                 list_for_each_entry(device, &fs_devices->devices, dev_list)
6224                         device->dev_root = fs_info->dev_root;
6225                 mutex_unlock(&fs_devices->device_list_mutex);
6226
6227                 fs_devices = fs_devices->seed;
6228         }
6229 }
6230
6231 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
6232 {
6233         int i;
6234
6235         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6236                 btrfs_dev_stat_reset(dev, i);
6237 }
6238
6239 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
6240 {
6241         struct btrfs_key key;
6242         struct btrfs_key found_key;
6243         struct btrfs_root *dev_root = fs_info->dev_root;
6244         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6245         struct extent_buffer *eb;
6246         int slot;
6247         int ret = 0;
6248         struct btrfs_device *device;
6249         struct btrfs_path *path = NULL;
6250         int i;
6251
6252         path = btrfs_alloc_path();
6253         if (!path) {
6254                 ret = -ENOMEM;
6255                 goto out;
6256         }
6257
6258         mutex_lock(&fs_devices->device_list_mutex);
6259         list_for_each_entry(device, &fs_devices->devices, dev_list) {
6260                 int item_size;
6261                 struct btrfs_dev_stats_item *ptr;
6262
6263                 key.objectid = 0;
6264                 key.type = BTRFS_DEV_STATS_KEY;
6265                 key.offset = device->devid;
6266                 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
6267                 if (ret) {
6268                         __btrfs_reset_dev_stats(device);
6269                         device->dev_stats_valid = 1;
6270                         btrfs_release_path(path);
6271                         continue;
6272                 }
6273                 slot = path->slots[0];
6274                 eb = path->nodes[0];
6275                 btrfs_item_key_to_cpu(eb, &found_key, slot);
6276                 item_size = btrfs_item_size_nr(eb, slot);
6277
6278                 ptr = btrfs_item_ptr(eb, slot,
6279                                      struct btrfs_dev_stats_item);
6280
6281                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6282                         if (item_size >= (1 + i) * sizeof(__le64))
6283                                 btrfs_dev_stat_set(device, i,
6284                                         btrfs_dev_stats_value(eb, ptr, i));
6285                         else
6286                                 btrfs_dev_stat_reset(device, i);
6287                 }
6288
6289                 device->dev_stats_valid = 1;
6290                 btrfs_dev_stat_print_on_load(device);
6291                 btrfs_release_path(path);
6292         }
6293         mutex_unlock(&fs_devices->device_list_mutex);
6294
6295 out:
6296         btrfs_free_path(path);
6297         return ret < 0 ? ret : 0;
6298 }
6299
6300 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6301                                 struct btrfs_root *dev_root,
6302                                 struct btrfs_device *device)
6303 {
6304         struct btrfs_path *path;
6305         struct btrfs_key key;
6306         struct extent_buffer *eb;
6307         struct btrfs_dev_stats_item *ptr;
6308         int ret;
6309         int i;
6310
6311         key.objectid = 0;
6312         key.type = BTRFS_DEV_STATS_KEY;
6313         key.offset = device->devid;
6314
6315         path = btrfs_alloc_path();
6316         BUG_ON(!path);
6317         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
6318         if (ret < 0) {
6319                 printk_in_rcu(KERN_WARNING "BTRFS: "
6320                         "error %d while searching for dev_stats item for device %s!\n",
6321                               ret, rcu_str_deref(device->name));
6322                 goto out;
6323         }
6324
6325         if (ret == 0 &&
6326             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
6327                 /* need to delete old one and insert a new one */
6328                 ret = btrfs_del_item(trans, dev_root, path);
6329                 if (ret != 0) {
6330                         printk_in_rcu(KERN_WARNING "BTRFS: "
6331                                 "delete too small dev_stats item for device %s failed %d!\n",
6332                                       rcu_str_deref(device->name), ret);
6333                         goto out;
6334                 }
6335                 ret = 1;
6336         }
6337
6338         if (ret == 1) {
6339                 /* need to insert a new item */
6340                 btrfs_release_path(path);
6341                 ret = btrfs_insert_empty_item(trans, dev_root, path,
6342                                               &key, sizeof(*ptr));
6343                 if (ret < 0) {
6344                         printk_in_rcu(KERN_WARNING "BTRFS: "
6345                                           "insert dev_stats item for device %s failed %d!\n",
6346                                       rcu_str_deref(device->name), ret);
6347                         goto out;
6348                 }
6349         }
6350
6351         eb = path->nodes[0];
6352         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
6353         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6354                 btrfs_set_dev_stats_value(eb, ptr, i,
6355                                           btrfs_dev_stat_read(device, i));
6356         btrfs_mark_buffer_dirty(eb);
6357
6358 out:
6359         btrfs_free_path(path);
6360         return ret;
6361 }
6362
6363 /*
6364  * called from commit_transaction. Writes all changed device stats to disk.
6365  */
6366 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
6367                         struct btrfs_fs_info *fs_info)
6368 {
6369         struct btrfs_root *dev_root = fs_info->dev_root;
6370         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6371         struct btrfs_device *device;
6372         int ret = 0;
6373
6374         mutex_lock(&fs_devices->device_list_mutex);
6375         list_for_each_entry(device, &fs_devices->devices, dev_list) {
6376                 if (!device->dev_stats_valid || !device->dev_stats_dirty)
6377                         continue;
6378
6379                 ret = update_dev_stat_item(trans, dev_root, device);
6380                 if (!ret)
6381                         device->dev_stats_dirty = 0;
6382         }
6383         mutex_unlock(&fs_devices->device_list_mutex);
6384
6385         return ret;
6386 }
6387
6388 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
6389 {
6390         btrfs_dev_stat_inc(dev, index);
6391         btrfs_dev_stat_print_on_error(dev);
6392 }
6393
6394 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
6395 {
6396         if (!dev->dev_stats_valid)
6397                 return;
6398         printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
6399                            "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6400                            rcu_str_deref(dev->name),
6401                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6402                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6403                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6404                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6405                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6406 }
6407
6408 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
6409 {
6410         int i;
6411
6412         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6413                 if (btrfs_dev_stat_read(dev, i) != 0)
6414                         break;
6415         if (i == BTRFS_DEV_STAT_VALUES_MAX)
6416                 return; /* all values == 0, suppress message */
6417
6418         printk_in_rcu(KERN_INFO "BTRFS: "
6419                    "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6420                rcu_str_deref(dev->name),
6421                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6422                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6423                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6424                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6425                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6426 }
6427
6428 int btrfs_get_dev_stats(struct btrfs_root *root,
6429                         struct btrfs_ioctl_get_dev_stats *stats)
6430 {
6431         struct btrfs_device *dev;
6432         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6433         int i;
6434
6435         mutex_lock(&fs_devices->device_list_mutex);
6436         dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
6437         mutex_unlock(&fs_devices->device_list_mutex);
6438
6439         if (!dev) {
6440                 btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
6441                 return -ENODEV;
6442         } else if (!dev->dev_stats_valid) {
6443                 btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
6444                 return -ENODEV;
6445         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
6446                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6447                         if (stats->nr_items > i)
6448                                 stats->values[i] =
6449                                         btrfs_dev_stat_read_and_reset(dev, i);
6450                         else
6451                                 btrfs_dev_stat_reset(dev, i);
6452                 }
6453         } else {
6454                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6455                         if (stats->nr_items > i)
6456                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
6457         }
6458         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
6459                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
6460         return 0;
6461 }
6462
6463 int btrfs_scratch_superblock(struct btrfs_device *device)
6464 {
6465         struct buffer_head *bh;
6466         struct btrfs_super_block *disk_super;
6467
6468         bh = btrfs_read_dev_super(device->bdev);
6469         if (!bh)
6470                 return -EINVAL;
6471         disk_super = (struct btrfs_super_block *)bh->b_data;
6472
6473         memset(&disk_super->magic, 0, sizeof(disk_super->magic));
6474         set_buffer_dirty(bh);
6475         sync_dirty_buffer(bh);
6476         brelse(bh);
6477
6478         return 0;
6479 }