drivers/md/md-cluster.c

   1 /*
   2  * Copyright (C) 2015, SUSE
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2, or (at your option)
   7  * any later version.
   8  *
   9  */
  10
  11
  12 #include <linux/module.h>
  13 #include <linux/dlm.h>
  14 #include <linux/sched.h>
  15 #include <linux/raid/md_p.h>
  16 #include "md.h"
  17 #include "bitmap.h"
  18 #include "md-cluster.h"
  19
  20 #define LVB_SIZE        64
  21 #define NEW_DEV_TIMEOUT 5000
  22
  23 struct dlm_lock_resource {
  24         dlm_lockspace_t *ls;
  25         struct dlm_lksb lksb;
  26         char *name; /* lock name. */
  27         uint32_t flags; /* flags to pass to dlm_lock() */
  28         struct completion completion; /* completion for synchronized locking */
  29         void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
  30         struct mddev *mddev; /* pointing back to mddev. */
  31 };
  32
  33 struct suspend_info {
  34         int slot;
  35         sector_t lo;
  36         sector_t hi;
  37         struct list_head list;
  38 };
  39
  40 struct resync_info {
  41         __le64 lo;
  42         __le64 hi;
  43 };
  44
  45 /* md_cluster_info flags */
  46 #define         MD_CLUSTER_WAITING_FOR_NEWDISK          1
  47 #define         MD_CLUSTER_SUSPEND_READ_BALANCING       2
  48 #define         MD_CLUSTER_BEGIN_JOIN_CLUSTER           3
  49
  50
  51 struct md_cluster_info {
  52         /* dlm lock space and resources for clustered raid. */
  53         dlm_lockspace_t *lockspace;
  54         int slot_number;
  55         struct completion completion;
  56         struct mutex sb_mutex;
  57         struct dlm_lock_resource *bitmap_lockres;
  58         struct list_head suspend_list;
  59         spinlock_t suspend_lock;
  60         struct md_thread *recovery_thread;
  61         unsigned long recovery_map;
  62         /* communication loc resources */
  63         struct dlm_lock_resource *ack_lockres;
  64         struct dlm_lock_resource *message_lockres;
  65         struct dlm_lock_resource *token_lockres;
  66         struct dlm_lock_resource *no_new_dev_lockres;
  67         struct md_thread *recv_thread;
  68         struct completion newdisk_completion;
  69         unsigned long state;
  70 };
  71
  72 enum msg_type {
  73         METADATA_UPDATED = 0,
  74         RESYNCING,
  75         NEWDISK,
  76         REMOVE,
  77         RE_ADD,
  78         BITMAP_NEEDS_SYNC,
  79 };
  80
  81 struct cluster_msg {
  82         int type;
  83         int slot;
  84         /* TODO: Unionize this for smaller footprint */
  85         sector_t low;
  86         sector_t high;
  87         char uuid[16];
  88         int raid_slot;
  89 };
  90
  91 static void sync_ast(void *arg)
  92 {
  93         struct dlm_lock_resource *res;
  94
  95         res = (struct dlm_lock_resource *) arg;
  96         complete(&res->completion);
  97 }
  98
  99 static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
 100 {
 101         int ret = 0;
 102
 103         ret = dlm_lock(res->ls, mode, &res->lksb,
 104                         res->flags, res->name, strlen(res->name),
 105                         0, sync_ast, res, res->bast);
 106         if (ret)
 107                 return ret;
 108         wait_for_completion(&res->completion);
 109         return res->lksb.sb_status;
 110 }
 111
 112 static int dlm_unlock_sync(struct dlm_lock_resource *res)
 113 {
 114         return dlm_lock_sync(res, DLM_LOCK_NL);
 115 }
 116
 117 static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
 118                 char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
 119 {
 120         struct dlm_lock_resource *res = NULL;
 121         int ret, namelen;
 122         struct md_cluster_info *cinfo = mddev->cluster_info;
 123
 124         res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
 125         if (!res)
 126                 return NULL;
 127         init_completion(&res->completion);
 128         res->ls = cinfo->lockspace;
 129         res->mddev = mddev;
 130         namelen = strlen(name);
 131         res->name = kzalloc(namelen + 1, GFP_KERNEL);
 132         if (!res->name) {
 133                 pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
 134                 goto out_err;
 135         }
 136         strlcpy(res->name, name, namelen + 1);
 137         if (with_lvb) {
 138                 res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
 139                 if (!res->lksb.sb_lvbptr) {
 140                         pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
 141                         goto out_err;
 142                 }
 143                 res->flags = DLM_LKF_VALBLK;
 144         }
 145
 146         if (bastfn)
 147                 res->bast = bastfn;
 148
 149         res->flags |= DLM_LKF_EXPEDITE;
 150
 151         ret = dlm_lock_sync(res, DLM_LOCK_NL);
 152         if (ret) {
 153                 pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
 154                 goto out_err;
 155         }
 156         res->flags &= ~DLM_LKF_EXPEDITE;
 157         res->flags |= DLM_LKF_CONVERT;
 158
 159         return res;
 160 out_err:
 161         kfree(res->lksb.sb_lvbptr);
 162         kfree(res->name);
 163         kfree(res);
 164         return NULL;
 165 }
 166
 167 static void lockres_free(struct dlm_lock_resource *res)
 168 {
 169         int ret;
 170
 171         if (!res)
 172                 return;
 173
 174         /* cancel a lock request or a conversion request that is blocked */
 175         res->flags |= DLM_LKF_CANCEL;
 176 retry:
 177         ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
 178         if (unlikely(ret != 0)) {
 179                 pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);
 180
 181                 /* if a lock conversion is cancelled, then the lock is put
 182                  * back to grant queue, need to ensure it is unlocked */
 183                 if (ret == -DLM_ECANCEL)
 184                         goto retry;
 185         }
 186         res->flags &= ~DLM_LKF_CANCEL;
 187         wait_for_completion(&res->completion);
 188
 189         kfree(res->name);
 190         kfree(res->lksb.sb_lvbptr);
 191         kfree(res);
 192 }
 193
 194 static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
 195                 sector_t lo, sector_t hi)
 196 {
 197         struct resync_info *ri;
 198
 199         ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
 200         ri->lo = cpu_to_le64(lo);
 201         ri->hi = cpu_to_le64(hi);
 202 }
 203
 204 static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
 205 {
 206         struct resync_info ri;
 207         struct suspend_info *s = NULL;
 208         sector_t hi = 0;
 209
 210         dlm_lock_sync(lockres, DLM_LOCK_CR);
 211         memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
 212         hi = le64_to_cpu(ri.hi);
 213         if (ri.hi > 0) {
 214                 s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
 215                 if (!s)
 216                         goto out;
 217                 s->hi = hi;
 218                 s->lo = le64_to_cpu(ri.lo);
 219         }
 220         dlm_unlock_sync(lockres);
 221 out:
 222         return s;
 223 }
 224
 225 static void recover_bitmaps(struct md_thread *thread)
 226 {
 227         struct mddev *mddev = thread->mddev;
 228         struct md_cluster_info *cinfo = mddev->cluster_info;
 229         struct dlm_lock_resource *bm_lockres;
 230         char str[64];
 231         int slot, ret;
 232         struct suspend_info *s, *tmp;
 233         sector_t lo, hi;
 234
 235         while (cinfo->recovery_map) {
 236                 slot = fls64((u64)cinfo->recovery_map) - 1;
 237
 238                 /* Clear suspend_area associated with the bitmap */
 239                 spin_lock_irq(&cinfo->suspend_lock);
 240                 list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
 241                         if (slot == s->slot) {
 242                                 list_del(&s->list);
 243                                 kfree(s);
 244                         }
 245                 spin_unlock_irq(&cinfo->suspend_lock);
 246
 247                 snprintf(str, 64, "bitmap%04d", slot);
 248                 bm_lockres = lockres_init(mddev, str, NULL, 1);
 249                 if (!bm_lockres) {
 250                         pr_err("md-cluster: Cannot initialize bitmaps\n");
 251                         goto clear_bit;
 252                 }
 253
 254                 ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
 255                 if (ret) {
 256                         pr_err("md-cluster: Could not DLM lock %s: %d\n",
 257                                         str, ret);
 258                         goto clear_bit;
 259                 }
 260                 ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
 261                 if (ret) {
 262                         pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
 263                         goto dlm_unlock;
 264                 }
 265                 if (hi > 0) {
 266                         /* TODO:Wait for current resync to get over */
 267                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 268                         if (lo < mddev->recovery_cp)
 269                                 mddev->recovery_cp = lo;
 270                         md_check_recovery(mddev);
 271                 }
 272 dlm_unlock:
 273                 dlm_unlock_sync(bm_lockres);
 274 clear_bit:
 275                 clear_bit(slot, &cinfo->recovery_map);
 276         }
 277 }
 278
 279 static void recover_prep(void *arg)
 280 {
 281         struct mddev *mddev = arg;
 282         struct md_cluster_info *cinfo = mddev->cluster_info;
 283         set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 284 }
 285
 286 static void __recover_slot(struct mddev *mddev, int slot)
 287 {
 288         struct md_cluster_info *cinfo = mddev->cluster_info;
 289
 290         set_bit(slot, &cinfo->recovery_map);
 291         if (!cinfo->recovery_thread) {
 292                 cinfo->recovery_thread = md_register_thread(recover_bitmaps,
 293                                 mddev, "recover");
 294                 if (!cinfo->recovery_thread) {
 295                         pr_warn("md-cluster: Could not create recovery thread\n");
 296                         return;
 297                 }
 298         }
 299         md_wakeup_thread(cinfo->recovery_thread);
 300 }
 301
 302 static void recover_slot(void *arg, struct dlm_slot *slot)
 303 {
 304         struct mddev *mddev = arg;
 305         struct md_cluster_info *cinfo = mddev->cluster_info;
 306
 307         pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
 308                         mddev->bitmap_info.cluster_name,
 309                         slot->nodeid, slot->slot,
 310                         cinfo->slot_number);
 311         /* deduct one since dlm slot starts from one while the num of
 312          * cluster-md begins with 0 */
 313         __recover_slot(mddev, slot->slot - 1);
 314 }
 315
 316 static void recover_done(void *arg, struct dlm_slot *slots,
 317                 int num_slots, int our_slot,
 318                 uint32_t generation)
 319 {
 320         struct mddev *mddev = arg;
 321         struct md_cluster_info *cinfo = mddev->cluster_info;
 322
 323         cinfo->slot_number = our_slot;
 324         /* completion is only need to be complete when node join cluster,
 325          * it doesn't need to run during another node's failure */
 326         if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
 327                 complete(&cinfo->completion);
 328                 clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
 329         }
 330         clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 331 }
 332
 333 /* the ops is called when node join the cluster, and do lock recovery
 334  * if node failure occurs */
 335 static const struct dlm_lockspace_ops md_ls_ops = {
 336         .recover_prep = recover_prep,
 337         .recover_slot = recover_slot,
 338         .recover_done = recover_done,
 339 };
 340
 341 /*
 342  * The BAST function for the ack lock resource
 343  * This function wakes up the receive thread in
 344  * order to receive and process the message.
 345  */
 346 static void ack_bast(void *arg, int mode)
 347 {
 348         struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
 349         struct md_cluster_info *cinfo = res->mddev->cluster_info;
 350
 351         if (mode == DLM_LOCK_EX)
 352                 md_wakeup_thread(cinfo->recv_thread);
 353 }
 354
 355 static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
 356 {
 357         struct suspend_info *s, *tmp;
 358
 359         list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
 360                 if (slot == s->slot) {
 361                         pr_info("%s:%d Deleting suspend_info: %d\n",
 362                                         __func__, __LINE__, slot);
 363                         list_del(&s->list);
 364                         kfree(s);
 365                         break;
 366                 }
 367 }
 368
 369 static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
 370 {
 371         spin_lock_irq(&cinfo->suspend_lock);
 372         __remove_suspend_info(cinfo, slot);
 373         spin_unlock_irq(&cinfo->suspend_lock);
 374 }
 375
 376
 377 static void process_suspend_info(struct md_cluster_info *cinfo,
 378                 int slot, sector_t lo, sector_t hi)
 379 {
 380         struct suspend_info *s;
 381
 382         if (!hi) {
 383                 remove_suspend_info(cinfo, slot);
 384                 return;
 385         }
 386         s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
 387         if (!s)
 388                 return;
 389         s->slot = slot;
 390         s->lo = lo;
 391         s->hi = hi;
 392         spin_lock_irq(&cinfo->suspend_lock);
 393         /* Remove existing entry (if exists) before adding */
 394         __remove_suspend_info(cinfo, slot);
 395         list_add(&s->list, &cinfo->suspend_list);
 396         spin_unlock_irq(&cinfo->suspend_lock);
 397 }
 398
 399 static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 400 {
 401         char disk_uuid[64];
 402         struct md_cluster_info *cinfo = mddev->cluster_info;
 403         char event_name[] = "EVENT=ADD_DEVICE";
 404         char raid_slot[16];
 405         char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
 406         int len;
 407
 408         len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
 409         sprintf(disk_uuid + len, "%pU", cmsg->uuid);
 410         snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
 411         pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
 412         init_completion(&cinfo->newdisk_completion);
 413         set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
 414         kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
 415         wait_for_completion_timeout(&cinfo->newdisk_completion,
 416                         NEW_DEV_TIMEOUT);
 417         clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
 418 }
 419
 420
 421 static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
 422 {
 423         struct md_cluster_info *cinfo = mddev->cluster_info;
 424
 425         md_reload_sb(mddev);
 426         dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
 427 }
 428
 429 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 430 {
 431         struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
 432
 433         if (rdev)
 434                 md_kick_rdev_from_array(rdev);
 435         else
 436                 pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
 437 }
 438
 439 static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
 440 {
 441         struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
 442
 443         if (rdev && test_bit(Faulty, &rdev->flags))
 444                 clear_bit(Faulty, &rdev->flags);
 445         else
 446                 pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
 447 }
 448
 449 static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 450 {
 451         switch (msg->type) {
 452         case METADATA_UPDATED:
 453                 pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
 454                         __func__, __LINE__, msg->slot);
 455                 process_metadata_update(mddev, msg);
 456                 break;
 457         case RESYNCING:
 458                 pr_info("%s: %d Received message: RESYNCING from %d\n",
 459                         __func__, __LINE__, msg->slot);
 460                 process_suspend_info(mddev->cluster_info, msg->slot,
 461                                 msg->low, msg->high);
 462                 break;
 463         case NEWDISK:
 464                 pr_info("%s: %d Received message: NEWDISK from %d\n",
 465                         __func__, __LINE__, msg->slot);
 466                 process_add_new_disk(mddev, msg);
 467                 break;
 468         case REMOVE:
 469                 pr_info("%s: %d Received REMOVE from %d\n",
 470                         __func__, __LINE__, msg->slot);
 471                 process_remove_disk(mddev, msg);
 472                 break;
 473         case RE_ADD:
 474                 pr_info("%s: %d Received RE_ADD from %d\n",
 475                         __func__, __LINE__, msg->slot);
 476                 process_readd_disk(mddev, msg);
 477                 break;
 478         case BITMAP_NEEDS_SYNC:
 479                 pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
 480                         __func__, __LINE__, msg->slot);
 481                 __recover_slot(mddev, msg->slot);
 482                 break;
 483         default:
 484                 pr_warn("%s:%d Received unknown message from %d\n",
 485                         __func__, __LINE__, msg->slot);
 486         }
 487 }
 488
 489 /*
 490  * thread for receiving message
 491  */
 492 static void recv_daemon(struct md_thread *thread)
 493 {
 494         struct md_cluster_info *cinfo = thread->mddev->cluster_info;
 495         struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
 496         struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
 497         struct cluster_msg msg;
 498         int ret;
 499
 500         /*get CR on Message*/
 501         if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
 502                 pr_err("md/raid1:failed to get CR on MESSAGE\n");
 503                 return;
 504         }
 505
 506         /* read lvb and wake up thread to process this message_lockres */
 507         memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
 508         process_recvd_msg(thread->mddev, &msg);
 509
 510         /*release CR on ack_lockres*/
 511         ret = dlm_unlock_sync(ack_lockres);
 512         if (unlikely(ret != 0))
 513                 pr_info("unlock ack failed return %d\n", ret);
 514         /*up-convert to PR on message_lockres*/
 515         ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
 516         if (unlikely(ret != 0))
 517                 pr_info("lock PR on msg failed return %d\n", ret);
 518         /*get CR on ack_lockres again*/
 519         ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
 520         if (unlikely(ret != 0))
 521                 pr_info("lock CR on ack failed return %d\n", ret);
 522         /*release CR on message_lockres*/
 523         ret = dlm_unlock_sync(message_lockres);
 524         if (unlikely(ret != 0))
 525                 pr_info("unlock msg failed return %d\n", ret);
 526 }
 527
 528 /* lock_comm()
 529  * Takes the lock on the TOKEN lock resource so no other
 530  * node can communicate while the operation is underway.
 531  */
 532 static int lock_comm(struct md_cluster_info *cinfo)
 533 {
 534         int error;
 535
 536         error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
 537         if (error)
 538                 pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
 539                                 __func__, __LINE__, error);
 540         return error;
 541 }
 542
 543 static void unlock_comm(struct md_cluster_info *cinfo)
 544 {
 545         dlm_unlock_sync(cinfo->token_lockres);
 546 }
 547
 548 /* __sendmsg()
 549  * This function performs the actual sending of the message. This function is
 550  * usually called after performing the encompassing operation
 551  * The function:
 552  * 1. Grabs the message lockresource in EX mode
 553  * 2. Copies the message to the message LVB
 554  * 3. Downconverts message lockresource to CW
 555  * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
 556  *    and the other nodes read the message. The thread will wait here until all other
 557  *    nodes have released ack lock resource.
 558  * 5. Downconvert ack lockresource to CR
 559  */
 560 static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 561 {
 562         int error;
 563         int slot = cinfo->slot_number - 1;
 564
 565         cmsg->slot = cpu_to_le32(slot);
 566         /*get EX on Message*/
 567         error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
 568         if (error) {
 569                 pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
 570                 goto failed_message;
 571         }
 572
 573         memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
 574                         sizeof(struct cluster_msg));
 575         /*down-convert EX to CW on Message*/
 576         error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
 577         if (error) {
 578                 pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
 579                                 error);
 580                 goto failed_ack;
 581         }
 582
 583         /*up-convert CR to EX on Ack*/
 584         error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
 585         if (error) {
 586                 pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
 587                                 error);
 588                 goto failed_ack;
 589         }
 590
 591         /*down-convert EX to CR on Ack*/
 592         error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
 593         if (error) {
 594                 pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
 595                                 error);
 596                 goto failed_ack;
 597         }
 598
 599 failed_ack:
 600         error = dlm_unlock_sync(cinfo->message_lockres);
 601         if (unlikely(error != 0)) {
 602                 pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
 603                         error);
 604                 /* in case the message can't be released due to some reason */
 605                 goto failed_ack;
 606         }
 607 failed_message:
 608         return error;
 609 }
 610
 611 static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 612 {
 613         int ret;
 614
 615         lock_comm(cinfo);
 616         ret = __sendmsg(cinfo, cmsg);
 617         unlock_comm(cinfo);
 618         return ret;
 619 }
 620
 621 static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 622 {
 623         struct md_cluster_info *cinfo = mddev->cluster_info;
 624         int i, ret = 0;
 625         struct dlm_lock_resource *bm_lockres;
 626         struct suspend_info *s;
 627         char str[64];
 628         sector_t lo, hi;
 629
 630
 631         for (i = 0; i < total_slots; i++) {
 632                 memset(str, '\0', 64);
 633                 snprintf(str, 64, "bitmap%04d", i);
 634                 bm_lockres = lockres_init(mddev, str, NULL, 1);
 635                 if (!bm_lockres)
 636                         return -ENOMEM;
 637                 if (i == (cinfo->slot_number - 1))
 638                         continue;
 639
 640                 bm_lockres->flags |= DLM_LKF_NOQUEUE;
 641                 ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
 642                 if (ret == -EAGAIN) {
 643                         memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
 644                         s = read_resync_info(mddev, bm_lockres);
 645                         if (s) {
 646                                 pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
 647                                                 __func__, __LINE__,
 648                                                 (unsigned long long) s->lo,
 649                                                 (unsigned long long) s->hi, i);
 650                                 spin_lock_irq(&cinfo->suspend_lock);
 651                                 s->slot = i;
 652                                 list_add(&s->list, &cinfo->suspend_list);
 653                                 spin_unlock_irq(&cinfo->suspend_lock);
 654                         }
 655                         ret = 0;
 656                         lockres_free(bm_lockres);
 657                         continue;
 658                 }
 659                 if (ret) {
 660                         lockres_free(bm_lockres);
 661                         goto out;
 662                 }
 663
 664                 /* Read the disk bitmap sb and check if it needs recovery */
 665                 ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
 666                 if (ret) {
 667                         pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
 668                         lockres_free(bm_lockres);
 669                         continue;
 670                 }
 671                 if ((hi > 0) && (lo < mddev->recovery_cp)) {
 672                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 673                         mddev->recovery_cp = lo;
 674                         md_check_recovery(mddev);
 675                 }
 676
 677                 dlm_unlock_sync(bm_lockres);
 678                 lockres_free(bm_lockres);
 679         }
 680 out:
 681         return ret;
 682 }
 683
 684 static int join(struct mddev *mddev, int nodes)
 685 {
 686         struct md_cluster_info *cinfo;
 687         int ret, ops_rv;
 688         char str[64];
 689
 690         cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
 691         if (!cinfo)
 692                 return -ENOMEM;
 693
 694         INIT_LIST_HEAD(&cinfo->suspend_list);
 695         spin_lock_init(&cinfo->suspend_lock);
 696         init_completion(&cinfo->completion);
 697         set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
 698
 699         mutex_init(&cinfo->sb_mutex);
 700         mddev->cluster_info = cinfo;
 701
 702         memset(str, 0, 64);
 703         sprintf(str, "%pU", mddev->uuid);
 704         ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
 705                                 DLM_LSFL_FS, LVB_SIZE,
 706                                 &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
 707         if (ret)
 708                 goto err;
 709         wait_for_completion(&cinfo->completion);
 710         if (nodes < cinfo->slot_number) {
 711                 pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).",
 712                         cinfo->slot_number, nodes);
 713                 ret = -ERANGE;
 714                 goto err;
 715         }
 716         /* Initiate the communication resources */
 717         ret = -ENOMEM;
 718         cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
 719         if (!cinfo->recv_thread) {
 720                 pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
 721                 goto err;
 722         }
 723         cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
 724         if (!cinfo->message_lockres)
 725                 goto err;
 726         cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
 727         if (!cinfo->token_lockres)
 728                 goto err;
 729         cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
 730         if (!cinfo->ack_lockres)
 731                 goto err;
 732         cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
 733         if (!cinfo->no_new_dev_lockres)
 734                 goto err;
 735
 736         /* get sync CR lock on ACK. */
 737         if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
 738                 pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
 739                                 ret);
 740         /* get sync CR lock on no-new-dev. */
 741         if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
 742                 pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);
 743
 744
 745         pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
 746         snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
 747         cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
 748         if (!cinfo->bitmap_lockres)
 749                 goto err;
 750         if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
 751                 pr_err("Failed to get bitmap lock\n");
 752                 ret = -EINVAL;
 753                 goto err;
 754         }
 755
 756         ret = gather_all_resync_info(mddev, nodes);
 757         if (ret)
 758                 goto err;
 759
 760         return 0;
 761 err:
 762         lockres_free(cinfo->message_lockres);
 763         lockres_free(cinfo->token_lockres);
 764         lockres_free(cinfo->ack_lockres);
 765         lockres_free(cinfo->no_new_dev_lockres);
 766         lockres_free(cinfo->bitmap_lockres);
 767         if (cinfo->lockspace)
 768                 dlm_release_lockspace(cinfo->lockspace, 2);
 769         mddev->cluster_info = NULL;
 770         kfree(cinfo);
 771         return ret;
 772 }
 773
 774 static int leave(struct mddev *mddev)
 775 {
 776         struct md_cluster_info *cinfo = mddev->cluster_info;
 777
 778         if (!cinfo)
 779                 return 0;
 780         md_unregister_thread(&cinfo->recovery_thread);
 781         md_unregister_thread(&cinfo->recv_thread);
 782         lockres_free(cinfo->message_lockres);
 783         lockres_free(cinfo->token_lockres);
 784         lockres_free(cinfo->ack_lockres);
 785         lockres_free(cinfo->no_new_dev_lockres);
 786         lockres_free(cinfo->bitmap_lockres);
 787         dlm_release_lockspace(cinfo->lockspace, 2);
 788         return 0;
 789 }
 790
 791 /* slot_number(): Returns the MD slot number to use
 792  * DLM starts the slot numbers from 1, wheras cluster-md
 793  * wants the number to be from zero, so we deduct one
 794  */
 795 static int slot_number(struct mddev *mddev)
 796 {
 797         struct md_cluster_info *cinfo = mddev->cluster_info;
 798
 799         return cinfo->slot_number - 1;
 800 }
 801
 802 static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 803 {
 804         struct md_cluster_info *cinfo = mddev->cluster_info;
 805
 806         add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
 807         /* Re-acquire the lock to refresh LVB */
 808         dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
 809 }
 810
 811 static int metadata_update_start(struct mddev *mddev)
 812 {
 813         return lock_comm(mddev->cluster_info);
 814 }
 815
 816 static int metadata_update_finish(struct mddev *mddev)
 817 {
 818         struct md_cluster_info *cinfo = mddev->cluster_info;
 819         struct cluster_msg cmsg;
 820         int ret;
 821
 822         memset(&cmsg, 0, sizeof(cmsg));
 823         cmsg.type = cpu_to_le32(METADATA_UPDATED);
 824         ret = __sendmsg(cinfo, &cmsg);
 825         unlock_comm(cinfo);
 826         return ret;
 827 }
 828
 829 static int metadata_update_cancel(struct mddev *mddev)
 830 {
 831         struct md_cluster_info *cinfo = mddev->cluster_info;
 832
 833         return dlm_unlock_sync(cinfo->token_lockres);
 834 }
 835
 836 static int resync_send(struct mddev *mddev, enum msg_type type,
 837                 sector_t lo, sector_t hi)
 838 {
 839         struct md_cluster_info *cinfo = mddev->cluster_info;
 840         struct cluster_msg cmsg;
 841         int slot = cinfo->slot_number - 1;
 842
 843         pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
 844                         (unsigned long long)lo,
 845                         (unsigned long long)hi);
 846         resync_info_update(mddev, lo, hi);
 847         cmsg.type = cpu_to_le32(type);
 848         cmsg.slot = cpu_to_le32(slot);
 849         cmsg.low = cpu_to_le64(lo);
 850         cmsg.high = cpu_to_le64(hi);
 851         return sendmsg(cinfo, &cmsg);
 852 }
 853
 854 static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
 855 {
 856         pr_info("%s:%d\n", __func__, __LINE__);
 857         return resync_send(mddev, RESYNCING, lo, hi);
 858 }
 859
 860 static void resync_finish(struct mddev *mddev)
 861 {
 862         struct md_cluster_info *cinfo = mddev->cluster_info;
 863         struct cluster_msg cmsg;
 864         int slot = cinfo->slot_number - 1;
 865
 866         pr_info("%s:%d\n", __func__, __LINE__);
 867         resync_send(mddev, RESYNCING, 0, 0);
 868         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 869                 cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
 870                 cmsg.slot = cpu_to_le32(slot);
 871                 sendmsg(cinfo, &cmsg);
 872         }
 873 }
 874
 875 static int area_resyncing(struct mddev *mddev, int direction,
 876                 sector_t lo, sector_t hi)
 877 {
 878         struct md_cluster_info *cinfo = mddev->cluster_info;
 879         int ret = 0;
 880         struct suspend_info *s;
 881
 882         if ((direction == READ) &&
 883                 test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
 884                 return 1;
 885
 886         spin_lock_irq(&cinfo->suspend_lock);
 887         if (list_empty(&cinfo->suspend_list))
 888                 goto out;
 889         list_for_each_entry(s, &cinfo->suspend_list, list)
 890                 if (hi > s->lo && lo < s->hi) {
 891                         ret = 1;
 892                         break;
 893                 }
 894 out:
 895         spin_unlock_irq(&cinfo->suspend_lock);
 896         return ret;
 897 }
 898
 899 static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
 900 {
 901         struct md_cluster_info *cinfo = mddev->cluster_info;
 902         struct cluster_msg cmsg;
 903         int ret = 0;
 904         struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
 905         char *uuid = sb->device_uuid;
 906
 907         memset(&cmsg, 0, sizeof(cmsg));
 908         cmsg.type = cpu_to_le32(NEWDISK);
 909         memcpy(cmsg.uuid, uuid, 16);
 910         cmsg.raid_slot = rdev->desc_nr;
 911         lock_comm(cinfo);
 912         ret = __sendmsg(cinfo, &cmsg);
 913         if (ret)
 914                 return ret;
 915         cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
 916         ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
 917         cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
 918         /* Some node does not "see" the device */
 919         if (ret == -EAGAIN)
 920                 ret = -ENOENT;
 921         else
 922                 dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
 923         return ret;
 924 }
 925
 926 static int add_new_disk_finish(struct mddev *mddev)
 927 {
 928         struct cluster_msg cmsg;
 929         struct md_cluster_info *cinfo = mddev->cluster_info;
 930         int ret;
 931         /* Write sb and inform others */
 932         md_update_sb(mddev, 1);
 933         cmsg.type = METADATA_UPDATED;
 934         ret = __sendmsg(cinfo, &cmsg);
 935         unlock_comm(cinfo);
 936         return ret;
 937 }
 938
 939 static int new_disk_ack(struct mddev *mddev, bool ack)
 940 {
 941         struct md_cluster_info *cinfo = mddev->cluster_info;
 942
 943         if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
 944                 pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
 945                 return -EINVAL;
 946         }
 947
 948         if (ack)
 949                 dlm_unlock_sync(cinfo->no_new_dev_lockres);
 950         complete(&cinfo->newdisk_completion);
 951         return 0;
 952 }
 953
 954 static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 955 {
 956         struct cluster_msg cmsg;
 957         struct md_cluster_info *cinfo = mddev->cluster_info;
 958         cmsg.type = REMOVE;
 959         cmsg.raid_slot = rdev->desc_nr;
 960         return __sendmsg(cinfo, &cmsg);
 961 }
 962
 963 static int gather_bitmaps(struct md_rdev *rdev)
 964 {
 965         int sn, err;
 966         sector_t lo, hi;
 967         struct cluster_msg cmsg;
 968         struct mddev *mddev = rdev->mddev;
 969         struct md_cluster_info *cinfo = mddev->cluster_info;
 970
 971         cmsg.type = RE_ADD;
 972         cmsg.raid_slot = rdev->desc_nr;
 973         err = sendmsg(cinfo, &cmsg);
 974         if (err)
 975                 goto out;
 976
 977         for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
 978                 if (sn == (cinfo->slot_number - 1))
 979                         continue;
 980                 err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
 981                 if (err) {
 982                         pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
 983                         goto out;
 984                 }
 985                 if ((hi > 0) && (lo < mddev->recovery_cp))
 986                         mddev->recovery_cp = lo;
 987         }
 988 out:
 989         return err;
 990 }
 991
 992 static struct md_cluster_operations cluster_ops = {
 993         .join   = join,
 994         .leave  = leave,
 995         .slot_number = slot_number,
 996         .resync_info_update = resync_info_update,
 997         .resync_start = resync_start,
 998         .resync_finish = resync_finish,
 999         .metadata_update_start = metadata_update_start,
1000         .metadata_update_finish = metadata_update_finish,
1001         .metadata_update_cancel = metadata_update_cancel,
1002         .area_resyncing = area_resyncing,
1003         .add_new_disk_start = add_new_disk_start,
1004         .add_new_disk_finish = add_new_disk_finish,
1005         .new_disk_ack = new_disk_ack,
1006         .remove_disk = remove_disk,
1007         .gather_bitmaps = gather_bitmaps,
1008 };
1009
1010 static int __init cluster_init(void)
1011 {
1012         pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
1013         pr_info("Registering Cluster MD functions\n");
1014         register_md_cluster_operations(&cluster_ops, THIS_MODULE);
1015         return 0;
1016 }
1017
/*
 * cluster_exit(): Module exit point; calls
 * unregister_md_cluster_operations().
 */
static void cluster_exit(void)
{
        unregister_md_cluster_operations();
        return;
}
1022
1023 module_init(cluster_init);
1024 module_exit(cluster_exit);
1025 MODULE_LICENSE("GPL");
1026 MODULE_DESCRIPTION("Clustering support for MD");