drivers/md/dm-mpath.c
/*
 * Copyright (C) 2003 Sistina Software Limited.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include "dm-rq.h"
#include "dm-bio-record.h"
#include "dm-path-selector.h"
#include "dm-uevent.h"

#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>

#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)

/* Path properties */
struct pgpath {
        struct list_head list;

        struct priority_group *pg;      /* Owning PG */
        unsigned fail_count;            /* Cumulative failure count */

        struct dm_path path;
        struct delayed_work activate_path;

        bool is_active:1;               /* Path status */
};

#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)

/*
 * Paths are grouped into Priority Groups and numbered from 1 upwards.
 * Each has a path selector which controls which path gets used.
 */
struct priority_group {
        struct list_head list;

        struct multipath *m;            /* Owning multipath instance */
        struct path_selector ps;

        unsigned pg_num;                /* Reference number */
        unsigned nr_pgpaths;            /* Number of paths in PG */
        struct list_head pgpaths;

        bool bypassed:1;                /* Temporarily bypass this PG? */
};

/* Multipath context */
struct multipath {
        struct list_head list;
        struct dm_target *ti;

        const char *hw_handler_name;
        char *hw_handler_params;

        spinlock_t lock;

        unsigned nr_priority_groups;
        struct list_head priority_groups;

        wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */

        struct pgpath *current_pgpath;
        struct priority_group *current_pg;
        struct priority_group *next_pg; /* Switch to this PG if set */

        unsigned long flags;            /* Multipath state flags */

        unsigned pg_init_retries;       /* Number of times to retry pg_init */
        unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */

        atomic_t nr_valid_paths;        /* Total number of usable paths */
        atomic_t pg_init_in_progress;   /* Only one pg_init allowed at once */
        atomic_t pg_init_count;         /* Number of times pg_init called */

        enum dm_queue_mode queue_mode;

        struct mutex work_mutex;
        struct work_struct trigger_event;

        struct work_struct process_queued_bios;
        struct bio_list queued_bios;
};

/*
 * Context information attached to each io we process.
 */
struct dm_mpath_io {
        struct pgpath *pgpath;
        size_t nr_bytes;
};

typedef int (*action_fn) (struct pgpath *pgpath);

static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
static void activate_or_offline_path(struct pgpath *pgpath);
static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);

/*-----------------------------------------------
 * Multipath state flags.
 *-----------------------------------------------*/

#define MPATHF_QUEUE_IO 0                       /* Must we queue all I/O? */
#define MPATHF_QUEUE_IF_NO_PATH 1               /* Queue I/O if last path fails? */
#define MPATHF_SAVED_QUEUE_IF_NO_PATH 2         /* Saved state during suspension */
#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3     /* If there's already a hw_handler present, don't change it. */
#define MPATHF_PG_INIT_DISABLED 4               /* pg_init is not currently allowed */
#define MPATHF_PG_INIT_REQUIRED 5               /* pg_init needs calling? */
#define MPATHF_PG_INIT_DELAY_RETRY 6            /* Delay pg_init retry? */

/*-----------------------------------------------
 * Allocation routines
 *-----------------------------------------------*/

static struct pgpath *alloc_pgpath(void)
{
        struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);

        if (pgpath) {
                pgpath->is_active = true;
                INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work);
        }

        return pgpath;
}

static void free_pgpath(struct pgpath *pgpath)
{
        kfree(pgpath);
}

static struct priority_group *alloc_priority_group(void)
{
        struct priority_group *pg;

        pg = kzalloc(sizeof(*pg), GFP_KERNEL);

        if (pg)
                INIT_LIST_HEAD(&pg->pgpaths);

        return pg;
}

static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
{
        struct pgpath *pgpath, *tmp;

        list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
                list_del(&pgpath->list);
                dm_put_device(ti, pgpath->path.dev);
                free_pgpath(pgpath);
        }
}

static void free_priority_group(struct priority_group *pg,
                                struct dm_target *ti)
{
        struct path_selector *ps = &pg->ps;

        if (ps->type) {
                ps->type->destroy(ps);
                dm_put_path_selector(ps->type);
        }

        free_pgpaths(&pg->pgpaths, ti);
        kfree(pg);
}

static struct multipath *alloc_multipath(struct dm_target *ti)
{
        struct multipath *m;

        m = kzalloc(sizeof(*m), GFP_KERNEL);
        if (m) {
                INIT_LIST_HEAD(&m->priority_groups);
                spin_lock_init(&m->lock);
                set_bit(MPATHF_QUEUE_IO, &m->flags);
                atomic_set(&m->nr_valid_paths, 0);
                atomic_set(&m->pg_init_in_progress, 0);
                atomic_set(&m->pg_init_count, 0);
                m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
                INIT_WORK(&m->trigger_event, trigger_event);
                init_waitqueue_head(&m->pg_init_wait);
                mutex_init(&m->work_mutex);

                m->queue_mode = DM_TYPE_NONE;

                m->ti = ti;
                ti->private = m;
        }

        return m;
}

static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
{
        if (m->queue_mode == DM_TYPE_NONE) {
                /*
                 * Default to request-based.
                 */
                if (dm_use_blk_mq(dm_table_get_md(ti->table)))
                        m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
                else
                        m->queue_mode = DM_TYPE_REQUEST_BASED;
        } else if (m->queue_mode == DM_TYPE_BIO_BASED) {
                INIT_WORK(&m->process_queued_bios, process_queued_bios);
                /*
                 * bio-based doesn't support any direct scsi_dh management;
                 * it just discovers if a scsi_dh is attached.
                 */
                set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
        }

        dm_table_set_type(ti->table, m->queue_mode);

        return 0;
}

static void free_multipath(struct multipath *m)
{
        struct priority_group *pg, *tmp;

        list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
                list_del(&pg->list);
                free_priority_group(pg, m->ti);
        }

        kfree(m->hw_handler_name);
        kfree(m->hw_handler_params);
        kfree(m);
}

static struct dm_mpath_io *get_mpio(union map_info *info)
{
        return info->ptr;
}

static size_t multipath_per_bio_data_size(void)
{
        return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
}

static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
{
        return dm_per_bio_data(bio, multipath_per_bio_data_size());
}

static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio)
{
        /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
        struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
        void *bio_details = mpio + 1;

        return bio_details;
}

static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p,
                                        struct dm_bio_details **bio_details_p)
{
        struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
        struct dm_bio_details *bio_details = get_bio_details_from_bio(bio);

        memset(mpio, 0, sizeof(*mpio));
        memset(bio_details, 0, sizeof(*bio_details));
        dm_bio_record(bio_details, bio);

        if (mpio_p)
                *mpio_p = mpio;
        if (bio_details_p)
                *bio_details_p = bio_details;
}

/*-----------------------------------------------
 * Path selection
 *-----------------------------------------------*/

static int __pg_init_all_paths(struct multipath *m)
{
        struct pgpath *pgpath;
        unsigned long pg_init_delay = 0;

        lockdep_assert_held(&m->lock);

        if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
                return 0;

        atomic_inc(&m->pg_init_count);
        clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);

        /* Check here to reset pg_init_required */
        if (!m->current_pg)
                return 0;

        if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
                pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
                                                 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
        list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
                /* Skip failed paths */
                if (!pgpath->is_active)
                        continue;
                if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
                                       pg_init_delay))
                        atomic_inc(&m->pg_init_in_progress);
        }
        return atomic_read(&m->pg_init_in_progress);
}

static int pg_init_all_paths(struct multipath *m)
{
        int ret;
        unsigned long flags;

        spin_lock_irqsave(&m->lock, flags);
        ret = __pg_init_all_paths(m);
        spin_unlock_irqrestore(&m->lock, flags);

        return ret;
}

static void __switch_pg(struct multipath *m, struct priority_group *pg)
{
        m->current_pg = pg;

        /* Must we initialise the PG first, and queue I/O till it's ready? */
        if (m->hw_handler_name) {
                set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
                set_bit(MPATHF_QUEUE_IO, &m->flags);
        } else {
                clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
                clear_bit(MPATHF_QUEUE_IO, &m->flags);
        }

        atomic_set(&m->pg_init_count, 0);
}

static struct pgpath *choose_path_in_pg(struct multipath *m,
                                        struct priority_group *pg,
                                        size_t nr_bytes)
{
        unsigned long flags;
        struct dm_path *path;
        struct pgpath *pgpath;

        path = pg->ps.type->select_path(&pg->ps, nr_bytes);
        if (!path)
                return ERR_PTR(-ENXIO);

        pgpath = path_to_pgpath(path);

        if (unlikely(lockless_dereference(m->current_pg) != pg)) {
                /* Only update current_pgpath if pg changed */
                spin_lock_irqsave(&m->lock, flags);
                m->current_pgpath = pgpath;
                __switch_pg(m, pg);
                spin_unlock_irqrestore(&m->lock, flags);
        }

        return pgpath;
}

static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
{
        unsigned long flags;
        struct priority_group *pg;
        struct pgpath *pgpath;
        unsigned bypassed = 1;

        if (!atomic_read(&m->nr_valid_paths)) {
                clear_bit(MPATHF_QUEUE_IO, &m->flags);
                goto failed;
        }

        /* Were we instructed to switch PG? */
        if (lockless_dereference(m->next_pg)) {
                spin_lock_irqsave(&m->lock, flags);
                pg = m->next_pg;
                if (!pg) {
                        spin_unlock_irqrestore(&m->lock, flags);
                        goto check_current_pg;
                }
                m->next_pg = NULL;
                spin_unlock_irqrestore(&m->lock, flags);
                pgpath = choose_path_in_pg(m, pg, nr_bytes);
                if (!IS_ERR_OR_NULL(pgpath))
                        return pgpath;
        }

        /* Don't change PG until it has no remaining paths */
check_current_pg:
        pg = lockless_dereference(m->current_pg);
        if (pg) {
                pgpath = choose_path_in_pg(m, pg, nr_bytes);
                if (!IS_ERR_OR_NULL(pgpath))
                        return pgpath;
        }

        /*
         * Loop through priority groups until we find a valid path.
         * First time we skip PGs marked 'bypassed'.
         * Second time we only try the ones we skipped, but set
         * pg_init_delay_retry so we do not hammer controllers.
         */
        do {
                list_for_each_entry(pg, &m->priority_groups, list) {
                        if (pg->bypassed == !!bypassed)
                                continue;
                        pgpath = choose_path_in_pg(m, pg, nr_bytes);
                        if (!IS_ERR_OR_NULL(pgpath)) {
                                if (!bypassed)
                                        set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
                                return pgpath;
                        }
                }
        } while (bypassed--);

failed:
        spin_lock_irqsave(&m->lock, flags);
        m->current_pgpath = NULL;
        m->current_pg = NULL;
        spin_unlock_irqrestore(&m->lock, flags);

        return NULL;
}

/*
 * Check whether bios must be queued in the device-mapper core rather
 * than here in the target.
 *
 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
 * same value then we are not between multipath_presuspend()
 * and multipath_resume() calls and we have no need to check
 * for the DMF_NOFLUSH_SUSPENDING flag.
 */
static bool __must_push_back(struct multipath *m)
{
        return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
                 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
                dm_noflush_suspending(m->ti));
}

static bool must_push_back_rq(struct multipath *m)
{
        bool r;
        unsigned long flags;

        spin_lock_irqsave(&m->lock, flags);
        r = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
             __must_push_back(m));
        spin_unlock_irqrestore(&m->lock, flags);

        return r;
}

static bool must_push_back_bio(struct multipath *m)
{
        bool r;
        unsigned long flags;

        spin_lock_irqsave(&m->lock, flags);
        r = __must_push_back(m);
        spin_unlock_irqrestore(&m->lock, flags);

        return r;
}

/*
 * Map cloned requests (request-based multipath)
 */
static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
                                   union map_info *map_context,
                                   struct request **__clone)
{
        struct multipath *m = ti->private;
        size_t nr_bytes = blk_rq_bytes(rq);
        struct pgpath *pgpath;
        struct block_device *bdev;
        struct dm_mpath_io *mpio = get_mpio(map_context);
        struct request_queue *q;
        struct request *clone;

        /* Do we need to select a new pgpath? */
        pgpath = lockless_dereference(m->current_pgpath);
        if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
                pgpath = choose_pgpath(m, nr_bytes);

        if (!pgpath) {
                if (must_push_back_rq(m))
                        return DM_MAPIO_DELAY_REQUEUE;
                return -EIO;    /* Failed */
        } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
                   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
                if (pg_init_all_paths(m))
                        return DM_MAPIO_DELAY_REQUEUE;
                return DM_MAPIO_REQUEUE;
        }

        memset(mpio, 0, sizeof(*mpio));
        mpio->pgpath = pgpath;
        mpio->nr_bytes = nr_bytes;

        bdev = pgpath->path.dev->bdev;
        q = bdev_get_queue(bdev);
        clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
        if (IS_ERR(clone)) {
                /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
                bool queue_dying = blk_queue_dying(q);
                DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing",
                            PTR_ERR(clone), queue_dying ? " (path offline)" : "");
                if (queue_dying) {
                        atomic_inc(&m->pg_init_in_progress);
                        activate_or_offline_path(pgpath);
                        return DM_MAPIO_REQUEUE;
                }
                return DM_MAPIO_DELAY_REQUEUE;
        }
        clone->bio = clone->biotail = NULL;
        clone->rq_disk = bdev->bd_disk;
        clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
        *__clone = clone;

        if (pgpath->pg->ps.type->start_io)
                pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
                                              &pgpath->path,
                                              nr_bytes);
        return DM_MAPIO_REMAPPED;
}

static void multipath_release_clone(struct request *clone)
{
        blk_put_request(clone);
}

/*
 * Map cloned bios (bio-based multipath)
 */
static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio)
{
        size_t nr_bytes = bio->bi_iter.bi_size;
        struct pgpath *pgpath;
        unsigned long flags;
        bool queue_io;

        /* Do we need to select a new pgpath? */
        pgpath = lockless_dereference(m->current_pgpath);
        queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
        if (!pgpath || !queue_io)
                pgpath = choose_pgpath(m, nr_bytes);

        if ((pgpath && queue_io) ||
            (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
                /* Queue for the daemon to resubmit */
                spin_lock_irqsave(&m->lock, flags);
                bio_list_add(&m->queued_bios, bio);
                spin_unlock_irqrestore(&m->lock, flags);
                /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
                if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
                        pg_init_all_paths(m);
                else if (!queue_io)
                        queue_work(kmultipathd, &m->process_queued_bios);
                return DM_MAPIO_SUBMITTED;
        }

        if (!pgpath) {
                if (!must_push_back_bio(m))
                        return -EIO;
                return DM_MAPIO_REQUEUE;
        }

        mpio->pgpath = pgpath;
        mpio->nr_bytes = nr_bytes;

        bio->bi_error = 0;
        bio->bi_bdev = pgpath->path.dev->bdev;
        bio->bi_opf |= REQ_FAILFAST_TRANSPORT;

        if (pgpath->pg->ps.type->start_io)
                pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
                                              &pgpath->path,
                                              nr_bytes);
        return DM_MAPIO_REMAPPED;
}

static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
{
        struct multipath *m = ti->private;
        struct dm_mpath_io *mpio = NULL;

        multipath_init_per_bio_data(bio, &mpio, NULL);

        return __multipath_map_bio(m, bio, mpio);
}

static void process_queued_io_list(struct multipath *m)
{
        if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
                dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
        else if (m->queue_mode == DM_TYPE_BIO_BASED)
                queue_work(kmultipathd, &m->process_queued_bios);
}

static void process_queued_bios(struct work_struct *work)
{
        int r;
        unsigned long flags;
        struct bio *bio;
        struct bio_list bios;
        struct blk_plug plug;
        struct multipath *m =
                container_of(work, struct multipath, process_queued_bios);

        bio_list_init(&bios);

        spin_lock_irqsave(&m->lock, flags);

        if (bio_list_empty(&m->queued_bios)) {
                spin_unlock_irqrestore(&m->lock, flags);
                return;
        }

        bio_list_merge(&bios, &m->queued_bios);
        bio_list_init(&m->queued_bios);

        spin_unlock_irqrestore(&m->lock, flags);

        blk_start_plug(&plug);
        while ((bio = bio_list_pop(&bios))) {
                r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio));
                if (r < 0 || r == DM_MAPIO_REQUEUE) {
                        bio->bi_error = r;
                        bio_endio(bio);
                } else if (r == DM_MAPIO_REMAPPED)
                        generic_make_request(bio);
        }
        blk_finish_plug(&plug);
}

/*
 * If we run out of usable paths, should we queue I/O or error it?
 */
static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
                            bool save_old_value)
{
        unsigned long flags;

        spin_lock_irqsave(&m->lock, flags);

        if (save_old_value) {
                if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
                        set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
                else
                        clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
        } else {
                if (queue_if_no_path)
                        set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
                else
                        clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
        }
        if (queue_if_no_path)
                set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
        else
                clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);

        spin_unlock_irqrestore(&m->lock, flags);

        if (!queue_if_no_path) {
                dm_table_run_md_queue_async(m->ti->table);
                process_queued_io_list(m);
        }

        return 0;
}

/*
 * An event is triggered whenever a path is taken out of use.
 * Includes path failure and PG bypass.
 */
static void trigger_event(struct work_struct *work)
{
        struct multipath *m =
                container_of(work, struct multipath, trigger_event);

        dm_table_event(m->ti->table);
}

/*-----------------------------------------------------------------
 * Constructor/argument parsing:
 * <#multipath feature args> [<arg>]*
 * <#hw_handler args> [hw_handler [<arg>]*]
 * <#priority groups>
 * <initial priority group>
 *     [<selector> <#selector args> [<arg>]*
 *      <#paths> <#per-path selector args>
 *         [<path> [<arg>]* ]+ ]+
 *---------------------------------------------------------------*/
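
/*
 * Illustrative example only (not part of the original source): under the
 * grammar above, a minimal two-path, single-PG target using the
 * round-robin selector might be built from the argument string
 *
 *     0 0 1 1 round-robin 0 2 1 8:16 1000 8:32 1000
 *
 * i.e. no feature args, no hw_handler args, 1 priority group, initial PG 1,
 * selector "round-robin" with 0 selector args, and 2 paths with 1 per-path
 * selector arg each.  The devices 8:16/8:32 and the repeat count 1000 are
 * placeholders, not values taken from this file.
 */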
static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
                               struct dm_target *ti)
{
        int r;
        struct path_selector_type *pst;
        unsigned ps_argc;

        static struct dm_arg _args[] = {
                {0, 1024, "invalid number of path selector args"},
        };

        pst = dm_get_path_selector(dm_shift_arg(as));
        if (!pst) {
                ti->error = "unknown path selector type";
                return -EINVAL;
        }

        r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
        if (r) {
                dm_put_path_selector(pst);
                return -EINVAL;
        }

        r = pst->create(&pg->ps, ps_argc, as->argv);
        if (r) {
                dm_put_path_selector(pst);
                ti->error = "path selector constructor failed";
                return r;
        }

        pg->ps.type = pst;
        dm_consume_args(as, ps_argc);

        return 0;
}

static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
                               struct dm_target *ti)
{
        int r;
        struct pgpath *p;
        struct multipath *m = ti->private;
        struct request_queue *q = NULL;
        const char *attached_handler_name;

        /* we need at least a path arg */
        if (as->argc < 1) {
                ti->error = "no device given";
                return ERR_PTR(-EINVAL);
        }

        p = alloc_pgpath();
        if (!p)
                return ERR_PTR(-ENOMEM);

        r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
                          &p->path.dev);
        if (r) {
                ti->error = "error getting device";
                goto bad;
        }

        if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name)
                q = bdev_get_queue(p->path.dev->bdev);

        if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
retain:
                attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
                if (attached_handler_name) {
                        /*
                         * Clear any hw_handler_params associated with a
                         * handler that isn't already attached.
                         */
                        if (m->hw_handler_name && strcmp(attached_handler_name, m->hw_handler_name)) {
                                kfree(m->hw_handler_params);
                                m->hw_handler_params = NULL;
                        }

                        /*
                         * Reset hw_handler_name to match the attached handler
                         *
                         * NB. This modifies the table line to show the actual
                         * handler instead of the original table passed in.
                         */
                        kfree(m->hw_handler_name);
                        m->hw_handler_name = attached_handler_name;
                }
        }

        if (m->hw_handler_name) {
                r = scsi_dh_attach(q, m->hw_handler_name);
                if (r == -EBUSY) {
                        char b[BDEVNAME_SIZE];

                        printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
                                bdevname(p->path.dev->bdev, b));
                        goto retain;
                }
                if (r < 0) {
                        ti->error = "error attaching hardware handler";
                        dm_put_device(ti, p->path.dev);
                        goto bad;
                }

                if (m->hw_handler_params) {
                        r = scsi_dh_set_params(q, m->hw_handler_params);
                        if (r < 0) {
                                ti->error = "unable to set hardware "
                                                        "handler parameters";
                                dm_put_device(ti, p->path.dev);
                                goto bad;
                        }
                }
        }

        r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
        if (r) {
                dm_put_device(ti, p->path.dev);
                goto bad;
        }

        return p;

 bad:
        free_pgpath(p);
        return ERR_PTR(r);
}

static struct priority_group *parse_priority_group(struct dm_arg_set *as,
                                                   struct multipath *m)
{
        static struct dm_arg _args[] = {
                {1, 1024, "invalid number of paths"},
                {0, 1024, "invalid number of selector args"}
        };

        int r;
        unsigned i, nr_selector_args, nr_args;
        struct priority_group *pg;
        struct dm_target *ti = m->ti;

        if (as->argc < 2) {
                as->argc = 0;
                ti->error = "not enough priority group arguments";
                return ERR_PTR(-EINVAL);
        }

        pg = alloc_priority_group();
        if (!pg) {
                ti->error = "couldn't allocate priority group";
                return ERR_PTR(-ENOMEM);
        }
        pg->m = m;

        r = parse_path_selector(as, pg, ti);
        if (r)
                goto bad;

        /*
         * read the paths
         */
        r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
        if (r)
                goto bad;

        r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
        if (r)
                goto bad;

        nr_args = 1 + nr_selector_args;
        for (i = 0; i < pg->nr_pgpaths; i++) {
                struct pgpath *pgpath;
                struct dm_arg_set path_args;

                if (as->argc < nr_args) {
                        ti->error = "not enough path parameters";
                        r = -EINVAL;
                        goto bad;
                }

                path_args.argc = nr_args;
                path_args.argv = as->argv;

                pgpath = parse_path(&path_args, &pg->ps, ti);
                if (IS_ERR(pgpath)) {
                        r = PTR_ERR(pgpath);
                        goto bad;
                }

                pgpath->pg = pg;
                list_add_tail(&pgpath->list, &pg->pgpaths);
                dm_consume_args(as, nr_args);
        }

        return pg;

 bad:
        free_priority_group(pg, ti);
        return ERR_PTR(r);
}

static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
{
        unsigned hw_argc;
        int ret;
        struct dm_target *ti = m->ti;

        static struct dm_arg _args[] = {
                {0, 1024, "invalid number of hardware handler args"},
        };

        if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
                return -EINVAL;

        if (!hw_argc)
                return 0;

        if (m->queue_mode == DM_TYPE_BIO_BASED) {
                dm_consume_args(as, hw_argc);
                DMERR("bio-based multipath doesn't allow hardware handler args");
                return 0;
        }

        m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
        if (!m->hw_handler_name)
                return -EINVAL;

        if (hw_argc > 1) {
                char *p;
                int i, j, len = 4;

                for (i = 0; i <= hw_argc - 2; i++)
                        len += strlen(as->argv[i]) + 1;
                p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
                if (!p) {
                        ti->error = "memory allocation failed";
                        ret = -ENOMEM;
                        goto fail;
                }
                j = sprintf(p, "%d", hw_argc - 1);
                for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
                        j = sprintf(p, "%s", as->argv[i]);
        }
        dm_consume_args(as, hw_argc - 1);

        return 0;
fail:
        kfree(m->hw_handler_name);
        m->hw_handler_name = NULL;
        return ret;
}

static int parse_features(struct dm_arg_set *as, struct multipath *m)
{
        int r;
        unsigned argc;
        struct dm_target *ti = m->ti;
        const char *arg_name;

        static struct dm_arg _args[] = {
                {0, 8, "invalid number of feature args"},
                {1, 50, "pg_init_retries must be between 1 and 50"},
                {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
        };

        r = dm_read_arg_group(_args, as, &argc, &ti->error);
        if (r)
                return -EINVAL;

        if (!argc)
                return 0;

        do {
                arg_name = dm_shift_arg(as);
                argc--;

                if (!strcasecmp(arg_name, "queue_if_no_path")) {
                        r = queue_if_no_path(m, true, false);
                        continue;
                }

                if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
                        set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
                        continue;
                }

                if (!strcasecmp(arg_name, "pg_init_retries") &&
                    (argc >= 1)) {
                        r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
                        argc--;
                        continue;
                }

                if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
                    (argc >= 1)) {
                        r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
                        argc--;
                        continue;
                }

                if (!strcasecmp(arg_name, "queue_mode") &&
                    (argc >= 1)) {
                        const char *queue_mode_name = dm_shift_arg(as);

                        if (!strcasecmp(queue_mode_name, "bio"))
                                m->queue_mode = DM_TYPE_BIO_BASED;
                        else if (!strcasecmp(queue_mode_name, "rq"))
                                m->queue_mode = DM_TYPE_REQUEST_BASED;
                        else if (!strcasecmp(queue_mode_name, "mq"))
                                m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
                        else {
                                ti->error = "Unknown 'queue_mode' requested";
                                r = -EINVAL;
                        }
                        argc--;
                        continue;
                }

                ti->error = "Unrecognised multipath feature request";
                r = -EINVAL;
        } while (argc && !r);

        return r;
}

static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
        /* target arguments */
        static struct dm_arg _args[] = {
                {0, 1024, "invalid number of priority groups"},
                {0, 1024, "invalid initial priority group number"},
        };

        int r;
        struct multipath *m;
        struct dm_arg_set as;
        unsigned pg_count = 0;
        unsigned next_pg_num;

        as.argc = argc;
        as.argv = argv;

        m = alloc_multipath(ti);
        if (!m) {
                ti->error = "can't allocate multipath";
                return -EINVAL;
        }

        r = parse_features(&as, m);
        if (r)
                goto bad;

        r = alloc_multipath_stage2(ti, m);
        if (r)
                goto bad;

        r = parse_hw_handler(&as, m);
        if (r)
                goto bad;

        r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
        if (r)
                goto bad;

        r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
        if (r)
                goto bad;

        if ((!m->nr_priority_groups && next_pg_num) ||
            (m->nr_priority_groups && !next_pg_num)) {
                ti->error = "invalid initial priority group";
                r = -EINVAL;
                goto bad;
        }

        /* parse the priority groups */
        while (as.argc) {
                struct priority_group *pg;
                unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);

                pg = parse_priority_group(&as, m);
                if (IS_ERR(pg)) {
                        r = PTR_ERR(pg);
                        goto bad;
                }

                nr_valid_paths += pg->nr_pgpaths;
                atomic_set(&m->nr_valid_paths, nr_valid_paths);

                list_add_tail(&pg->list, &m->priority_groups);
                pg_count++;
                pg->pg_num = pg_count;
                if (!--next_pg_num)
                        m->next_pg = pg;
        }

        if (pg_count != m->nr_priority_groups) {
                ti->error = "priority group count mismatch";
                r = -EINVAL;
                goto bad;
        }

        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->num_write_same_bios = 1;
        if (m->queue_mode == DM_TYPE_BIO_BASED)
                ti->per_io_data_size = multipath_per_bio_data_size();
        else
                ti->per_io_data_size = sizeof(struct dm_mpath_io);

        return 0;

 bad:
        free_multipath(m);
        return r;
}

static void multipath_wait_for_pg_init_completion(struct multipath *m)
{
        DEFINE_WAIT(wait);

        while (1) {
                prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE);

                if (!atomic_read(&m->pg_init_in_progress))
                        break;

                io_schedule();
        }
        finish_wait(&m->pg_init_wait, &wait);
}

static void flush_multipath_work(struct multipath *m)
{
        set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
        smp_mb__after_atomic();

        flush_workqueue(kmpath_handlerd);
        multipath_wait_for_pg_init_completion(m);
        flush_workqueue(kmultipathd);
        flush_work(&m->trigger_event);

        clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
        smp_mb__after_atomic();
}

static void multipath_dtr(struct dm_target *ti)
{
        struct multipath *m = ti->private;

        flush_multipath_work(m);
        free_multipath(m);
}

/*
 * Take a path out of use.
 */
static int fail_path(struct pgpath *pgpath)
{
        unsigned long flags;
        struct multipath *m = pgpath->pg->m;

        spin_lock_irqsave(&m->lock, flags);

        if (!pgpath->is_active)
                goto out;

        DMWARN("Failing path %s.", pgpath->path.dev->name);

        pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
        pgpath->is_active = false;
        pgpath->fail_count++;

        atomic_dec(&m->nr_valid_paths);

        if (pgpath == m->current_pgpath)
                m->current_pgpath = NULL;

        dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
                       pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));

        schedule_work(&m->trigger_event);

out:
        spin_unlock_irqrestore(&m->lock, flags);

        return 0;
}

/*
 * Reinstate a previously-failed path
 */
static int reinstate_path(struct pgpath *pgpath)
{
        int r = 0, run_queue = 0;
        unsigned long flags;
        struct multipath *m = pgpath->pg->m;
        unsigned nr_valid_paths;

        spin_lock_irqsave(&m->lock, flags);

        if (pgpath->is_active)
                goto out;

        DMWARN("Reinstating path %s.", pgpath->path.dev->name);

        r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
        if (r)
                goto out;

        pgpath->is_active = true;

        nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
        if (nr_valid_paths == 1) {
                m->current_pgpath = NULL;
                run_queue = 1;
        } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
                if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
                        atomic_inc(&m->pg_init_in_progress);
        }

        dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
                       pgpath->path.dev->name, nr_valid_paths);

        schedule_work(&m->trigger_event);

out:
        spin_unlock_irqrestore(&m->lock, flags);
        if (run_queue) {
                dm_table_run_md_queue_async(m->ti->table);
                process_queued_io_list(m);
        }

        return r;
}

/*
 * Fail or reinstate all paths that match the provided struct dm_dev.
 */
static int action_dev(struct multipath *m, struct dm_dev *dev,
                      action_fn action)
{
        int r = -EINVAL;
        struct pgpath *pgpath;
        struct priority_group *pg;

        list_for_each_entry(pg, &m->priority_groups, list) {
                list_for_each_entry(pgpath, &pg->pgpaths, list) {
                        if (pgpath->path.dev == dev)
                                r = action(pgpath);
                }
        }

        return r;
}

/*
 * Temporarily try to avoid having to use the specified PG
 */
static void bypass_pg(struct multipath *m, struct priority_group *pg,
                      bool bypassed)
{
        unsigned long flags;

        spin_lock_irqsave(&m->lock, flags);

        pg->bypassed = bypassed;
        m->current_pgpath = NULL;
        m->current_pg = NULL;

        spin_unlock_irqrestore(&m->lock, flags);

        schedule_work(&m->trigger_event);
}

/*
 * Switch to using the specified PG from the next I/O that gets mapped
 */
static int switch_pg_num(struct multipath *m, const char *pgstr)
{
        struct priority_group *pg;
        unsigned pgnum;
        unsigned long flags;
        char dummy;

        if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
            !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
                DMWARN("invalid PG number supplied to switch_pg_num");
                return -EINVAL;
        }

        spin_lock_irqsave(&m->lock, flags);
        list_for_each_entry(pg, &m->priority_groups, list) {
                pg->bypassed = false;
                if (--pgnum)
                        continue;

                m->current_pgpath = NULL;
                m->current_pg = NULL;
                m->next_pg = pg;
        }
        spin_unlock_irqrestore(&m->lock, flags);

        schedule_work(&m->trigger_event);
        return 0;
}

/*
 * Set/clear bypassed status of a PG.
 * PGs are numbered upwards from 1 in the order they were declared.
 */
static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
{
        struct priority_group *pg;
        unsigned pgnum;
        char dummy;

        if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
            !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
                DMWARN("invalid PG number supplied to bypass_pg");
                return -EINVAL;
        }

        list_for_each_entry(pg, &m->priority_groups, list) {
                if (!--pgnum)
                        break;
        }

        bypass_pg(m, pg, bypassed);
        return 0;
}

/*
 * Should we retry pg_init immediately?
 */
static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
{
        unsigned long flags;
        bool limit_reached = false;

        spin_lock_irqsave(&m->lock, flags);

        if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
            !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
                set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
        else
                limit_reached = true;

        spin_unlock_irqrestore(&m->lock, flags);

        return limit_reached;
}

static void pg_init_done(void *data, int errors)
{
        struct pgpath *pgpath = data;
        struct priority_group *pg = pgpath->pg;
        struct multipath *m = pg->m;
        unsigned long flags;
        bool delay_retry = false;

        /* device or driver problems */
        switch (errors) {
        case SCSI_DH_OK:
                break;
        case SCSI_DH_NOSYS:
                if (!m->hw_handler_name) {
                        errors = 0;
                        break;
                }
                DMERR("Could not failover the device: Handler scsi_dh_%s "
                      "Error %d.", m->hw_handler_name, errors);
                /*
                 * Fail path for now, so we do not ping pong
                 */
                fail_path(pgpath);
                break;
        case SCSI_DH_DEV_TEMP_BUSY:
                /*
                 * Probably doing something like FW upgrade on the
                 * controller so try the other pg.
                 */
                bypass_pg(m, pg, true);
                break;
        case SCSI_DH_RETRY:
                /* Wait before retrying. */
                delay_retry = true;
                /* fall through */
        case SCSI_DH_IMM_RETRY:
        case SCSI_DH_RES_TEMP_UNAVAIL:
                if (pg_init_limit_reached(m, pgpath))
                        fail_path(pgpath);
                errors = 0;
                break;
        case SCSI_DH_DEV_OFFLINED:
        default:
                /*
                 * We probably do not want to fail the path for a device
                 * error, but this is what the old dm did. In future
                 * patches we can do more advanced handling.
                 */
                fail_path(pgpath);
        }

        spin_lock_irqsave(&m->lock, flags);
        if (errors) {
                if (pgpath == m->current_pgpath) {
                        DMERR("Could not failover device. Error %d.", errors);
                        m->current_pgpath = NULL;
                        m->current_pg = NULL;
                }
        } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
                pg->bypassed = false;

        if (atomic_dec_return(&m->pg_init_in_progress) > 0)
                /* Activations of other paths are still ongoing */
                goto out;

        if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
                if (delay_retry)
                        set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
                else
                        clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);

                if (__pg_init_all_paths(m))
                        goto out;
        }
        clear_bit(MPATHF_QUEUE_IO, &m->flags);

        process_queued_io_list(m);

        /*
         * Wake up any thread waiting to suspend.
         */
        wake_up(&m->pg_init_wait);

out:
        spin_unlock_irqrestore(&m->lock, flags);
}

static void activate_or_offline_path(struct pgpath *pgpath)
{
        struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);

        if (pgpath->is_active && !blk_queue_dying(q))
                scsi_dh_activate(q, pg_init_done, pgpath);
        else
                pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
}

static void activate_path_work(struct work_struct *work)
{
        struct pgpath *pgpath =
                container_of(work, struct pgpath, activate_path.work);

        activate_or_offline_path(pgpath);
}

static int noretry_error(int error)
{
        switch (error) {
        case -EBADE:
                /*
                 * EBADE signals a reservation conflict.
                 * We shouldn't fail the path here as we can communicate with
                 * the target.  We should fail over to the next path, but in
                 * doing so we might be causing a ping-pong between paths.
                 * So just return the reservation conflict error.
                 */
        case -EOPNOTSUPP:
        case -EREMOTEIO:
        case -EILSEQ:
        case -ENODATA:
        case -ENOSPC:
                return 1;
        }

        /* Anything else could be a path failure, so should be retried */
        return 0;
}

/*
 * end_io handling
 */
static int do_end_io(struct multipath *m, struct request *clone,
                     int error, struct dm_mpath_io *mpio)
{
        /*
         * We don't queue any clone request inside the multipath target
         * during end I/O handling, since those clone requests don't have
         * bio clones.  If we queue them inside the multipath target,
         * we need to make bio clones, which requires memory allocation.
         * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
         *  don't have bio clones.)
         * Instead of queueing the clone request here, we queue the original
         * request into dm core, which will remake a clone request and
         * clone bios for it and resubmit it later.
         */
        int r = DM_ENDIO_REQUEUE;

        if (!error && !clone->errors)
                return 0;       /* I/O complete */

        if (noretry_error(error))
                return error;

        if (mpio->pgpath)
                fail_path(mpio->pgpath);

        if (!atomic_read(&m->nr_valid_paths)) {
                if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
                        if (!must_push_back_rq(m))
                                r = -EIO;
                }
        }

        return r;
}

static int multipath_end_io(struct dm_target *ti, struct request *clone,
                            int error, union map_info *map_context)
{
        struct multipath *m = ti->private;
        struct dm_mpath_io *mpio = get_mpio(map_context);
        struct pgpath *pgpath;
        struct path_selector *ps;
        int r;

        BUG_ON(!mpio);

        r = do_end_io(m, clone, error, mpio);
        pgpath = mpio->pgpath;
        if (pgpath) {
                ps = &pgpath->pg->ps;
                if (ps->type->end_io)
                        ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
        }

        return r;
}

static int do_end_io_bio(struct multipath *m, struct bio *clone,
                         int error, struct dm_mpath_io *mpio)
{
        unsigned long flags;

        if (!error)
                return 0;       /* I/O complete */

        if (noretry_error(error))
                return error;

        if (mpio->pgpath)
                fail_path(mpio->pgpath);

        if (!atomic_read(&m->nr_valid_paths)) {
                if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
                        if (!must_push_back_bio(m))
                                return -EIO;
                        return DM_ENDIO_REQUEUE;
                }
        }

        /* Queue for the daemon to resubmit */
        dm_bio_restore(get_bio_details_from_bio(clone), clone);

        spin_lock_irqsave(&m->lock, flags);
        bio_list_add(&m->queued_bios, clone);
        spin_unlock_irqrestore(&m->lock, flags);
        if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
                queue_work(kmultipathd, &m->process_queued_bios);

        return DM_ENDIO_INCOMPLETE;
}

static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error)
{
        struct multipath *m = ti->private;
        struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
        struct pgpath *pgpath;
        struct path_selector *ps;
        int r;

        BUG_ON(!mpio);

        r = do_end_io_bio(m, clone, error, mpio);
        pgpath = mpio->pgpath;
        if (pgpath) {
                ps = &pgpath->pg->ps;
                if (ps->type->end_io)
                        ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
        }

        return r;
}
1608
1609 /*
1610  * Suspend cannot complete until all the I/O is processed, so if
1611  * the last path fails we must error any remaining I/O.
1612  * Note that if freeze_bdev fails while suspending, the
1613  * queue_if_no_path state is lost - userspace should reset it.
1614  */
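/*
 * Illustrative only (the map name "mpathX" is hypothetical): after such
 * a failed suspend, userspace can restore the policy via the target
 * message interface handled by multipath_message() below, e.g.:
 *
 *     dmsetup message mpathX 0 queue_if_no_path
 */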
1615 static void multipath_presuspend(struct dm_target *ti)
1616 {
1617         struct multipath *m = ti->private;
1618
1619         queue_if_no_path(m, false, true);
1620 }
1621
1622 static void multipath_postsuspend(struct dm_target *ti)
1623 {
1624         struct multipath *m = ti->private;
1625
1626         mutex_lock(&m->work_mutex);
1627         flush_multipath_work(m);
1628         mutex_unlock(&m->work_mutex);
1629 }
1630
1631 /*
1632  * Restore the queue_if_no_path setting.
1633  */
1634 static void multipath_resume(struct dm_target *ti)
1635 {
1636         struct multipath *m = ti->private;
1637         unsigned long flags;
1638
1639         spin_lock_irqsave(&m->lock, flags);
1640         if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags))
1641                 set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
1642         else
1643                 clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
1644         spin_unlock_irqrestore(&m->lock, flags);
1645 }
1646
1647 /*
1648  * Info output has the following format:
1649  * num_multipath_feature_args [multipath_feature_args]*
1650  * num_handler_status_args [handler_status_args]*
1651  * num_groups init_group_number
1652  *            [A|D|E num_ps_status_args [ps_status_args]*
1653  *             num_paths num_selector_args
1654  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1655  *
1656  * Table output has the following format (identical to the constructor string):
1657  * num_feature_args [features_args]*
1658  * num_handler_args hw_handler [hw_handler_args]*
1659  * num_groups init_group_number
1660  *     [selector-name num_ps_args [ps_args]*
1661  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1662  */
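/*
 * Illustrative only (device numbers and sizes are hypothetical): a map
 * with no features, no hardware handler and two round-robin priority
 * groups of two paths each might be reported by "dmsetup table" as
 * (wrapped here for readability):
 *
 *     0 71014400 multipath 0 0 2 1 round-robin 0 2 1 8:16 1000 8:32 1000
 *         round-robin 0 2 1 8:48 1000 8:64 1000
 *
 * i.e. 0 feature args, 0 hw handler args, 2 groups with group 1 tried
 * first; each group names its selector, has 0 group-level selector args,
 * 2 paths and 1 per-path selector arg (the round-robin repeat count).
 */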
1663 static void multipath_status(struct dm_target *ti, status_type_t type,
1664                              unsigned status_flags, char *result, unsigned maxlen)
1665 {
1666         int sz = 0;
1667         unsigned long flags;
1668         struct multipath *m = ti->private;
1669         struct priority_group *pg;
1670         struct pgpath *p;
1671         unsigned pg_num;
1672         char state;
1673
1674         spin_lock_irqsave(&m->lock, flags);
1675
1676         /* Features */
1677         if (type == STATUSTYPE_INFO)
1678                 DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
1679                        atomic_read(&m->pg_init_count));
1680         else {
1681                 DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
1682                               (m->pg_init_retries > 0) * 2 +
1683                               (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1684                               test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
1685                               (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);
1686
1687                 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1688                         DMEMIT("queue_if_no_path ");
1689                 if (m->pg_init_retries)
1690                         DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1691                 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1692                         DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1693                 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
1694                         DMEMIT("retain_attached_hw_handler ");
1695                 if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
1696                         switch(m->queue_mode) {
1697                         case DM_TYPE_BIO_BASED:
1698                                 DMEMIT("queue_mode bio ");
1699                                 break;
1700                         case DM_TYPE_MQ_REQUEST_BASED:
1701                                 DMEMIT("queue_mode mq ");
1702                                 break;
1703                         default:
1704                                 WARN_ON_ONCE(true);
1705                                 break;
1706                         }
1707                 }
1708         }
1709
1710         if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1711                 DMEMIT("0 ");
1712         else
1713                 DMEMIT("1 %s ", m->hw_handler_name);
1714
1715         DMEMIT("%u ", m->nr_priority_groups);
1716
1717         if (m->next_pg)
1718                 pg_num = m->next_pg->pg_num;
1719         else if (m->current_pg)
1720                 pg_num = m->current_pg->pg_num;
1721         else
1722                 pg_num = (m->nr_priority_groups ? 1 : 0);
1723
1724         DMEMIT("%u ", pg_num);
1725
1726         switch (type) {
1727         case STATUSTYPE_INFO:
1728                 list_for_each_entry(pg, &m->priority_groups, list) {
1729                         if (pg->bypassed)
1730                                 state = 'D';    /* Disabled */
1731                         else if (pg == m->current_pg)
1732                                 state = 'A';    /* Currently Active */
1733                         else
1734                                 state = 'E';    /* Enabled */
1735
1736                         DMEMIT("%c ", state);
1737
1738                         if (pg->ps.type->status)
1739                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1740                                                           result + sz,
1741                                                           maxlen - sz);
1742                         else
1743                                 DMEMIT("0 ");
1744
1745                         DMEMIT("%u %u ", pg->nr_pgpaths,
1746                                pg->ps.type->info_args);
1747
1748                         list_for_each_entry(p, &pg->pgpaths, list) {
1749                                 DMEMIT("%s %s %u ", p->path.dev->name,
1750                                        p->is_active ? "A" : "F",
1751                                        p->fail_count);
1752                                 if (pg->ps.type->status)
1753                                         sz += pg->ps.type->status(&pg->ps,
1754                                               &p->path, type, result + sz,
1755                                               maxlen - sz);
1756                         }
1757                 }
1758                 break;
1759
1760         case STATUSTYPE_TABLE:
1761                 list_for_each_entry(pg, &m->priority_groups, list) {
1762                         DMEMIT("%s ", pg->ps.type->name);
1763
1764                         if (pg->ps.type->status)
1765                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1766                                                           result + sz,
1767                                                           maxlen - sz);
1768                         else
1769                                 DMEMIT("0 ");
1770
1771                         DMEMIT("%u %u ", pg->nr_pgpaths,
1772                                pg->ps.type->table_args);
1773
1774                         list_for_each_entry(p, &pg->pgpaths, list) {
1775                                 DMEMIT("%s ", p->path.dev->name);
1776                                 if (pg->ps.type->status)
1777                                         sz += pg->ps.type->status(&pg->ps,
1778                                               &p->path, type, result + sz,
1779                                               maxlen - sz);
1780                         }
1781                 }
1782                 break;
1783         }
1784
1785         spin_unlock_irqrestore(&m->lock, flags);
1786 }
1787
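/*
 * The messages accepted below, shown with illustrative "dmsetup message"
 * invocations (the map name "mpathX", group number and path device are
 * hypothetical):
 *
 *     dmsetup message mpathX 0 queue_if_no_path
 *     dmsetup message mpathX 0 fail_if_no_path
 *     dmsetup message mpathX 0 disable_group 2
 *     dmsetup message mpathX 0 enable_group 2
 *     dmsetup message mpathX 0 switch_group 2
 *     dmsetup message mpathX 0 reinstate_path 8:32
 *     dmsetup message mpathX 0 fail_path 8:32
 */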
1788 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1789 {
1790         int r = -EINVAL;
1791         struct dm_dev *dev;
1792         struct multipath *m = ti->private;
1793         action_fn action;
1794
1795         mutex_lock(&m->work_mutex);
1796
1797         if (dm_suspended(ti)) {
1798                 r = -EBUSY;
1799                 goto out;
1800         }
1801
1802         if (argc == 1) {
1803                 if (!strcasecmp(argv[0], "queue_if_no_path")) {
1804                         r = queue_if_no_path(m, true, false);
1805                         goto out;
1806                 } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1807                         r = queue_if_no_path(m, false, false);
1808                         goto out;
1809                 }
1810         }
1811
1812         if (argc != 2) {
1813                 DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
1814                 goto out;
1815         }
1816
1817         if (!strcasecmp(argv[0], "disable_group")) {
1818                 r = bypass_pg_num(m, argv[1], true);
1819                 goto out;
1820         } else if (!strcasecmp(argv[0], "enable_group")) {
1821                 r = bypass_pg_num(m, argv[1], false);
1822                 goto out;
1823         } else if (!strcasecmp(argv[0], "switch_group")) {
1824                 r = switch_pg_num(m, argv[1]);
1825                 goto out;
1826         } else if (!strcasecmp(argv[0], "reinstate_path"))
1827                 action = reinstate_path;
1828         else if (!strcasecmp(argv[0], "fail_path"))
1829                 action = fail_path;
1830         else {
1831                 DMWARN("Unrecognised multipath message received: %s", argv[0]);
1832                 goto out;
1833         }
1834
1835         r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1836         if (r) {
1837                 DMWARN("message: error getting device %s",
1838                        argv[1]);
1839                 goto out;
1840         }
1841
1842         r = action_dev(m, dev, action);
1843
1844         dm_put_device(ti, dev);
1845
1846 out:
1847         mutex_unlock(&m->work_mutex);
1848         return r;
1849 }
1850
1851 static int multipath_prepare_ioctl(struct dm_target *ti,
1852                 struct block_device **bdev, fmode_t *mode)
1853 {
1854         struct multipath *m = ti->private;
1855         struct pgpath *current_pgpath;
1856         int r;
1857
1858         current_pgpath = lockless_dereference(m->current_pgpath);
1859         if (!current_pgpath)
1860                 current_pgpath = choose_pgpath(m, 0);
1861
1862         if (current_pgpath) {
1863                 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
1864                         *bdev = current_pgpath->path.dev->bdev;
1865                         *mode = current_pgpath->path.dev->mode;
1866                         r = 0;
1867                 } else {
1868                         /* pg_init has not started or completed */
1869                         r = -ENOTCONN;
1870                 }
1871         } else {
1872                 /* No path is available */
1873                 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1874                         r = -ENOTCONN;
1875                 else
1876                         r = -EIO;
1877         }
1878
1879         if (r == -ENOTCONN) {
1880                 if (!lockless_dereference(m->current_pg)) {
1881                         /* Path status changed, redo selection */
1882                         (void) choose_pgpath(m, 0);
1883                 }
1884                 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1885                         pg_init_all_paths(m);
1886                 dm_table_run_md_queue_async(m->ti->table);
1887                 process_queued_io_list(m);
1888         }
1889
1890         /*
1891          * Only pass ioctls through if the device sizes match exactly.
1892          */
1893         if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
1894                 return 1;
1895         return r;
1896 }
1897
1898 static int multipath_iterate_devices(struct dm_target *ti,
1899                                      iterate_devices_callout_fn fn, void *data)
1900 {
1901         struct multipath *m = ti->private;
1902         struct priority_group *pg;
1903         struct pgpath *p;
1904         int ret = 0;
1905
1906         list_for_each_entry(pg, &m->priority_groups, list) {
1907                 list_for_each_entry(p, &pg->pgpaths, list) {
1908                         ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
1909                         if (ret)
1910                                 goto out;
1911                 }
1912         }
1913
1914 out:
1915         return ret;
1916 }
1917
1918 static int pgpath_busy(struct pgpath *pgpath)
1919 {
1920         struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1921
1922         return blk_lld_busy(q);
1923 }
1924
1925 /*
1926  * We return "busy" only when we can map I/Os but the underlying
1927  * devices are busy (so even if we mapped the I/Os now, they would
1928  * just wait on the underlying queue).
1929  * In other words, if we want to error I/Os or queue them internally
1930  * because no path is usable, we don't return "busy".  Otherwise,
1931  * dm core won't give us the I/Os and we can't do what we want.
1932  */
1933 static int multipath_busy(struct dm_target *ti)
1934 {
1935         bool busy = false, has_active = false;
1936         struct multipath *m = ti->private;
1937         struct priority_group *pg, *next_pg;
1938         struct pgpath *pgpath;
1939
1940         /* pg_init in progress */
1941         if (atomic_read(&m->pg_init_in_progress))
1942                 return true;
1943
1944         /* No paths available; for blk-mq, rely on the IO mapping code to delay the requeue */
1945         if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1946                 return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED);
1947
1948         /* Guess which priority_group will be used at next mapping time */
1949         pg = lockless_dereference(m->current_pg);
1950         next_pg = lockless_dereference(m->next_pg);
1951         if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
1952                 pg = next_pg;
1953
1954         if (!pg) {
1955                 /*
1956                  * We don't know which pg will be used at the next mapping time.
1957                  * We don't call choose_pgpath() here to avoid triggering
1958                  * pg_init just by a busy check, so we also don't know whether
1959                  * the underlying devices we would use at the next mapping
1960                  * time are busy or not.  Just try mapping.
1961                  */
1962                 return busy;
1963         }
1964
1965         /*
1966          * If there is at least one non-busy active path, the path selector
1967          * will be able to select it, so we consider such a pg as not busy.
1968          */
1969         busy = true;
1970         list_for_each_entry(pgpath, &pg->pgpaths, list) {
1971                 if (pgpath->is_active) {
1972                         has_active = true;
1973                         if (!pgpath_busy(pgpath)) {
1974                                 busy = false;
1975                                 break;
1976                         }
1977                 }
1978         }
1979
1980         if (!has_active) {
1981                 /*
1982                  * No active path in this pg, so this pg won't be used and
1983                  * the current_pg will be changed at the next mapping time.
1984                  * We need to try mapping to determine the new current_pg.
1985                  */
1986                 busy = false;
1987         }
1988
1989         return busy;
1990 }
1991
1992 /*-----------------------------------------------------------------
1993  * Module setup
1994  *---------------------------------------------------------------*/
1995 static struct target_type multipath_target = {
1996         .name = "multipath",
1997         .version = {1, 12, 0},
1998         .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
1999         .module = THIS_MODULE,
2000         .ctr = multipath_ctr,
2001         .dtr = multipath_dtr,
2002         .clone_and_map_rq = multipath_clone_and_map,
2003         .release_clone_rq = multipath_release_clone,
2004         .rq_end_io = multipath_end_io,
2005         .map = multipath_map_bio,
2006         .end_io = multipath_end_io_bio,
2007         .presuspend = multipath_presuspend,
2008         .postsuspend = multipath_postsuspend,
2009         .resume = multipath_resume,
2010         .status = multipath_status,
2011         .message = multipath_message,
2012         .prepare_ioctl = multipath_prepare_ioctl,
2013         .iterate_devices = multipath_iterate_devices,
2014         .busy = multipath_busy,
2015 };
2016
2017 static int __init dm_multipath_init(void)
2018 {
2019         int r;
2020
2021         r = dm_register_target(&multipath_target);
2022         if (r < 0) {
2023                 DMERR("register failed %d", r);
2024                 r = -EINVAL;
2025                 goto bad_register_target;
2026         }
2027
2028         kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
2029         if (!kmultipathd) {
2030                 DMERR("failed to create workqueue kmpathd");
2031                 r = -ENOMEM;
2032                 goto bad_alloc_kmultipathd;
2033         }
2034
2035         /*
2036          * A separate workqueue is used to run the device handlers,
2037          * to avoid overloading the existing workqueue. Overloading
2038          * that workqueue would also create a bottleneck in the
2039          * activation path of the storage hardware.
2040          */
2041         kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
2042                                                   WQ_MEM_RECLAIM);
2043         if (!kmpath_handlerd) {
2044                 DMERR("failed to create workqueue kmpath_handlerd");
2045                 r = -ENOMEM;
2046                 goto bad_alloc_kmpath_handlerd;
2047         }
2048
2049         return 0;
2050
2051 bad_alloc_kmpath_handlerd:
2052         destroy_workqueue(kmultipathd);
2053 bad_alloc_kmultipathd:
2054         dm_unregister_target(&multipath_target);
2055 bad_register_target:
2056         return r;
2057 }
2058
2059 static void __exit dm_multipath_exit(void)
2060 {
2061         destroy_workqueue(kmpath_handlerd);
2062         destroy_workqueue(kmultipathd);
2063
2064         dm_unregister_target(&multipath_target);
2065 }
2066
2067 module_init(dm_multipath_init);
2068 module_exit(dm_multipath_exit);
2069
2070 MODULE_DESCRIPTION(DM_NAME " multipath target");
2071 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
2072 MODULE_LICENSE("GPL");