/*
   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
struct after_state_chg_work {
    struct drbd_work w;
    union drbd_state os;
    union drbd_state ns;
    enum chg_state_flags flags;
    struct completion *done;
};
static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
               union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void _tl_clear(struct drbd_conf *mdev);
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
          "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
         __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not as a module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);
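
/* Illustration (not part of the original source): loaded as a module, these
 * can be set like "modprobe drbd minor_count=16"; they are also exposed under
 * /sys/module/drbd/parameters/ with the permission bits given above. */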
#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif
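
/* Illustration (not part of the original source): with fault injection
 * compiled in, a 5% fault rate on all enabled fault types could be armed via
 *    echo 5 > /sys/module/drbd/parameters/fault_rate
 *    echo 1 > /sys/module/drbd/parameters/enable_faults
 */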
/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
bool disable_sendpage;
bool allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;       /* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;
struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;      /* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;  /* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;  /* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;
mempool_t *drbd_md_io_page_pool;
struct bio_set *drbd_md_io_bio_set;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list, the next pointer is the private
     member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t drbd_pp_lock;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
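
/* Sketch (illustrative only, not from this file): popping one page off the
 * drbd_pp_pool chain, using the page's private field as the "next" pointer
 * as described above; the real allocator lives in drbd_receiver.c.
 *
 *    spin_lock(&drbd_pp_lock);
 *    page = drbd_pp_pool;
 *    if (page) {
 *        drbd_pp_pool = (struct page *)page_private(page);
 *        set_page_private(page, 0);
 *    }
 *    spin_unlock(&drbd_pp_lock);
 */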
static const struct block_device_operations drbd_ops = {
    .owner   = THIS_MODULE,
    .open    = drbd_open,
    .release = drbd_release,
};
static void bio_destructor_drbd(struct bio *bio)
{
    bio_free(bio, drbd_md_io_bio_set);
}
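
/* Allocate a bio for meta data IO; falls back to plain bio_alloc() while
 * drbd_md_io_bio_set is not (yet) set up, e.g. during early module init. */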
struct bio *bio_alloc_drbd(gfp_t gfp_mask)
{
    struct bio *bio;

    if (!drbd_md_io_bio_set)
        return bio_alloc(gfp_mask, 1);

    bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
    if (!bio)
        return NULL;
    bio->bi_destructor = bio_destructor_drbd;
    return bio;
}
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
    int io_allowed;

    atomic_inc(&mdev->local_cnt);
    io_allowed = (mdev->state.disk >= mins);
    if (!io_allowed) {
        if (atomic_dec_and_test(&mdev->local_cnt))
            wake_up(&mdev->misc_wait);
    }
    return io_allowed;
}
/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
 * attached.
 */
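
/* Illustrative picture (not part of the original source):
 *
 *   oldest_tle -> [epoch #41] -> [epoch #42] -> [epoch #43] <- newest_tle
 *                   requests       requests       requests
 *
 * tl_release() frees or recycles epochs at the oldest end once their barrier
 * ack arrives; _tl_add_barrier() appends a new epoch at the newest end.
 */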
static int tl_init(struct drbd_conf *mdev)
{
    struct drbd_tl_epoch *b;

    /* during device minor initialization, we may well use GFP_KERNEL */
    b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
    if (!b)
        return 0;
    INIT_LIST_HEAD(&b->requests);
    INIT_LIST_HEAD(&b->w.list);
    b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

    mdev->oldest_tle = b;
    mdev->newest_tle = b;
    INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
    INIT_LIST_HEAD(&mdev->barrier_acked_requests);

    mdev->tl_hash = NULL;

    return 1;
}
static void tl_cleanup(struct drbd_conf *mdev)
{
    D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
    D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
    kfree(mdev->oldest_tle);
    mdev->oldest_tle = NULL;
    kfree(mdev->unused_spare_tle);
    mdev->unused_spare_tle = NULL;
    kfree(mdev->tl_hash);
    mdev->tl_hash = NULL;
}
/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
    struct drbd_tl_epoch *newest_before;

    INIT_LIST_HEAD(&new->requests);
    INIT_LIST_HEAD(&new->w.list);
    new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
    new->next = NULL;
    new->n_writes = 0;

    newest_before = mdev->newest_tle;
    new->br_number = newest_before->br_number+1;
    if (mdev->newest_tle != new) {
        mdev->newest_tle->next = new;
        mdev->newest_tle = new;
    }
}
/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
               unsigned int set_size)
{
    struct drbd_tl_epoch *b, *nob; /* next old barrier */
    struct list_head *le, *tle;
    struct drbd_request *r;

    spin_lock_irq(&mdev->req_lock);

    b = mdev->oldest_tle;

    /* first some paranoia code */
    if (b == NULL) {
        dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
            barrier_nr);
        goto bail;
    }
    if (b->br_number != barrier_nr) {
        dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
            barrier_nr, b->br_number);
        goto bail;
    }
    if (b->n_writes != set_size) {
        dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
            barrier_nr, set_size, b->n_writes);
        goto bail;
    }

    /* Clean up list of requests processed during current epoch */
    list_for_each_safe(le, tle, &b->requests) {
        r = list_entry(le, struct drbd_request, tl_requests);
        _req_mod(r, barrier_acked);
    }
    /* There could be requests on the list waiting for completion
       of the write to the local disk. To avoid corruptions of
       slab's data structures we have to remove the list's head.

       Also there could have been a barrier ack out of sequence, overtaking
       the write acks - which would be a bug and violating write ordering.
       To not deadlock in case we lose connection while such requests are
       still pending, we need some way to find them for the
       _req_mod(connection_lost_while_pending).

       These have been list_move'd to the out_of_sequence_requests list in
       _req_mod(, barrier_acked) above.
       */
    list_splice_init(&b->requests, &mdev->barrier_acked_requests);

    nob = b->next;
    if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
        _tl_add_barrier(mdev, b);
        if (nob)
            mdev->oldest_tle = nob;
        /* if nob == NULL b was the only barrier, and becomes the new
           barrier. Therefore mdev->oldest_tle points already to b */
    } else {
        D_ASSERT(nob != NULL);
        mdev->oldest_tle = nob;
        kfree(b);
    }

    spin_unlock_irq(&mdev->req_lock);
    dec_ap_pending(mdev);

    return;

bail:
    spin_unlock_irq(&mdev->req_lock);
    drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}
/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
    struct drbd_tl_epoch *b, *tmp, **pn;
    struct list_head *le, *tle, carry_reads;
    struct drbd_request *req;
    int rv, n_writes, n_reads;

    b = mdev->oldest_tle;
    pn = &mdev->oldest_tle;
    while (b) {
        n_writes = 0;
        n_reads = 0;
        INIT_LIST_HEAD(&carry_reads);
        list_for_each_safe(le, tle, &b->requests) {
            req = list_entry(le, struct drbd_request, tl_requests);
            rv = _req_mod(req, what);

            n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
            n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
        }
        tmp = b->next;

        if (n_writes) {
            if (what == resend) {
                b->n_writes = n_writes;
                if (b->w.cb == NULL) {
                    b->w.cb = w_send_barrier;
                    inc_ap_pending(mdev);
                    set_bit(CREATE_BARRIER, &mdev->flags);
                }

                drbd_queue_work(&mdev->data.work, &b->w);
            }
            pn = &b->next;
        } else {
            if (n_reads)
                list_add(&carry_reads, &b->requests);
            /* there could still be requests on that ring list,
             * in case local io is still pending */
            list_del(&b->requests);

            /* dec_ap_pending corresponding to queue_barrier.
             * the newest barrier may not have been queued yet,
             * in which case w.cb is still NULL. */
            if (b->w.cb != NULL)
                dec_ap_pending(mdev);

            if (b == mdev->newest_tle) {
                /* recycle, but reinit! */
                D_ASSERT(tmp == NULL);
                INIT_LIST_HEAD(&b->requests);
                list_splice(&carry_reads, &b->requests);
                INIT_LIST_HEAD(&b->w.list);
                b->w.cb = NULL;
                b->br_number = net_random();
                b->n_writes = 0;

                *pn = b;
                break;
            }
            *pn = tmp;
            kfree(b);
        }
        b = tmp;
        list_splice(&carry_reads, &b->requests);
    }

    /* Actions operating on the disk state, also want to work on
       requests that got barrier acked. */

    list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
        req = list_entry(le, struct drbd_request, tl_requests);
        _req_mod(req, what);
    }
}
/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
    spin_lock_irq(&mdev->req_lock);
    _tl_clear(mdev);
    spin_unlock_irq(&mdev->req_lock);
}
static void _tl_clear(struct drbd_conf *mdev)
{
    struct list_head *le, *tle;
    struct drbd_request *r;

    _tl_restart(mdev, connection_lost_while_pending);

    /* we expect this list to be empty. */
    D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

    /* but just in case, clean it up anyways! */
    list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
        r = list_entry(le, struct drbd_request, tl_requests);
        /* It would be nice to complete outside of spinlock.
         * But this is easier for now. */
        _req_mod(r, connection_lost_while_pending);
    }

    /* ensure bit indicating barrier is required is clear */
    clear_bit(CREATE_BARRIER, &mdev->flags);

    memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
}
void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
    spin_lock_irq(&mdev->req_lock);
    _tl_restart(mdev, what);
    spin_unlock_irq(&mdev->req_lock);
}
/**
 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
 * @mdev:	DRBD device.
 */
void tl_abort_disk_io(struct drbd_conf *mdev)
{
    struct drbd_tl_epoch *b;
    struct list_head *le, *tle;
    struct drbd_request *req;

    spin_lock_irq(&mdev->req_lock);
    b = mdev->oldest_tle;
    while (b) {
        list_for_each_safe(le, tle, &b->requests) {
            req = list_entry(le, struct drbd_request, tl_requests);
            if (!(req->rq_state & RQ_LOCAL_PENDING))
                continue;
            _req_mod(req, abort_disk_io);
        }
        b = b->next;
    }

    list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
        req = list_entry(le, struct drbd_request, tl_requests);
        if (!(req->rq_state & RQ_LOCAL_PENDING))
            continue;
        _req_mod(req, abort_disk_io);
    }

    spin_unlock_irq(&mdev->req_lock);
}
/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
              union drbd_state os, union drbd_state ns)
{
    return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
         ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
          (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
          (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
          (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
        (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
        (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}
enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
          union drbd_state mask, union drbd_state val)
{
    unsigned long flags;
    union drbd_state os, ns;
    enum drbd_state_rv rv;

    spin_lock_irqsave(&mdev->req_lock, flags);
    os = mdev->state;
    ns.i = (os.i & ~mask.i) | val.i;
    rv = _drbd_set_state(mdev, ns, f, NULL);
    spin_unlock_irqrestore(&mdev->req_lock, flags);

    return rv;
}
/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
    union drbd_state mask, union drbd_state val)
{
    drbd_change_state(mdev, CS_HARD, mask, val);
}
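
/* Typical use, as seen in tl_release() above:
 *    drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 * where the NS() helper builds the mask/val pair that changes exactly one
 * field of the state union. */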
static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
                            union drbd_state,
                            union drbd_state);
enum sanitize_state_warnings {
    NO_WARNING,
    ABORTED_ONLINE_VERIFY,
    ABORTED_RESYNC,
    CONNECTION_LOST_NEGOTIATING,
    IMPLICITLY_UPGRADED_DISK,
    IMPLICITLY_UPGRADED_PDSK,
};
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
                       union drbd_state ns, enum sanitize_state_warnings *warn);
int drbd_send_state_req(struct drbd_conf *,
            union drbd_state, union drbd_state);
static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
         union drbd_state val)
{
    union drbd_state os, ns;
    unsigned long flags;
    enum drbd_state_rv rv;

    if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
        return SS_CW_SUCCESS;

    if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
        return SS_CW_FAILED_BY_PEER;

    rv = 0;
    spin_lock_irqsave(&mdev->req_lock, flags);
    os = mdev->state;
    ns.i = (os.i & ~mask.i) | val.i;
    ns = sanitize_state(mdev, os, ns, NULL);

    if (!cl_wide_st_chg(mdev, os, ns))
        rv = SS_CW_NO_NEED;
    if (!rv) {
        rv = is_valid_state(mdev, ns);
        if (rv == SS_SUCCESS) {
            rv = is_valid_state_transition(mdev, ns, os);
            if (rv == SS_SUCCESS)
                rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
        }
    }
    spin_unlock_irqrestore(&mdev->req_lock, flags);

    return rv;
}
/**
 * drbd_req_state() - Perform an eventually cluster wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
           union drbd_state val, enum chg_state_flags f)
{
    struct completion done;
    unsigned long flags;
    union drbd_state os, ns;
    enum drbd_state_rv rv;

    init_completion(&done);

    if (f & CS_SERIALIZE)
        mutex_lock(&mdev->state_mutex);

    spin_lock_irqsave(&mdev->req_lock, flags);
    os = mdev->state;
    ns.i = (os.i & ~mask.i) | val.i;
    ns = sanitize_state(mdev, os, ns, NULL);

    if (cl_wide_st_chg(mdev, os, ns)) {
        rv = is_valid_state(mdev, ns);
        if (rv == SS_SUCCESS)
            rv = is_valid_state_transition(mdev, ns, os);
        spin_unlock_irqrestore(&mdev->req_lock, flags);

        if (rv < SS_SUCCESS) {
            if (f & CS_VERBOSE)
                print_st_err(mdev, os, ns, rv);
            goto abort;
        }

        drbd_state_lock(mdev);
        if (!drbd_send_state_req(mdev, mask, val)) {
            drbd_state_unlock(mdev);
            rv = SS_CW_FAILED_BY_PEER;
            if (f & CS_VERBOSE)
                print_st_err(mdev, os, ns, rv);
            goto abort;
        }

        wait_event(mdev->state_wait,
            (rv = _req_st_cond(mdev, mask, val)));

        if (rv < SS_SUCCESS) {
            drbd_state_unlock(mdev);
            if (f & CS_VERBOSE)
                print_st_err(mdev, os, ns, rv);
            goto abort;
        }
        spin_lock_irqsave(&mdev->req_lock, flags);
        os = mdev->state;
        ns.i = (os.i & ~mask.i) | val.i;
        rv = _drbd_set_state(mdev, ns, f, &done);
        drbd_state_unlock(mdev);
    } else {
        rv = _drbd_set_state(mdev, ns, f, &done);
    }

    spin_unlock_irqrestore(&mdev->req_lock, flags);

    if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
        D_ASSERT(current != mdev->worker.task);
        wait_for_completion(&done);
    }

abort:
    if (f & CS_SERIALIZE)
        mutex_unlock(&mdev->state_mutex);

    return rv;
}
/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
            union drbd_state val, enum chg_state_flags f)
{
    enum drbd_state_rv rv;

    wait_event(mdev->state_wait,
           (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

    return rv;
}
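
/* Sketch (assumption: the convenience wrapper lives in drbd_int.h):
 * drbd_request_state() is the usual entry point and amounts to
 *    _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
 * with @mask/@val built via NS(), e.g.
 *    drbd_request_state(mdev, NS(disk, D_OUTDATED));
 */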
static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
    dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
        name,
        drbd_conn_str(ns.conn),
        drbd_role_str(ns.role),
        drbd_role_str(ns.peer),
        drbd_disk_str(ns.disk),
        drbd_disk_str(ns.pdsk),
        is_susp(ns) ? 's' : 'r',
        ns.aftr_isp ? 'a' : '-',
        ns.peer_isp ? 'p' : '-',
        ns.user_isp ? 'u' : '-'
        );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
          union drbd_state ns, enum drbd_state_rv err)
{
    if (err == SS_IN_TRANSIENT_STATE)
        return;
    dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
    print_st(mdev, " state", os);
    print_st(mdev, "wanted", ns);
}
/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
    /* See drbd_state_sw_errors in drbd_strings.c */

    enum drbd_fencing_p fp;
    enum drbd_state_rv rv = SS_SUCCESS;

    fp = FP_DONT_CARE;
    if (get_ldev(mdev)) {
        fp = mdev->ldev->dc.fencing;
        put_ldev(mdev);
    }

    if (get_net_conf(mdev)) {
        if (!mdev->net_conf->two_primaries &&
            ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
            rv = SS_TWO_PRIMARIES;
        put_net_conf(mdev);
    }

    if (rv <= 0)
        /* already found a reason to abort */;
    else if (ns.role == R_SECONDARY && mdev->open_cnt)
        rv = SS_DEVICE_IN_USE;

    else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
        rv = SS_NO_UP_TO_DATE_DISK;

    else if (fp >= FP_RESOURCE &&
         ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
        rv = SS_PRIMARY_NOP;

    else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
        rv = SS_NO_UP_TO_DATE_DISK;

    else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
        rv = SS_NO_LOCAL_DISK;

    else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
        rv = SS_NO_REMOTE_DISK;

    else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
        rv = SS_NO_UP_TO_DATE_DISK;

    else if ((ns.conn == C_CONNECTED ||
          ns.conn == C_WF_BITMAP_S ||
          ns.conn == C_SYNC_SOURCE ||
          ns.conn == C_PAUSED_SYNC_S) &&
          ns.disk == D_OUTDATED)
        rv = SS_CONNECTED_OUTDATES;

    else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
         (mdev->sync_conf.verify_alg[0] == 0))
        rv = SS_NO_VERIFY_ALG;

    else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
          mdev->agreed_pro_version < 88)
        rv = SS_NOT_SUPPORTED;

    else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
        rv = SS_CONNECTED_OUTDATES;

    return rv;
}
/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
              union drbd_state os)
{
    enum drbd_state_rv rv = SS_SUCCESS;

    if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
        os.conn > C_CONNECTED)
        rv = SS_RESYNC_RUNNING;

    if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
        rv = SS_ALREADY_STANDALONE;

    if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
        rv = SS_IS_DISKLESS;

    if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
        rv = SS_NO_NET_CONFIG;

    if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
        rv = SS_LOWER_THAN_OUTDATED;

    if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
        rv = SS_IN_TRANSIENT_STATE;

    if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
        rv = SS_IN_TRANSIENT_STATE;

    /* While establishing a connection only allow cstate to change.
       Delay/refuse role changes, detach/attach etc... */
    if (test_bit(STATE_SENT, &mdev->flags) &&
        !(os.conn == C_WF_REPORT_PARAMS ||
          (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
        rv = SS_IN_TRANSIENT_STATE;

    if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
        rv = SS_NEED_CONNECTION;

    if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
        ns.conn != os.conn && os.conn > C_CONNECTED)
        rv = SS_RESYNC_RUNNING;

    if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
        os.conn < C_CONNECTED)
        rv = SS_NEED_CONNECTION;

    if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
        && os.conn < C_WF_REPORT_PARAMS)
        rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

    return rv;
}
static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
{
    static const char *msg_table[] = {
        [NO_WARNING] = "",
        [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
        [ABORTED_RESYNC] = "Resync aborted.",
        [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
        [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
        [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
    };

    if (warn != NO_WARNING)
        dev_warn(DEV, "%s\n", msg_table[warn]);
}
/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn:	set to a hint about what had to be adjusted; may be NULL
 *
 * When we lose connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
                       union drbd_state ns, enum sanitize_state_warnings *warn)
{
    enum drbd_fencing_p fp;
    enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

    if (warn)
        *warn = NO_WARNING;

    fp = FP_DONT_CARE;
    if (get_ldev(mdev)) {
        fp = mdev->ldev->dc.fencing;
        put_ldev(mdev);
    }

    /* Disallow Network errors to configure a device's network part */
    if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
        os.conn <= C_DISCONNECTING)
        ns.conn = os.conn;

    /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
     * If you try to go into some Sync* state, that shall fail (elsewhere). */
    if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
        ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
        ns.conn = os.conn;

    /* we cannot fail (again) if we already detached */
    if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
        ns.disk = D_DISKLESS;

    /* After C_DISCONNECTING only C_STANDALONE may follow */
    if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
        ns.conn = os.conn;

    if (ns.conn < C_CONNECTED) {
        ns.peer_isp = 0;
        ns.peer = R_UNKNOWN;
        if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
            ns.pdsk = D_UNKNOWN;
    }

    /* Clear the aftr_isp when becoming unconfigured */
    if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
        ns.aftr_isp = 0;

    /* Abort resync if a disk fails/detaches */
    if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
        (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
        if (warn)
            *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
                ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
        ns.conn = C_CONNECTED;
    }

    /* Connection breaks down before we finished "Negotiating" */
    if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
        get_ldev_if_state(mdev, D_NEGOTIATING)) {
        if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
            ns.disk = mdev->new_state_tmp.disk;
            ns.pdsk = mdev->new_state_tmp.pdsk;
        } else {
            if (warn)
                *warn = CONNECTION_LOST_NEGOTIATING;
            ns.disk = D_DISKLESS;
            ns.pdsk = D_UNKNOWN;
        }
        put_ldev(mdev);
    }

    /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
    if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
        if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
            ns.disk = D_UP_TO_DATE;
        if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
            ns.pdsk = D_UP_TO_DATE;
    }

    /* Implications of the connection state on the disk states */
    disk_min = D_DISKLESS;
    disk_max = D_UP_TO_DATE;
    pdsk_min = D_INCONSISTENT;
    pdsk_max = D_UNKNOWN;
    switch ((enum drbd_conns)ns.conn) {
    case C_WF_BITMAP_T:
    case C_PAUSED_SYNC_T:
    case C_STARTING_SYNC_T:
    case C_WF_SYNC_UUID:
    case C_BEHIND:
        disk_min = D_INCONSISTENT;
        disk_max = D_OUTDATED;
        pdsk_min = D_UP_TO_DATE;
        pdsk_max = D_UP_TO_DATE;
        break;
    case C_VERIFY_S:
    case C_VERIFY_T:
        disk_min = D_UP_TO_DATE;
        disk_max = D_UP_TO_DATE;
        pdsk_min = D_UP_TO_DATE;
        pdsk_max = D_UP_TO_DATE;
        break;
    case C_CONNECTED:
        disk_min = D_DISKLESS;
        disk_max = D_UP_TO_DATE;
        pdsk_min = D_DISKLESS;
        pdsk_max = D_UP_TO_DATE;
        break;
    case C_WF_BITMAP_S:
    case C_PAUSED_SYNC_S:
    case C_STARTING_SYNC_S:
    case C_AHEAD:
        disk_min = D_UP_TO_DATE;
        disk_max = D_UP_TO_DATE;
        pdsk_min = D_INCONSISTENT;
        pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary */
        break;
    case C_SYNC_TARGET:
        disk_min = D_INCONSISTENT;
        disk_max = D_INCONSISTENT;
        pdsk_min = D_UP_TO_DATE;
        pdsk_max = D_UP_TO_DATE;
        break;
    case C_SYNC_SOURCE:
        disk_min = D_UP_TO_DATE;
        disk_max = D_UP_TO_DATE;
        pdsk_min = D_INCONSISTENT;
        pdsk_max = D_INCONSISTENT;
        break;
    case C_STANDALONE:
    case C_DISCONNECTING:
    case C_UNCONNECTED:
    case C_TIMEOUT:
    case C_BROKEN_PIPE:
    case C_NETWORK_FAILURE:
    case C_PROTOCOL_ERROR:
    case C_TEAR_DOWN:
    case C_WF_CONNECTION:
    case C_WF_REPORT_PARAMS:
    case C_MASK:
        break;
    }
    if (ns.disk > disk_max)
        ns.disk = disk_max;

    if (ns.disk < disk_min) {
        if (warn)
            *warn = IMPLICITLY_UPGRADED_DISK;
        ns.disk = disk_min;
    }
    if (ns.pdsk > pdsk_max)
        ns.pdsk = pdsk_max;

    if (ns.pdsk < pdsk_min) {
        if (warn)
            *warn = IMPLICITLY_UPGRADED_PDSK;
        ns.pdsk = pdsk_min;
    }

    if (fp == FP_STONITH &&
        (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
        !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
        ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

    if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
        (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
        !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
        ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */

    if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
        if (ns.conn == C_SYNC_SOURCE)
            ns.conn = C_PAUSED_SYNC_S;
        if (ns.conn == C_SYNC_TARGET)
            ns.conn = C_PAUSED_SYNC_T;
    } else {
        if (ns.conn == C_PAUSED_SYNC_S)
            ns.conn = C_SYNC_SOURCE;
        if (ns.conn == C_PAUSED_SYNC_T)
            ns.conn = C_SYNC_TARGET;
    }

    return ns;
}
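
/* Worked example (illustrative, not part of the original source): a Primary
 * running as C_SYNC_SOURCE that loses the link and asks for
 * ns.conn = C_NETWORK_FAILURE ends up with peer_isp = 0, peer = R_UNKNOWN and
 * pdsk = D_UNKNOWN by the rules above; with fencing set to FP_STONITH it also
 * gets susp_fen = 1 until the fence-peer handler has run. */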
/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
    if (mdev->agreed_pro_version < 90)
        mdev->ov_start_sector = 0;
    mdev->rs_total = drbd_bm_bits(mdev);
    mdev->ov_position = 0;
    if (cs == C_VERIFY_T) {
        /* starting online verify from an arbitrary position
         * does not fit well into the existing protocol.
         * on C_VERIFY_T, we initialize ov_left and friends
         * implicitly in receive_DataRequest once the
         * first P_OV_REQUEST is received */
        mdev->ov_start_sector = ~(sector_t)0;
    } else {
        unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
        if (bit >= mdev->rs_total) {
            mdev->ov_start_sector =
                BM_BIT_TO_SECT(mdev->rs_total - 1);
            mdev->rs_total = 1;
        } else
            mdev->rs_total -= bit;
        mdev->ov_position = mdev->ov_start_sector;
    }
    mdev->ov_left = mdev->rs_total;
}
static void drbd_resume_al(struct drbd_conf *mdev)
{
    if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
        dev_info(DEV, "Resumed AL updates\n");
}
/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
         enum chg_state_flags flags, struct completion *done)
{
    union drbd_state os;
    enum drbd_state_rv rv = SS_SUCCESS;
    enum sanitize_state_warnings ssw;
    struct after_state_chg_work *ascw;

    os = mdev->state;

    ns = sanitize_state(mdev, os, ns, &ssw);

    if (ns.i == os.i)
        return SS_NOTHING_TO_DO;

    if (!(flags & CS_HARD)) {
        /* pre-state-change checks ; only look at ns */
        /* See drbd_state_sw_errors in drbd_strings.c */

        rv = is_valid_state(mdev, ns);
        if (rv < SS_SUCCESS) {
            /* If the old state was illegal as well, then let
               this happen...*/

            if (is_valid_state(mdev, os) == rv)
                rv = is_valid_state_transition(mdev, ns, os);
        } else
            rv = is_valid_state_transition(mdev, ns, os);
    }

    if (rv < SS_SUCCESS) {
        if (flags & CS_VERBOSE)
            print_st_err(mdev, os, ns, rv);
        return rv;
    }

    print_sanitize_warnings(mdev, ssw);

    {
    char *pbp, pb[300];
    pbp = pb;
    *pbp = 0;
    if (ns.role != os.role)
        pbp += sprintf(pbp, "role( %s -> %s ) ",
                   drbd_role_str(os.role),
                   drbd_role_str(ns.role));
    if (ns.peer != os.peer)
        pbp += sprintf(pbp, "peer( %s -> %s ) ",
                   drbd_role_str(os.peer),
                   drbd_role_str(ns.peer));
    if (ns.conn != os.conn)
        pbp += sprintf(pbp, "conn( %s -> %s ) ",
                   drbd_conn_str(os.conn),
                   drbd_conn_str(ns.conn));
    if (ns.disk != os.disk)
        pbp += sprintf(pbp, "disk( %s -> %s ) ",
                   drbd_disk_str(os.disk),
                   drbd_disk_str(ns.disk));
    if (ns.pdsk != os.pdsk)
        pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
                   drbd_disk_str(os.pdsk),
                   drbd_disk_str(ns.pdsk));
    if (is_susp(ns) != is_susp(os))
        pbp += sprintf(pbp, "susp( %d -> %d ) ",
                   is_susp(os),
                   is_susp(ns));
    if (ns.aftr_isp != os.aftr_isp)
        pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
                   os.aftr_isp,
                   ns.aftr_isp);
    if (ns.peer_isp != os.peer_isp)
        pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
                   os.peer_isp,
                   ns.peer_isp);
    if (ns.user_isp != os.user_isp)
        pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
                   os.user_isp,
                   ns.user_isp);
    dev_info(DEV, "%s\n", pb);
    }
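
    /* Illustrative output of the block above (not taken from a real log):
     *   block drbd0: role( Secondary -> Primary ) disk( Inconsistent -> UpToDate )
     */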
    /* solve the race between becoming unconfigured,
     * worker doing the cleanup, and
     * admin reconfiguring us:
     * on (re)configure, first set CONFIG_PENDING,
     * then wait for a potentially exiting worker,
     * start the worker, and schedule one no_op.
     * then proceed with configuration.
     */
    if (ns.disk == D_DISKLESS &&
        ns.conn == C_STANDALONE &&
        ns.role == R_SECONDARY &&
        !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
        set_bit(DEVICE_DYING, &mdev->flags);

    /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
     * on the ldev here, to be sure the transition -> D_DISKLESS resp.
     * drbd_ldev_destroy() won't happen before our corresponding
     * after_state_ch works run, where we put_ldev again. */
    if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
        (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
        atomic_inc(&mdev->local_cnt);

    mdev->state = ns;
    if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
        drbd_print_uuids(mdev, "attached to UUIDs");

    wake_up(&mdev->misc_wait);
    wake_up(&mdev->state_wait);

    /* aborted verify run. log the last position */
    if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
        ns.conn < C_CONNECTED) {
        mdev->ov_start_sector =
            BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
        dev_info(DEV, "Online Verify reached sector %llu\n",
            (unsigned long long)mdev->ov_start_sector);
    }

    if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
        (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
        dev_info(DEV, "Syncer continues.\n");
        mdev->rs_paused += (long)jiffies
                  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
        if (ns.conn == C_SYNC_TARGET)
            mod_timer(&mdev->resync_timer, jiffies);
    }

    if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
        (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
        dev_info(DEV, "Resync suspended\n");
        mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
    }

    if (os.conn == C_CONNECTED &&
        (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
        unsigned long now = jiffies;
        int i;

        set_ov_position(mdev, ns.conn);
        mdev->rs_start = now;
        mdev->rs_last_events = 0;
        mdev->rs_last_sect_ev = 0;
        mdev->ov_last_oos_size = 0;
        mdev->ov_last_oos_start = 0;

        for (i = 0; i < DRBD_SYNC_MARKS; i++) {
            mdev->rs_mark_left[i] = mdev->ov_left;
            mdev->rs_mark_time[i] = now;
        }

        drbd_rs_controller_reset(mdev);

        if (ns.conn == C_VERIFY_S) {
            dev_info(DEV, "Starting Online Verify from sector %llu\n",
                    (unsigned long long)mdev->ov_position);
            mod_timer(&mdev->resync_timer, jiffies);
        }
    }
    if (get_ldev(mdev)) {
        u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
                         MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
                         MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

        if (test_bit(CRASHED_PRIMARY, &mdev->flags))
            mdf |= MDF_CRASHED_PRIMARY;
        if (mdev->state.role == R_PRIMARY ||
            (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
            mdf |= MDF_PRIMARY_IND;
        if (mdev->state.conn > C_WF_REPORT_PARAMS)
            mdf |= MDF_CONNECTED_IND;
        if (mdev->state.disk > D_INCONSISTENT)
            mdf |= MDF_CONSISTENT;
        if (mdev->state.disk > D_OUTDATED)
            mdf |= MDF_WAS_UP_TO_DATE;
        if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
            mdf |= MDF_PEER_OUT_DATED;
        if (mdf != mdev->ldev->md.flags) {
            mdev->ldev->md.flags = mdf;
            drbd_md_mark_dirty(mdev);
        }
        if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
            drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
        put_ldev(mdev);
    }

    /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
    if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
        os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
        set_bit(CONSIDER_RESYNC, &mdev->flags);

    /* Receiver should clean up itself */
    if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
        drbd_thread_stop_nowait(&mdev->receiver);

    /* Now the receiver finished cleaning up itself, it should die */
    if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
        drbd_thread_stop_nowait(&mdev->receiver);

    /* Upon network failure, we need to restart the receiver. */
    if (os.conn > C_WF_CONNECTION &&
        ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
        drbd_thread_restart_nowait(&mdev->receiver);

    /* Resume AL writing if we get a connection */
    if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
        drbd_resume_al(mdev);

    /* remember last connect and attach times so request_timer_fn() won't
     * kill newly established sessions while we are still trying to thaw
     * previously frozen IO */
    if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
        mdev->last_reconnect_jif = jiffies;
    if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
        ns.disk > D_NEGOTIATING)
        mdev->last_reattach_jif = jiffies;

    ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
    if (ascw) {
        ascw->os = os;
        ascw->ns = ns;
        ascw->flags = flags;
        ascw->w.cb = w_after_state_ch;
        ascw->done = done;
        drbd_queue_work(&mdev->data.work, &ascw->w);
    } else {
        dev_warn(DEV, "Could not kmalloc an ascw\n");
    }

    return rv;
}
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
    struct after_state_chg_work *ascw =
        container_of(w, struct after_state_chg_work, w);
    after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
    if (ascw->flags & CS_WAIT_COMPLETE) {
        D_ASSERT(ascw->done != NULL);
        complete(ascw->done);
    }
    kfree(ascw);

    return 1;
}
static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
    if (rv) {
        dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
        _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
        return;
    }

    switch (mdev->state.conn) {
    case C_STARTING_SYNC_T:
        _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
        break;
    case C_STARTING_SYNC_S:
        drbd_start_resync(mdev, C_SYNC_SOURCE);
        break;
    }
}
int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
                   int (*io_fn)(struct drbd_conf *),
                   char *why, enum bm_flag flags)
{
    int rv;

    D_ASSERT(current == mdev->worker.task);

    /* open coded non-blocking drbd_suspend_io(mdev); */
    set_bit(SUSPEND_IO, &mdev->flags);

    drbd_bm_lock(mdev, why, flags);
    rv = io_fn(mdev);
    drbd_bm_unlock(mdev);

    drbd_resume_io(mdev);

    return rv;
}
/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
               union drbd_state ns, enum chg_state_flags flags)
{
    enum drbd_fencing_p fp;
    enum drbd_req_event what = nothing;
    union drbd_state nsm = (union drbd_state){ .i = -1 };

    if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
        clear_bit(CRASHED_PRIMARY, &mdev->flags);
        if (mdev->p_uuid)
            mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
    }

    fp = FP_DONT_CARE;
    if (get_ldev(mdev)) {
        fp = mdev->ldev->dc.fencing;
        put_ldev(mdev);
    }

    /* Inform userspace about the change... */
    drbd_bcast_state(mdev, ns);

    if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
        (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
        drbd_khelper(mdev, "pri-on-incon-degr");

    /* Here we have the actions that are performed after a
       state change. This function might sleep */

    if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
        mod_timer(&mdev->request_timer, jiffies + HZ);

    nsm.i = -1;
    if (ns.susp_nod) {
        if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
            what = resend;

        if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
            ns.disk > D_NEGOTIATING)
            what = restart_frozen_disk_io;

        if (what != nothing)
            nsm.susp_nod = 0;
    }
    if (ns.susp_fen) {
        /* case1: The outdate peer handler is successful: */
        if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
            if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
                drbd_uuid_new_current(mdev);
                clear_bit(NEW_CUR_UUID, &mdev->flags);
            }
            spin_lock_irq(&mdev->req_lock);
            _tl_clear(mdev);
            _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
            spin_unlock_irq(&mdev->req_lock);
        }
        /* case2: The connection was established again: */
        if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
            clear_bit(NEW_CUR_UUID, &mdev->flags);
            what = resend;
            nsm.susp_fen = 0;
        }
    }

    if (what != nothing) {
        spin_lock_irq(&mdev->req_lock);
        _tl_restart(mdev, what);
        nsm.i &= mdev->state.i;
        _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
        spin_unlock_irq(&mdev->req_lock);
    }

    /* Became sync source. With protocol >= 96, we still need to send out
     * the sync uuid now. Need to do that before any drbd_send_state, or
     * the other side may go "paused sync" before receiving the sync uuids,
     * which is unexpected. */
    if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
        (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
        mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
        drbd_gen_and_send_sync_uuid(mdev);
        put_ldev(mdev);
    }

    /* Do not change the order of the if above and the two below... */
    if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
        /* we probably will start a resync soon.
         * make sure those things are properly reset. */
        mdev->rs_total = 0;
        mdev->rs_failed = 0;
        atomic_set(&mdev->rs_pending_cnt, 0);
        drbd_rs_cancel_all(mdev);

        drbd_send_uuids(mdev);
        drbd_send_state(mdev, ns);
    }
    /* No point in queuing send_bitmap if we don't have a connection
     * anymore, so check also the _current_ state, not only the new state
     * at the time this work was queued. */
    if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
        mdev->state.conn == C_WF_BITMAP_S)
        drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
                "send_bitmap (WFBitMapS)",
                BM_LOCKED_TEST_ALLOWED);

    /* Lost contact to peer's copy of the data */
    if ((os.pdsk >= D_INCONSISTENT &&
         os.pdsk != D_UNKNOWN &&
         os.pdsk != D_OUTDATED)
    &&  (ns.pdsk < D_INCONSISTENT ||
         ns.pdsk == D_UNKNOWN ||
         ns.pdsk == D_OUTDATED)) {
        if (get_ldev(mdev)) {
            if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
                mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
                if (is_susp(mdev->state)) {
                    set_bit(NEW_CUR_UUID, &mdev->flags);
                } else {
                    drbd_uuid_new_current(mdev);
                    drbd_send_uuids(mdev);
                }
            }
            put_ldev(mdev);
        }
    }

    if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
        if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
            mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
            drbd_uuid_new_current(mdev);
            drbd_send_uuids(mdev);
        }
        /* D_DISKLESS Peer becomes secondary */
        if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
            /* We may still be Primary ourselves.
             * No harm done if the bitmap still changes,
             * redirtied pages will follow later. */
            drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
                "demote diskless peer", BM_LOCKED_SET_ALLOWED);
        put_ldev(mdev);
    }

    /* Write out all changed bits on demote.
     * Though, no need to do that just yet
     * if there is a resync going on still */
    if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
        mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
        /* No changes to the bitmap expected this time, so assert that,
         * even though no harm was done if it did change. */
        drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
                "demote", BM_LOCKED_TEST_ALLOWED);
        put_ldev(mdev);
    }

    /* Last part of the attaching process ... */
    if (ns.conn >= C_CONNECTED &&
        os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
        drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
        drbd_send_uuids(mdev);
        drbd_send_state(mdev, ns);
    }

    /* We want to pause/continue resync, tell peer. */
    if (ns.conn >= C_CONNECTED &&
         ((os.aftr_isp != ns.aftr_isp) ||
          (os.user_isp != ns.user_isp)))
        drbd_send_state(mdev, ns);

    /* In case one of the isp bits got set, suspend other devices. */
    if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
        (ns.aftr_isp || ns.peer_isp || ns.user_isp))
        suspend_other_sg(mdev);

    /* Make sure the peer gets informed about eventual state
       changes (ISP bits) while we were in WFReportParams. */
    if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
        drbd_send_state(mdev, ns);

    if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
        drbd_send_state(mdev, ns);

    /* We are in the progress to start a full sync... */
    if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
        (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
        /* no other bitmap changes expected during this phase */
        drbd_queue_bitmap_io(mdev,
            &drbd_bmio_set_n_write, &abw_start_sync,
            "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);

    /* We are invalidating our self... */
    if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
        os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
        /* other bitmap operation expected during this phase */
        drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
            "set_n_write from invalidate", BM_LOCKED_MASK);
    /* first half of local IO error, failure to attach,
     * or administrative detach */
    if (os.disk != D_FAILED && ns.disk == D_FAILED) {
        enum drbd_io_error_p eh = EP_PASS_ON;
        int was_io_error = 0;
        /* corresponding get_ldev was in __drbd_set_state, to serialize
         * our cleanup here with the transition to D_DISKLESS.
         * But it is still not safe to dereference ldev here, since
         * we might come from a failed Attach before ldev was set. */
        if (mdev->ldev) {
            eh = mdev->ldev->dc.on_io_error;
            was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);

            if (was_io_error && eh == EP_CALL_HELPER)
                drbd_khelper(mdev, "local-io-error");

            /* Immediately allow completion of all application IO,
             * that waits for completion from the local disk,
             * if this was a force-detach due to disk_timeout
             * or administrator request (drbdsetup detach --force).
             * Do NOT abort otherwise.
             * Aborting local requests may cause serious problems,
             * if requests are completed to upper layers already,
             * and then later the already submitted local bio completes.
             * This can cause DMA into former bio pages that meanwhile
             * have been re-used for other things.
             * So aborting local requests may cause crashes,
             * or even worse, silent data corruption.
             */
            if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
                tl_abort_disk_io(mdev);

            /* current state still has to be D_FAILED,
             * there is only one way out: to D_DISKLESS,
             * and that may only happen after our put_ldev below. */
            if (mdev->state.disk != D_FAILED)
                dev_err(DEV,
                    "ASSERT FAILED: disk is %s during detach\n",
                    drbd_disk_str(mdev->state.disk));

            if (ns.conn >= C_CONNECTED)
                drbd_send_state(mdev, ns);

            drbd_rs_cancel_all(mdev);

            /* In case we want to get something to stable storage still,
             * this may be the last chance.
             * Following put_ldev may transition to D_DISKLESS. */
            drbd_md_sync(mdev);
        }
        put_ldev(mdev);
    }

    /* second half of local IO error, failure to attach,
     * or administrative detach,
     * after local_cnt references have reached zero again */
    if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
        /* We must still be diskless,
         * re-attach has to be serialized with this! */
        if (mdev->state.disk != D_DISKLESS)
            dev_err(DEV,
                "ASSERT FAILED: disk is %s while going diskless\n",
                drbd_disk_str(mdev->state.disk));

        if (ns.conn >= C_CONNECTED)
            drbd_send_state(mdev, ns);

        /* corresponding get_ldev in __drbd_set_state
         * this may finally trigger drbd_ldev_destroy. */
        put_ldev(mdev);
    }

    /* Notify peer that I had a local IO error and did not detach. */
    if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
        drbd_send_state(mdev, ns);

    /* Disks got bigger while they were detached */
    if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
        test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
        if (ns.conn == C_CONNECTED)
            resync_after_online_grow(mdev);
    }

    /* A resync finished or aborted, wake paused devices... */
    if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
        (os.peer_isp && !ns.peer_isp) ||
        (os.user_isp && !ns.user_isp))
        resume_next_sg(mdev);

    /* sync target done with resync. Explicitly notify peer, even though
     * it should (at least for non-empty resyncs) already know it itself. */
    if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
        drbd_send_state(mdev, ns);

    /* Wake up role changes that were delayed because of connection establishing */
    if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
        clear_bit(STATE_SENT, &mdev->flags);
        wake_up(&mdev->state_wait);
    }

    /* This triggers bitmap writeout of potentially still unwritten pages
     * if the resync finished cleanly, or aborted because of peer disk
     * failure, or because of connection loss.
     * For resync aborted because of local disk failure, we cannot do
     * any bitmap writeout anymore.
     * No harm done if some bits change during this phase.
     */
    if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
        drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
            "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
        put_ldev(mdev);
    }

    /* free tl_hash if we Got thawed and are C_STANDALONE */
    if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
        drbd_free_tl_hash(mdev);

    /* Upon network connection, we need to start the receiver */
    if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
        drbd_thread_start(&mdev->receiver);

    /* Terminate worker thread if we are unconfigured - it will be
       restarted as needed... */
    if (ns.disk == D_DISKLESS &&
        ns.conn == C_STANDALONE &&
        ns.role == R_SECONDARY) {
        if (os.aftr_isp != ns.aftr_isp)
            resume_next_sg(mdev);
        /* set in __drbd_set_state, unless CONFIG_PENDING was set */
        if (test_bit(DEVICE_DYING, &mdev->flags))
            drbd_thread_stop_nowait(&mdev->worker);
    }

    drbd_md_sync(mdev);
}
static int drbd_thread_setup(void *arg)
{
    struct drbd_thread *thi = (struct drbd_thread *) arg;
    struct drbd_conf *mdev = thi->mdev;
    unsigned long flags;
    int retval;

restart:
    retval = thi->function(thi);

    spin_lock_irqsave(&thi->t_lock, flags);

    /* if the receiver has been "Exiting", the last thing it did
     * was set the conn state to "StandAlone",
     * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
     * and receiver thread will be "started".
     * drbd_thread_start needs to set "Restarting" in that case.
     * t_state check and assignment needs to be within the same spinlock,
     * so either thread_start sees Exiting, and can remap to Restarting,
     * or thread_start sees None, and can proceed as normal.
     */

    if (thi->t_state == Restarting) {
        dev_info(DEV, "Restarting %s\n", current->comm);
        thi->t_state = Running;
        spin_unlock_irqrestore(&thi->t_lock, flags);
        goto restart;
    }

    thi->task = NULL;
    thi->t_state = None;
    smp_mb();
    complete(&thi->stop);
    spin_unlock_irqrestore(&thi->t_lock, flags);

    dev_info(DEV, "Terminating %s\n", current->comm);

    /* Release mod reference taken when thread was started */
    module_put(THIS_MODULE);
    return retval;
}
static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
                 int (*func) (struct drbd_thread *))
{
    spin_lock_init(&thi->t_lock);
    thi->task = NULL;
    thi->t_state = None;
    thi->function = func;
    thi->mdev = mdev;
}
int drbd_thread_start(struct drbd_thread *thi)
{
    struct drbd_conf *mdev = thi->mdev;
    struct task_struct *nt;
    unsigned long flags;

    const char *me =
        thi == &mdev->receiver ? "receiver" :
        thi == &mdev->asender  ? "asender"  :
        thi == &mdev->worker   ? "worker"   : "NONSENSE";

    /* is used from state engine doing drbd_thread_stop_nowait,
     * while holding the req lock irqsave */
    spin_lock_irqsave(&thi->t_lock, flags);

    switch (thi->t_state) {
    case None:
        dev_info(DEV, "Starting %s thread (from %s [%d])\n",
                me, current->comm, current->pid);

        /* Get ref on module for thread - this is released when thread exits */
        if (!try_module_get(THIS_MODULE)) {
            dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
            spin_unlock_irqrestore(&thi->t_lock, flags);
            return false;
        }

        init_completion(&thi->stop);
        D_ASSERT(thi->task == NULL);
        thi->reset_cpu_mask = 1;
        thi->t_state = Running;
        spin_unlock_irqrestore(&thi->t_lock, flags);
        flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */

        nt = kthread_create(drbd_thread_setup, (void *) thi,
                    "drbd%d_%s", mdev_to_minor(mdev), me);

        if (IS_ERR(nt)) {
            dev_err(DEV, "Couldn't start thread\n");

            module_put(THIS_MODULE);
            return false;
        }
        spin_lock_irqsave(&thi->t_lock, flags);
        thi->task = nt;
        thi->t_state = Running;
        spin_unlock_irqrestore(&thi->t_lock, flags);
        wake_up_process(nt);
        break;
    case Exiting:
        thi->t_state = Restarting;
        dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
                me, current->comm, current->pid);
        /* fall through */
    case Running:
    case Restarting:
    default:
        spin_unlock_irqrestore(&thi->t_lock, flags);
        break;
    }

    return true;
}
void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
{
    unsigned long flags;

    enum drbd_thread_state ns = restart ? Restarting : Exiting;

    /* may be called from state engine, holding the req lock irqsave */
    spin_lock_irqsave(&thi->t_lock, flags);

    if (thi->t_state == None) {
        spin_unlock_irqrestore(&thi->t_lock, flags);
        if (restart)
            drbd_thread_start(thi);
        return;
    }

    if (thi->t_state != ns) {
        if (thi->task == NULL) {
            spin_unlock_irqrestore(&thi->t_lock, flags);
            return;
        }

        thi->t_state = ns;
        smp_mb();
        init_completion(&thi->stop);
        if (thi->task != current)
            force_sig(DRBD_SIGKILL, thi->task);
    }

    spin_unlock_irqrestore(&thi->t_lock, flags);

    if (wait)
        wait_for_completion(&thi->stop);
}
/**
 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
 * @mdev:	DRBD device.
 *
 * Forces all threads of a device onto the same CPU. This is beneficial for
 * DRBD's performance. May be overridden by the user's configuration.
 */
void drbd_calc_cpu_mask(struct drbd_conf *mdev)
{
    int ord, cpu;

    /* user override. */
    if (cpumask_weight(mdev->cpu_mask))
        return;

    ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
    for_each_online_cpu(cpu) {
        if (ord-- == 0) {
            cpumask_set_cpu(cpu, mdev->cpu_mask);
            return;
        }
    }
    /* should not be reached */
    cpumask_setall(mdev->cpu_mask);
}
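
/* Example (illustrative): on a 4-CPU box, minor 6 yields ord = 6 % 4 = 2, so
 * all three threads of that device are pinned to the third online CPU. */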
/**
 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
 * @mdev:	DRBD device.
 *
 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
 * prematurely.
 */
void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
{
    struct task_struct *p = current;
    struct drbd_thread *thi =
        p == mdev->asender.task  ? &mdev->asender  :
        p == mdev->receiver.task ? &mdev->receiver :
        p == mdev->worker.task   ? &mdev->worker   :
        NULL;
    ERR_IF(thi == NULL)
        return;
    if (!thi->reset_cpu_mask)
        return;
    thi->reset_cpu_mask = 0;
    set_cpus_allowed_ptr(p, mdev->cpu_mask);
}
/* the appropriate socket mutex must be held already */
int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
          enum drbd_packets cmd, struct p_header80 *h,
          size_t size, unsigned msg_flags)
{
    int sent, ok;

    ERR_IF(!h) return false;
    ERR_IF(!size) return false;

    h->magic   = BE_DRBD_MAGIC;
    h->command = cpu_to_be16(cmd);
    h->length  = cpu_to_be16(size-sizeof(struct p_header80));

    sent = drbd_send(mdev, sock, h, size, msg_flags);

    ok = (sent == size);
    if (!ok && !signal_pending(current))
        dev_warn(DEV, "short sent %s size=%d sent=%d\n",
            cmdname(cmd), (int)size, sent);
    return ok;
}
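
/* Wire-format note, derived from the header setup above: every packet starts
 * with a p_header80 carrying the magic, the 16-bit command and the 16-bit
 * payload length in big-endian, where "length" excludes the header itself. */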
/* don't pass the socket. we may only look at it
 * when we hold the appropriate socket mutex.
 */
int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
          enum drbd_packets cmd, struct p_header80 *h, size_t size)
{
    int ok = 0;
    struct socket *sock;

    if (use_data_socket) {
        mutex_lock(&mdev->data.mutex);
        sock = mdev->data.socket;
    } else {
        mutex_lock(&mdev->meta.mutex);
        sock = mdev->meta.socket;
    }

    /* drbd_disconnect() could have called drbd_free_sock()
     * while we were waiting in down()... */
    if (likely(sock != NULL))
        ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);

    if (use_data_socket)
        mutex_unlock(&mdev->data.mutex);
    else
        mutex_unlock(&mdev->meta.mutex);
    return ok;
}
int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
           size_t size)
{
    struct p_header80 h;
    int ok;

    h.magic   = BE_DRBD_MAGIC;
    h.command = cpu_to_be16(cmd);
    h.length  = cpu_to_be16(size);

    if (!drbd_get_data_sock(mdev))
        return 0;

    ok = (sizeof(h) ==
        drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
    ok = ok && (size ==
        drbd_send(mdev, mdev->data.socket, data, size, 0));

    drbd_put_data_sock(mdev);

    return ok;
}
2039 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
2041 struct p_rs_param_95 *p;
2042 struct socket *sock;
2044 const int apv = mdev->agreed_pro_version;
2046 size = apv <= 87 ? sizeof(struct p_rs_param)
2047 : apv == 88 ? sizeof(struct p_rs_param)
2048 + strlen(mdev->sync_conf.verify_alg) + 1
2049 : apv <= 94 ? sizeof(struct p_rs_param_89)
2050 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
2052 /* used from admin command context and receiver/worker context.
2053 * to avoid kmalloc, grab the socket right here,
2054 * then use the pre-allocated sbuf there */
2055 mutex_lock(&mdev->data.mutex);
2056 sock = mdev->data.socket;
2058 if (likely(sock != NULL)) {
2059 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
2061 p = &mdev->data.sbuf.rs_param_95;
2063 /* initialize verify_alg and csums_alg */
2064 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2066 p->rate = cpu_to_be32(sc->rate);
2067 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
2068 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
2069 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
2070 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
2073 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
2075 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
2077 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
2079 rv = 0; /* not ok */
2081 mutex_unlock(&mdev->data.mutex);
2086 int drbd_send_protocol(struct drbd_conf *mdev)
2088 struct p_protocol *p;
2091 size = sizeof(struct p_protocol);
2093 if (mdev->agreed_pro_version >= 87)
2094 size += strlen(mdev->net_conf->integrity_alg) + 1;
2096 /* we must not recurse into our own queue,
2097 * as that is blocked during handshake */
2098 p = kmalloc(size, GFP_NOIO);
2102 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2103 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2104 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2105 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
2106 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2109 if (mdev->net_conf->want_lose)
2111 if (mdev->net_conf->dry_run) {
2112 if (mdev->agreed_pro_version >= 92)
2115 			dev_err(DEV, "--dry-run is not supported by peer\n");
2120 p->conn_flags = cpu_to_be32(cf);
2122 if (mdev->agreed_pro_version >= 87)
2123 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2125 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
2126 (struct p_header80 *)p, size);
2131 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2136 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2139 for (i = UI_CURRENT; i < UI_SIZE; i++)
2140 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2142 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2143 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2144 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2145 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2146 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2147 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2151 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
2152 (struct p_header80 *)&p, sizeof(p));
2155 int drbd_send_uuids(struct drbd_conf *mdev)
2157 return _drbd_send_uuids(mdev, 0);
2160 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2162 return _drbd_send_uuids(mdev, 8);
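/* uuid_flags bits, as assembled in _drbd_send_uuids() above:
 *   1 -- this node wants to discard its data (net_conf->want_lose)
 *   2 -- this node was a crashed primary (CRASHED_PRIMARY flag)
 *   4 -- the disk is D_INCONSISTENT while negotiating
 *   8 -- skip the initial sync (drbd_send_uuids_skip_initial_sync())
 */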
2165 void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2167 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2168 u64 *uuid = mdev->ldev->md.uuid;
2169 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2171 (unsigned long long)uuid[UI_CURRENT],
2172 (unsigned long long)uuid[UI_BITMAP],
2173 (unsigned long long)uuid[UI_HISTORY_START],
2174 (unsigned long long)uuid[UI_HISTORY_END]);
2177 dev_info(DEV, "%s effective data uuid: %016llX\n",
2179 (unsigned long long)mdev->ed_uuid);
2183 int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
2188 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2190 uuid = mdev->ldev->md.uuid[UI_BITMAP];
2191 if (uuid && uuid != UUID_JUST_CREATED)
2192 uuid = uuid + UUID_NEW_BM_OFFSET;
2194 get_random_bytes(&uuid, sizeof(u64));
2195 drbd_uuid_set(mdev, UI_BITMAP, uuid);
2196 drbd_print_uuids(mdev, "updated sync UUID");
2198 p.uuid = cpu_to_be64(uuid);
2200 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
2201 (struct p_header80 *)&p, sizeof(p));
2204 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
2207 sector_t d_size, u_size;
2209 unsigned int max_bio_size;
2212 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2213 D_ASSERT(mdev->ldev->backing_bdev);
2214 d_size = drbd_get_max_capacity(mdev->ldev);
2215 u_size = mdev->ldev->dc.disk_size;
2216 q_order_type = drbd_queue_order_type(mdev);
2217 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2218 max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
2223 q_order_type = QUEUE_ORDERED_NONE;
2224 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
2227 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2228 if (mdev->agreed_pro_version <= 94)
2229 max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
2231 p.d_size = cpu_to_be64(d_size);
2232 p.u_size = cpu_to_be64(u_size);
2233 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
2234 p.max_bio_size = cpu_to_be32(max_bio_size);
2235 p.queue_order_type = cpu_to_be16(q_order_type);
2236 p.dds_flags = cpu_to_be16(flags);
2238 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
2239 (struct p_header80 *)&p, sizeof(p));
2244 * drbd_send_current_state() - Sends the drbd state to the peer
2245 * @mdev: DRBD device.
2247 int drbd_send_current_state(struct drbd_conf *mdev)
2249 struct socket *sock;
2253 	/* Grab state lock so we won't send state if we're in the middle
2254 	 * of a cluster-wide state change on another thread */
2255 drbd_state_lock(mdev);
2257 mutex_lock(&mdev->data.mutex);
2259 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2260 sock = mdev->data.socket;
2262 if (likely(sock != NULL)) {
2263 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2264 (struct p_header80 *)&p, sizeof(p), 0);
2267 mutex_unlock(&mdev->data.mutex);
2269 drbd_state_unlock(mdev);
2274 * drbd_send_state() - After a state change, sends the new state to the peer
2275 * @mdev: DRBD device.
2276 * @state: the state to send, not necessarily the current state.
2278 * Each state change queues an "after_state_ch" work, which will eventually
2279 * send the resulting new state to the peer. If more state changes happen
2280 * between queuing and processing of the after_state_ch work, we still
2281  * want to send each intermediate state in the order it occurred.
2283 int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
2285 struct socket *sock;
2289 mutex_lock(&mdev->data.mutex);
2291 p.state = cpu_to_be32(state.i);
2292 sock = mdev->data.socket;
2294 if (likely(sock != NULL)) {
2295 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2296 (struct p_header80 *)&p, sizeof(p), 0);
2299 mutex_unlock(&mdev->data.mutex);
2304 int drbd_send_state_req(struct drbd_conf *mdev,
2305 union drbd_state mask, union drbd_state val)
2307 struct p_req_state p;
2309 p.mask = cpu_to_be32(mask.i);
2310 p.val = cpu_to_be32(val.i);
2312 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2313 (struct p_header80 *)&p, sizeof(p));
2316 int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
2318 struct p_req_state_reply p;
2320 p.retcode = cpu_to_be32(retcode);
2322 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2323 (struct p_header80 *)&p, sizeof(p));
2326 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2327 struct p_compressed_bm *p,
2328 struct bm_xfer_ctx *c)
2330 struct bitstream bs;
2331 unsigned long plain_bits;
2338 /* may we use this feature? */
2339 if ((mdev->sync_conf.use_rle == 0) ||
2340 (mdev->agreed_pro_version < 90))
2343 if (c->bit_offset >= c->bm_bits)
2344 return 0; /* nothing to do. */
2346 	/* use at most this many bytes */
2347 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2348 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2349 /* plain bits covered in this code string */
2352 /* p->encoding & 0x80 stores whether the first run length is set.
2353 * bit offset is implicit.
2354 * start with toggle == 2 to be able to tell the first iteration */
2357 	/* see how many plain bits we can stuff into one packet
2358 	 * using RLE and VLI. */
2360 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2361 : _drbd_bm_find_next(mdev, c->bit_offset);
2364 rl = tmp - c->bit_offset;
2366 if (toggle == 2) { /* first iteration */
2368 /* the first checked bit was set,
2369 * store start value, */
2370 DCBP_set_start(p, 1);
2371 /* but skip encoding of zero run length */
2375 DCBP_set_start(p, 0);
2378 /* paranoia: catch zero runlength.
2379 * can only happen if bitmap is modified while we scan it. */
2381 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2382 "t:%u bo:%lu\n", toggle, c->bit_offset);
2386 bits = vli_encode_bits(&bs, rl);
2387 if (bits == -ENOBUFS) /* buffer full */
2390 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2396 c->bit_offset = tmp;
2397 } while (c->bit_offset < c->bm_bits);
2399 len = bs.cur.b - p->code + !!bs.cur.bit;
2401 if (plain_bits < (len << 3)) {
2402 /* incompressible with this method.
2403 * we need to rewind both word and bit position. */
2404 c->bit_offset -= plain_bits;
2405 bm_xfer_ctx_bit_to_word_offset(c);
2406 c->bit_offset = c->word_offset * BITS_PER_LONG;
2410 /* RLE + VLI was able to compress it just fine.
2411 * update c->word_offset. */
2412 bm_xfer_ctx_bit_to_word_offset(c);
2414 /* store pad_bits */
2415 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
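/* Illustrative example of the encoding above: for bitmap bits 0111001...
 * the first checked bit is clear, so DCBP_set_start(p, 0), and the
 * alternating run lengths 1 (clear), 3 (set), 2 (clear), 1 (set), ...
 * are VLI-encoded back to back; the receiver reconstructs the bits from
 * the start value plus the run lengths alone. */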
2421 * send_bitmap_rle_or_plain
2423 * Return 0 when done, 1 when another iteration is needed, and a negative error
2424 * code upon failure.
2427 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2428 struct p_header80 *h, struct bm_xfer_ctx *c)
2430 struct p_compressed_bm *p = (void*)h;
2431 unsigned long num_words;
2435 len = fill_bitmap_rle_bits(mdev, p, c);
2441 DCBP_set_code(p, RLE_VLI_Bits);
2442 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2443 sizeof(*p) + len, 0);
2446 c->bytes[0] += sizeof(*p) + len;
2448 if (c->bit_offset >= c->bm_bits)
2451 /* was not compressible.
2452 * send a buffer full of plain text bits instead. */
2453 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2454 len = num_words * sizeof(long);
2456 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2457 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2458 h, sizeof(struct p_header80) + len, 0);
2459 c->word_offset += num_words;
2460 c->bit_offset = c->word_offset * BITS_PER_LONG;
2463 c->bytes[1] += sizeof(struct p_header80) + len;
2465 if (c->bit_offset > c->bm_bits)
2466 c->bit_offset = c->bm_bits;
2470 INFO_bm_xfer_stats(mdev, "send", c);
2478 /* See the comment at receive_bitmap() */
2479 int _drbd_send_bitmap(struct drbd_conf *mdev)
2481 struct bm_xfer_ctx c;
2482 struct p_header80 *p;
2485 ERR_IF(!mdev->bitmap) return false;
2487 /* maybe we should use some per thread scratch page,
2488 * and allocate that during initial device creation? */
2489 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2491 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2495 if (get_ldev(mdev)) {
2496 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2497 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2498 drbd_bm_set_all(mdev);
2499 if (drbd_bm_write(mdev)) {
2500 			/* write_bm did fail! Leave the full sync flag set in the meta data,
2501 * but otherwise process as per normal - need to tell other
2502 * side that a full resync is required! */
2503 dev_err(DEV, "Failed to write bitmap to disk!\n");
2505 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2512 c = (struct bm_xfer_ctx) {
2513 .bm_bits = drbd_bm_bits(mdev),
2514 .bm_words = drbd_bm_words(mdev),
2518 err = send_bitmap_rle_or_plain(mdev, p, &c);
2521 free_page((unsigned long) p);
2525 int drbd_send_bitmap(struct drbd_conf *mdev)
2529 if (!drbd_get_data_sock(mdev))
2531 err = !_drbd_send_bitmap(mdev);
2532 drbd_put_data_sock(mdev);
2536 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2539 struct p_barrier_ack p;
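	/* Note: barrier_nr is passed through as it arrived from the peer,
	 * still in big-endian wire format, hence no cpu_to_be32() here --
	 * contrast with set_size below. */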
2541 p.barrier = barrier_nr;
2542 p.set_size = cpu_to_be32(set_size);
2544 if (mdev->state.conn < C_CONNECTED)
2546 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2547 (struct p_header80 *)&p, sizeof(p));
2552 * _drbd_send_ack() - Sends an ack packet
2553 * @mdev: DRBD device.
2554 * @cmd: Packet command code.
2555 * @sector: sector, needs to be in big endian byte order
2556 * @blksize: size in byte, needs to be in big endian byte order
2557 * @block_id: Id, big endian byte order
2559 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2565 struct p_block_ack p;
2568 p.block_id = block_id;
2569 p.blksize = blksize;
2570 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2572 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2574 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2575 (struct p_header80 *)&p, sizeof(p));
2579 /* dp->sector and dp->block_id already/still in network byte order,
2580 * data_size is payload size according to dp->head,
2581 * and may need to be corrected for digest size. */
2582 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2583 struct p_data *dp, int data_size)
2585 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2586 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2587 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2591 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2592 struct p_block_req *rp)
2594 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2598 * drbd_send_ack() - Sends an ack packet
2599 * @mdev: DRBD device.
2600 * @cmd: Packet command code.
2603 int drbd_send_ack(struct drbd_conf *mdev,
2604 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2606 return _drbd_send_ack(mdev, cmd,
2607 cpu_to_be64(e->sector),
2608 cpu_to_be32(e->size),
2612 /* This function misuses the block_id field to signal if the blocks
2613  * are in sync or not. */
2614 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2615 sector_t sector, int blksize, u64 block_id)
2617 return _drbd_send_ack(mdev, cmd,
2618 cpu_to_be64(sector),
2619 cpu_to_be32(blksize),
2620 cpu_to_be64(block_id));
2623 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2624 sector_t sector, int size, u64 block_id)
2627 struct p_block_req p;
2629 p.sector = cpu_to_be64(sector);
2630 p.block_id = block_id;
2631 p.blksize = cpu_to_be32(size);
2633 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2634 (struct p_header80 *)&p, sizeof(p));
2638 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2639 sector_t sector, int size,
2640 void *digest, int digest_size,
2641 enum drbd_packets cmd)
2644 struct p_block_req p;
2646 p.sector = cpu_to_be64(sector);
2647 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2648 p.blksize = cpu_to_be32(size);
2650 p.head.magic = BE_DRBD_MAGIC;
2651 p.head.command = cpu_to_be16(cmd);
2652 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2654 mutex_lock(&mdev->data.mutex);
2656 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2657 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2659 mutex_unlock(&mdev->data.mutex);
2664 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2667 struct p_block_req p;
2669 p.sector = cpu_to_be64(sector);
2670 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2671 p.blksize = cpu_to_be32(size);
2673 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2674 (struct p_header80 *)&p, sizeof(p));
2678 /* called on sndtimeo
2679 * returns false if we should retry,
2680  * true if we think the connection is dead
2682 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2685 /* long elapsed = (long)(jiffies - mdev->last_received); */
2687 drop_it = mdev->meta.socket == sock
2688 || !mdev->asender.task
2689 || get_t_state(&mdev->asender) != Running
2690 || mdev->state.conn < C_CONNECTED;
2695 drop_it = !--mdev->ko_count;
2697 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2698 current->comm, current->pid, mdev->ko_count);
2702 	return drop_it; /* && (mdev->state == R_PRIMARY) */
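/* Note on ko_count: it is re-armed from net_conf->ko_count at the start of
 * every drbd_send() on the data socket (see below), so the countdown above
 * only tracks consecutive sendmsg timeouts within a single send. */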
2705 /* The idea of sendpage seems to be to put some kind of reference
2706 * to the page into the skb, and to hand it over to the NIC. In
2707 * this process get_page() gets called.
2709 * As soon as the page was really sent over the network put_page()
2710 * gets called by some part of the network layer. [ NIC driver? ]
2712 * [ get_page() / put_page() increment/decrement the count. If count
2713 * reaches 0 the page will be freed. ]
2715 * This works nicely with pages from FSs.
2716 * But this means that in protocol A we might signal IO completion too early!
2718 * In order not to corrupt data during a resync we must make sure
2719  * that we do not reuse our own buffer pages (EEs) too early; therefore
2720 * we have the net_ee list.
2722  * XFS still seems to have problems: it submits pages with page_count == 0!
2723 * As a workaround, we disable sendpage on pages
2724 * with page_count == 0 or PageSlab.
2726 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2727 int offset, size_t size, unsigned msg_flags)
2729 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2732 mdev->send_cnt += size>>9;
2733 return sent == size;
2736 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2737 int offset, size_t size, unsigned msg_flags)
2739 mm_segment_t oldfs = get_fs();
2743 /* e.g. XFS meta- & log-data is in slab pages, which have a
2744 * page_count of 0 and/or have PageSlab() set.
2745 * we cannot use send_page for those, as that does get_page();
2746 * put_page(); and would cause either a VM_BUG directly, or
2747 * __page_cache_release a page that would actually still be referenced
2748 * by someone, leading to some obscure delayed Oops somewhere else. */
2749 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2750 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2752 msg_flags |= MSG_NOSIGNAL;
2753 drbd_update_congested(mdev);
2756 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2759 if (sent == -EAGAIN) {
2760 if (we_should_drop_the_connection(mdev,
2767 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2768 __func__, (int)size, len, sent);
2773 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2775 clear_bit(NET_CONGESTED, &mdev->flags);
2779 mdev->send_cnt += size>>9;
2783 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2785 struct bio_vec *bvec;
2787 /* hint all but last page with MSG_MORE */
2788 bio_for_each_segment(bvec, bio, i) {
2789 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2790 bvec->bv_offset, bvec->bv_len,
2791 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2797 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2799 struct bio_vec *bvec;
2801 /* hint all but last page with MSG_MORE */
2802 bio_for_each_segment(bvec, bio, i) {
2803 if (!_drbd_send_page(mdev, bvec->bv_page,
2804 bvec->bv_offset, bvec->bv_len,
2805 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2811 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2813 struct page *page = e->pages;
2814 unsigned len = e->size;
2815 /* hint all but last page with MSG_MORE */
2816 page_chain_for_each(page) {
2817 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2818 if (!_drbd_send_page(mdev, page, 0, l,
2819 page_chain_next(page) ? MSG_MORE : 0))
2826 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2828 if (mdev->agreed_pro_version >= 95)
2829 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2830 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2831 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2832 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2834 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
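/* The receiving side inverts this mapping before it builds the local bio.
 * A minimal sketch, mirroring only the flags handled above; the real
 * helper lives in drbd_receiver.c and may differ in detail: */
static inline unsigned long wire_flags_to_bio_sketch(u32 dpf)
{
	return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
	       (dpf & DP_FUA ? REQ_FUA : 0) |
	       (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
	       (dpf & DP_DISCARD ? REQ_DISCARD : 0);
}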
2837 /* Used to send write requests
2838 * R_PRIMARY -> Peer (P_DATA)
2840 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2844 unsigned int dp_flags = 0;
2848 if (!drbd_get_data_sock(mdev))
2851 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2852 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2854 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2855 p.head.h80.magic = BE_DRBD_MAGIC;
2856 p.head.h80.command = cpu_to_be16(P_DATA);
2858 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2860 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2861 p.head.h95.command = cpu_to_be16(P_DATA);
2863 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2866 p.sector = cpu_to_be64(req->sector);
2867 p.block_id = (unsigned long)req;
2868 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2870 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2872 if (mdev->state.conn >= C_SYNC_SOURCE &&
2873 mdev->state.conn <= C_PAUSED_SYNC_T)
2874 dp_flags |= DP_MAY_SET_IN_SYNC;
2876 p.dp_flags = cpu_to_be32(dp_flags);
2877 set_bit(UNPLUG_REMOTE, &mdev->flags);
2879 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2881 dgb = mdev->int_dig_out;
2882 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2883 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2886 /* For protocol A, we have to memcpy the payload into
2887 * socket buffers, as we may complete right away
2888 * as soon as we handed it over to tcp, at which point the data
2889 * pages may become invalid.
2891 * For data-integrity enabled, we copy it as well, so we can be
2892 * sure that even if the bio pages may still be modified, it
2893 * won't change the data on the wire, thus if the digest checks
2894 	 * out ok after sending on this side, but does not match on the
2895 	 * receiving side, we know the corruption happened elsewhere.
2897 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2898 ok = _drbd_send_bio(mdev, req->master_bio);
2900 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2902 /* double check digest, sometimes buffers have been modified in flight. */
2903 if (dgs > 0 && dgs <= 64) {
2904 		/* 64 bytes (512 bits) is the largest digest size
2905 		 * currently supported by the kernel crypto API. */
2906 unsigned char digest[64];
2907 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2908 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2910 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2911 (unsigned long long)req->sector, req->size);
2913 } /* else if (dgs > 64) {
2914 ... Be noisy about digest too large ...
2918 drbd_put_data_sock(mdev);
2923 /* answer packet, used to send data back for read requests:
2924 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2925 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2927 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2928 struct drbd_epoch_entry *e)
2935 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2936 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2938 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2939 p.head.h80.magic = BE_DRBD_MAGIC;
2940 p.head.h80.command = cpu_to_be16(cmd);
2942 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2944 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2945 p.head.h95.command = cpu_to_be16(cmd);
2947 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2950 p.sector = cpu_to_be64(e->sector);
2951 p.block_id = e->block_id;
2952 /* p.seq_num = 0; No sequence numbers here.. */
2954 /* Only called by our kernel thread.
2955 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2956 * in response to admin command or module unload.
2958 if (!drbd_get_data_sock(mdev))
2961 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2963 dgb = mdev->int_dig_out;
2964 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2965 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2968 ok = _drbd_send_zc_ee(mdev, e);
2970 drbd_put_data_sock(mdev);
2975 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2977 struct p_block_desc p;
2979 p.sector = cpu_to_be64(req->sector);
2980 p.blksize = cpu_to_be32(req->size);
2982 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2986 drbd_send distinguishes two cases:
2988 Packets sent via the data socket "sock"
2989 and packets sent via the meta data socket "msock"
2992   -----------------+-- data socket "sock" ---+-- meta socket "msock" ------
2993   timeout           conf.timeout / 2          conf.timeout / 2
2994   timeout action    send a ping via msock     Abort communication
2995                                               and close all sockets
2999 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
3001 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
3002 void *buf, size_t size, unsigned msg_flags)
3011 /* THINK if (signal_pending) return ... ? */
3016 msg.msg_name = NULL;
3017 msg.msg_namelen = 0;
3018 msg.msg_control = NULL;
3019 msg.msg_controllen = 0;
3020 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
3022 if (sock == mdev->data.socket) {
3023 mdev->ko_count = mdev->net_conf->ko_count;
3024 drbd_update_congested(mdev);
3028 * tcp_sendmsg does _not_ use its size parameter at all ?
3030 * -EAGAIN on timeout, -EINTR on signal.
3033 * do we need to block DRBD_SIG if sock == &meta.socket ??
3034 * otherwise wake_asender() might interrupt some send_*Ack !
3036 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
3037 if (rv == -EAGAIN) {
3038 if (we_should_drop_the_connection(mdev, sock))
3045 flush_signals(current);
3053 } while (sent < size);
3055 if (sock == mdev->data.socket)
3056 clear_bit(NET_CONGESTED, &mdev->flags);
3059 if (rv != -EAGAIN) {
3060 dev_err(DEV, "%s_sendmsg returned %d\n",
3061 sock == mdev->meta.socket ? "msock" : "sock",
3063 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
3065 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
3071 static int drbd_open(struct block_device *bdev, fmode_t mode)
3073 struct drbd_conf *mdev = bdev->bd_disk->private_data;
3074 unsigned long flags;
3077 mutex_lock(&drbd_main_mutex);
3078 spin_lock_irqsave(&mdev->req_lock, flags);
3079 /* to have a stable mdev->state.role
3080 * and no race with updating open_cnt */
3082 if (mdev->state.role != R_PRIMARY) {
3083 if (mode & FMODE_WRITE)
3085 else if (!allow_oos)
3091 spin_unlock_irqrestore(&mdev->req_lock, flags);
3092 mutex_unlock(&drbd_main_mutex);
3097 static int drbd_release(struct gendisk *gd, fmode_t mode)
3099 struct drbd_conf *mdev = gd->private_data;
3100 mutex_lock(&drbd_main_mutex);
3102 mutex_unlock(&drbd_main_mutex);
3106 static void drbd_set_defaults(struct drbd_conf *mdev)
3108 	/* This way we get a compile error when sync_conf grows
3109 	   and we forget to initialize it here */
3110 mdev->sync_conf = (struct syncer_conf) {
3111 /* .rate = */ DRBD_RATE_DEF,
3112 /* .after = */ DRBD_AFTER_DEF,
3113 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
3114 /* .verify_alg = */ {}, 0,
3115 /* .cpu_mask = */ {}, 0,
3116 /* .csums_alg = */ {}, 0,
3118 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
3119 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
3120 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
3121 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
3122 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3123 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
3126 	/* Have to do it this way, because the bitfield layout differs between
3127 	   big-endian and little-endian machines */
3128 mdev->state = (union drbd_state) {
3129 { .role = R_SECONDARY,
3131 .conn = C_STANDALONE,
3140 void drbd_init_set_defaults(struct drbd_conf *mdev)
3142 /* the memset(,0,) did most of this.
3143 * note: only assignments, no allocation in here */
3145 drbd_set_defaults(mdev);
3147 atomic_set(&mdev->ap_bio_cnt, 0);
3148 atomic_set(&mdev->ap_pending_cnt, 0);
3149 atomic_set(&mdev->rs_pending_cnt, 0);
3150 atomic_set(&mdev->unacked_cnt, 0);
3151 atomic_set(&mdev->local_cnt, 0);
3152 atomic_set(&mdev->net_cnt, 0);
3153 atomic_set(&mdev->packet_seq, 0);
3154 atomic_set(&mdev->pp_in_use, 0);
3155 atomic_set(&mdev->pp_in_use_by_net, 0);
3156 atomic_set(&mdev->rs_sect_in, 0);
3157 atomic_set(&mdev->rs_sect_ev, 0);
3158 atomic_set(&mdev->ap_in_flight, 0);
3159 atomic_set(&mdev->md_io_in_use, 0);
3161 mutex_init(&mdev->data.mutex);
3162 mutex_init(&mdev->meta.mutex);
3163 sema_init(&mdev->data.work.s, 0);
3164 sema_init(&mdev->meta.work.s, 0);
3165 mutex_init(&mdev->state_mutex);
3167 spin_lock_init(&mdev->data.work.q_lock);
3168 spin_lock_init(&mdev->meta.work.q_lock);
3170 spin_lock_init(&mdev->al_lock);
3171 spin_lock_init(&mdev->req_lock);
3172 spin_lock_init(&mdev->peer_seq_lock);
3173 spin_lock_init(&mdev->epoch_lock);
3175 INIT_LIST_HEAD(&mdev->active_ee);
3176 INIT_LIST_HEAD(&mdev->sync_ee);
3177 INIT_LIST_HEAD(&mdev->done_ee);
3178 INIT_LIST_HEAD(&mdev->read_ee);
3179 INIT_LIST_HEAD(&mdev->net_ee);
3180 INIT_LIST_HEAD(&mdev->resync_reads);
3181 INIT_LIST_HEAD(&mdev->data.work.q);
3182 INIT_LIST_HEAD(&mdev->meta.work.q);
3183 INIT_LIST_HEAD(&mdev->resync_work.list);
3184 INIT_LIST_HEAD(&mdev->unplug_work.list);
3185 INIT_LIST_HEAD(&mdev->go_diskless.list);
3186 INIT_LIST_HEAD(&mdev->md_sync_work.list);
3187 INIT_LIST_HEAD(&mdev->start_resync_work.list);
3188 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
3190 mdev->resync_work.cb = w_resync_timer;
3191 mdev->unplug_work.cb = w_send_write_hint;
3192 mdev->go_diskless.cb = w_go_diskless;
3193 mdev->md_sync_work.cb = w_md_sync;
3194 mdev->bm_io_work.w.cb = w_bitmap_io;
3195 mdev->start_resync_work.cb = w_start_resync;
3196 init_timer(&mdev->resync_timer);
3197 init_timer(&mdev->md_sync_timer);
3198 init_timer(&mdev->start_resync_timer);
3199 init_timer(&mdev->request_timer);
3200 mdev->resync_timer.function = resync_timer_fn;
3201 mdev->resync_timer.data = (unsigned long) mdev;
3202 mdev->md_sync_timer.function = md_sync_timer_fn;
3203 mdev->md_sync_timer.data = (unsigned long) mdev;
3204 mdev->start_resync_timer.function = start_resync_timer_fn;
3205 mdev->start_resync_timer.data = (unsigned long) mdev;
3206 mdev->request_timer.function = request_timer_fn;
3207 mdev->request_timer.data = (unsigned long) mdev;
3209 init_waitqueue_head(&mdev->misc_wait);
3210 init_waitqueue_head(&mdev->state_wait);
3211 init_waitqueue_head(&mdev->net_cnt_wait);
3212 init_waitqueue_head(&mdev->ee_wait);
3213 init_waitqueue_head(&mdev->al_wait);
3214 init_waitqueue_head(&mdev->seq_wait);
3216 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3217 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3218 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3220 mdev->agreed_pro_version = PRO_VERSION_MAX;
3221 mdev->write_ordering = WO_bdev_flush;
3222 mdev->resync_wenr = LC_FREE;
3223 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3224 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3227 void drbd_mdev_cleanup(struct drbd_conf *mdev)
3230 if (mdev->receiver.t_state != None)
3231 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3232 mdev->receiver.t_state);
3234 /* no need to lock it, I'm the only thread alive */
3235 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3236 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3246 mdev->rs_failed = 0;
3247 mdev->rs_last_events = 0;
3248 mdev->rs_last_sect_ev = 0;
3249 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3250 mdev->rs_mark_left[i] = 0;
3251 mdev->rs_mark_time[i] = 0;
3253 D_ASSERT(mdev->net_conf == NULL);
3255 drbd_set_my_capacity(mdev, 0);
3257 /* maybe never allocated. */
3258 drbd_bm_resize(mdev, 0, 1);
3259 drbd_bm_cleanup(mdev);
3262 drbd_free_resources(mdev);
3263 clear_bit(AL_SUSPENDED, &mdev->flags);
3266 * currently we drbd_init_ee only on module load, so
3267 * we may do drbd_release_ee only on module unload!
3269 D_ASSERT(list_empty(&mdev->active_ee));
3270 D_ASSERT(list_empty(&mdev->sync_ee));
3271 D_ASSERT(list_empty(&mdev->done_ee));
3272 D_ASSERT(list_empty(&mdev->read_ee));
3273 D_ASSERT(list_empty(&mdev->net_ee));
3274 D_ASSERT(list_empty(&mdev->resync_reads));
3275 D_ASSERT(list_empty(&mdev->data.work.q));
3276 D_ASSERT(list_empty(&mdev->meta.work.q));
3277 D_ASSERT(list_empty(&mdev->resync_work.list));
3278 D_ASSERT(list_empty(&mdev->unplug_work.list));
3279 D_ASSERT(list_empty(&mdev->go_diskless.list));
3281 drbd_set_defaults(mdev);
3285 static void drbd_destroy_mempools(void)
3289 while (drbd_pp_pool) {
3290 page = drbd_pp_pool;
3291 drbd_pp_pool = (struct page *)page_private(page);
3296 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3298 if (drbd_md_io_bio_set)
3299 bioset_free(drbd_md_io_bio_set);
3300 if (drbd_md_io_page_pool)
3301 mempool_destroy(drbd_md_io_page_pool);
3302 if (drbd_ee_mempool)
3303 mempool_destroy(drbd_ee_mempool);
3304 if (drbd_request_mempool)
3305 mempool_destroy(drbd_request_mempool);
3307 kmem_cache_destroy(drbd_ee_cache);
3308 if (drbd_request_cache)
3309 kmem_cache_destroy(drbd_request_cache);
3310 if (drbd_bm_ext_cache)
3311 kmem_cache_destroy(drbd_bm_ext_cache);
3312 if (drbd_al_ext_cache)
3313 kmem_cache_destroy(drbd_al_ext_cache);
3315 drbd_md_io_bio_set = NULL;
3316 drbd_md_io_page_pool = NULL;
3317 drbd_ee_mempool = NULL;
3318 drbd_request_mempool = NULL;
3319 drbd_ee_cache = NULL;
3320 drbd_request_cache = NULL;
3321 drbd_bm_ext_cache = NULL;
3322 drbd_al_ext_cache = NULL;
3327 static int drbd_create_mempools(void)
3330 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3333 /* prepare our caches and mempools */
3334 drbd_request_mempool = NULL;
3335 drbd_ee_cache = NULL;
3336 drbd_request_cache = NULL;
3337 drbd_bm_ext_cache = NULL;
3338 drbd_al_ext_cache = NULL;
3339 drbd_pp_pool = NULL;
3340 drbd_md_io_page_pool = NULL;
3341 drbd_md_io_bio_set = NULL;
3344 drbd_request_cache = kmem_cache_create(
3345 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3346 if (drbd_request_cache == NULL)
3349 drbd_ee_cache = kmem_cache_create(
3350 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3351 if (drbd_ee_cache == NULL)
3354 drbd_bm_ext_cache = kmem_cache_create(
3355 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3356 if (drbd_bm_ext_cache == NULL)
3359 drbd_al_ext_cache = kmem_cache_create(
3360 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3361 if (drbd_al_ext_cache == NULL)
3365 #ifdef COMPAT_HAVE_BIOSET_CREATE
3366 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
3367 if (drbd_md_io_bio_set == NULL)
3371 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
3372 if (drbd_md_io_page_pool == NULL)
3375 drbd_request_mempool = mempool_create(number,
3376 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3377 if (drbd_request_mempool == NULL)
3380 drbd_ee_mempool = mempool_create(number,
3381 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3382 if (drbd_ee_mempool == NULL)
3385 /* drbd's page pool */
3386 spin_lock_init(&drbd_pp_lock);
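	/* The pool is kept as a LIFO singly linked list threaded through
	 * each page's ->private field (see set_page_private() below and
	 * the matching walk in drbd_destroy_mempools() above). */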
3388 for (i = 0; i < number; i++) {
3389 page = alloc_page(GFP_HIGHUSER);
3392 set_page_private(page, (unsigned long)drbd_pp_pool);
3393 drbd_pp_pool = page;
3395 drbd_pp_vacant = number;
3400 drbd_destroy_mempools(); /* in case we allocated some */
3404 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3407 /* just so we have it. you never know what interesting things we
3408 * might want to do here some day...
3414 static struct notifier_block drbd_notifier = {
3415 .notifier_call = drbd_notify_sys,
3418 static void drbd_release_ee_lists(struct drbd_conf *mdev)
3422 rr = drbd_release_ee(mdev, &mdev->active_ee);
3424 dev_err(DEV, "%d EEs in active list found!\n", rr);
3426 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3428 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3430 rr = drbd_release_ee(mdev, &mdev->read_ee);
3432 dev_err(DEV, "%d EEs in read list found!\n", rr);
3434 rr = drbd_release_ee(mdev, &mdev->done_ee);
3436 dev_err(DEV, "%d EEs in done list found!\n", rr);
3438 rr = drbd_release_ee(mdev, &mdev->net_ee);
3440 dev_err(DEV, "%d EEs in net list found!\n", rr);
3443 /* caution. no locking.
3444 * currently only used from module cleanup code. */
3445 static void drbd_delete_device(unsigned int minor)
3447 struct drbd_conf *mdev = minor_to_mdev(minor);
3452 del_timer_sync(&mdev->request_timer);
3454 /* paranoia asserts */
3455 if (mdev->open_cnt != 0)
3456 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3457 __FILE__ , __LINE__);
3459 ERR_IF (!list_empty(&mdev->data.work.q)) {
3460 struct list_head *lp;
3461 list_for_each(lp, &mdev->data.work.q) {
3462 dev_err(DEV, "lp = %p\n", lp);
3465 /* end paranoia asserts */
3467 del_gendisk(mdev->vdisk);
3469 /* cleanup stuff that may have been allocated during
3470 * device (re-)configuration or state changes */
3472 if (mdev->this_bdev)
3473 bdput(mdev->this_bdev);
3475 drbd_free_resources(mdev);
3477 drbd_release_ee_lists(mdev);
3479 /* should be freed on disconnect? */
3480 kfree(mdev->ee_hash);
3482 mdev->ee_hash_s = 0;
3483 mdev->ee_hash = NULL;
3486 lc_destroy(mdev->act_log);
3487 lc_destroy(mdev->resync);
3489 kfree(mdev->p_uuid);
3490 /* mdev->p_uuid = NULL; */
3492 kfree(mdev->int_dig_out);
3493 kfree(mdev->int_dig_in);
3494 kfree(mdev->int_dig_vv);
3496 /* cleanup the rest that has been
3497 * allocated from drbd_new_device
3498 * and actually free the mdev itself */
3499 drbd_free_mdev(mdev);
3502 static void drbd_cleanup(void)
3506 unregister_reboot_notifier(&drbd_notifier);
3508 /* first remove proc,
3509 	 * drbdsetup uses its presence to detect
3510 	 * whether DRBD is loaded.
3511 	 * If we got stuck in proc removal,
3512 * but have netlink already deregistered,
3513 * some drbdsetup commands may wait forever
3517 remove_proc_entry("drbd", NULL);
3524 drbd_delete_device(i);
3525 drbd_destroy_mempools();
3530 unregister_blkdev(DRBD_MAJOR, "drbd");
3532 printk(KERN_INFO "drbd: module cleanup done.\n");
3536 * drbd_congested() - Callback for the flusher thread
3537 * @congested_data: User data
3538 * @bdi_bits: Bits the BDI flusher thread is currently interested in
3540 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3542 static int drbd_congested(void *congested_data, int bdi_bits)
3544 struct drbd_conf *mdev = congested_data;
3545 struct request_queue *q;
3549 if (!may_inc_ap_bio(mdev)) {
3550 /* DRBD has frozen IO */
3556 if (test_bit(CALLBACK_PENDING, &mdev->flags)) {
3557 r |= (1 << BDI_async_congested);
3558 /* Without good local data, we would need to read from remote,
3559 * and that would need the worker thread as well, which is
3560 		 * currently blocked waiting for that usermode helper to finish. */
3563 if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
3564 r |= (1 << BDI_sync_congested);
3572 if (get_ldev(mdev)) {
3573 q = bdev_get_queue(mdev->ldev->backing_bdev);
3574 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3580 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3581 r |= (1 << BDI_async_congested);
3582 reason = reason == 'b' ? 'a' : 'n';
3586 mdev->congestion_reason = reason;
3590 struct drbd_conf *drbd_new_device(unsigned int minor)
3592 struct drbd_conf *mdev;
3593 struct gendisk *disk;
3594 struct request_queue *q;
3596 /* GFP_KERNEL, we are outside of all write-out paths */
3597 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3600 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3601 goto out_no_cpumask;
3603 mdev->minor = minor;
3605 drbd_init_set_defaults(mdev);
3607 q = blk_alloc_queue(GFP_KERNEL);
3611 q->queuedata = mdev;
3613 disk = alloc_disk(1);
3618 set_disk_ro(disk, true);
3621 disk->major = DRBD_MAJOR;
3622 disk->first_minor = minor;
3623 disk->fops = &drbd_ops;
3624 sprintf(disk->disk_name, "drbd%d", minor);
3625 disk->private_data = mdev;
3627 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3628 /* we have no partitions. we contain only ourselves. */
3629 mdev->this_bdev->bd_contains = mdev->this_bdev;
3631 q->backing_dev_info.congested_fn = drbd_congested;
3632 q->backing_dev_info.congested_data = mdev;
3634 blk_queue_make_request(q, drbd_make_request);
3635 blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
3636 	/* Setting the max_hw_sectors to an odd value of 8 KiB here
3637 	   triggers a max_bio_size message upon first attach or connect. */
3638 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
3639 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3640 blk_queue_merge_bvec(q, drbd_merge_bvec);
3641 q->queue_lock = &mdev->req_lock;
3643 mdev->md_io_page = alloc_page(GFP_KERNEL);
3644 if (!mdev->md_io_page)
3645 goto out_no_io_page;
3647 if (drbd_bm_init(mdev))
3649 /* no need to lock access, we are still initializing this minor device. */
3653 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3654 if (!mdev->app_reads_hash)
3655 goto out_no_app_reads;
3657 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3658 if (!mdev->current_epoch)
3661 INIT_LIST_HEAD(&mdev->current_epoch->list);
3666 /* out_whatever_else:
3667 kfree(mdev->current_epoch); */
3669 kfree(mdev->app_reads_hash);
3673 drbd_bm_cleanup(mdev);
3675 __free_page(mdev->md_io_page);
3679 blk_cleanup_queue(q);
3681 free_cpumask_var(mdev->cpu_mask);
3687 /* counterpart of drbd_new_device.
3688 * last part of drbd_delete_device. */
3689 void drbd_free_mdev(struct drbd_conf *mdev)
3691 kfree(mdev->current_epoch);
3692 kfree(mdev->app_reads_hash);
3694 if (mdev->bitmap) /* should no longer be there. */
3695 drbd_bm_cleanup(mdev);
3696 __free_page(mdev->md_io_page);
3697 put_disk(mdev->vdisk);
3698 blk_cleanup_queue(mdev->rq_queue);
3699 free_cpumask_var(mdev->cpu_mask);
3700 drbd_free_tl_hash(mdev);
3705 int __init drbd_init(void)
3709 if (sizeof(struct p_handshake) != 80) {
3711 "drbd: never change the size or layout "
3712 "of the HandShake packet.\n");
3716 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
3718 "drbd: invalid minor_count (%d)\n", minor_count);
3726 err = drbd_nl_init();
3730 err = register_blkdev(DRBD_MAJOR, "drbd");
3733 "drbd: unable to register block device major %d\n",
3738 register_reboot_notifier(&drbd_notifier);
3741 * allocate all necessary structs
3745 init_waitqueue_head(&drbd_pp_wait);
3747 drbd_proc = NULL; /* play safe for drbd_cleanup */
3748 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3753 err = drbd_create_mempools();
3757 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3759 printk(KERN_ERR "drbd: unable to register proc file\n");
3763 rwlock_init(&global_state_lock);
3765 printk(KERN_INFO "drbd: initialized. "
3766 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3767 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3768 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3769 printk(KERN_INFO "drbd: registered as block device major %d\n",
3771 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3773 return 0; /* Success! */
3778 /* currently always the case */
3779 printk(KERN_ERR "drbd: ran out of memory\n");
3781 printk(KERN_ERR "drbd: initialization failure\n");
3785 void drbd_free_bc(struct drbd_backing_dev *ldev)
3790 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3791 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3796 void drbd_free_sock(struct drbd_conf *mdev)
3798 if (mdev->data.socket) {
3799 mutex_lock(&mdev->data.mutex);
3800 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3801 sock_release(mdev->data.socket);
3802 mdev->data.socket = NULL;
3803 mutex_unlock(&mdev->data.mutex);
3805 if (mdev->meta.socket) {
3806 mutex_lock(&mdev->meta.mutex);
3807 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3808 sock_release(mdev->meta.socket);
3809 mdev->meta.socket = NULL;
3810 mutex_unlock(&mdev->meta.mutex);
3815 void drbd_free_resources(struct drbd_conf *mdev)
3817 crypto_free_hash(mdev->csums_tfm);
3818 mdev->csums_tfm = NULL;
3819 crypto_free_hash(mdev->verify_tfm);
3820 mdev->verify_tfm = NULL;
3821 crypto_free_hash(mdev->cram_hmac_tfm);
3822 mdev->cram_hmac_tfm = NULL;
3823 crypto_free_hash(mdev->integrity_w_tfm);
3824 mdev->integrity_w_tfm = NULL;
3825 crypto_free_hash(mdev->integrity_r_tfm);
3826 mdev->integrity_r_tfm = NULL;
3828 drbd_free_sock(mdev);
3831 drbd_free_bc(mdev->ldev);
3832 		mdev->ldev = NULL;
3835 /* meta data management */
3837 struct meta_data_on_disk {
3838 u64 la_size; /* last agreed size. */
3839 u64 uuid[UI_SIZE]; /* UUIDs. */
3842 u32 flags; /* MDF */
3845 u32 al_offset; /* offset to this block */
3846 u32 al_nr_extents; /* important for restoring the AL */
3847 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3848 u32 bm_offset; /* offset to the bitmap, from here */
3849 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3850 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3851 u32 reserved_u32[3];
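	/* All multi-byte fields are stored big-endian on disk; drbd_md_sync()
	 * and drbd_md_read() below do the cpu_to_be*() / be*_to_cpu()
	 * conversions. */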
3856 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3857 * @mdev: DRBD device.
3859 void drbd_md_sync(struct drbd_conf *mdev)
3861 struct meta_data_on_disk *buffer;
3865 del_timer(&mdev->md_sync_timer);
3866 /* timer may be rearmed by drbd_md_mark_dirty() now. */
3867 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3870 /* We use here D_FAILED and not D_ATTACHING because we try to write
3871 * metadata even if we detach due to a disk failure! */
3872 if (!get_ldev_if_state(mdev, D_FAILED))
3875 buffer = drbd_md_get_buffer(mdev);
3879 memset(buffer, 0, 512);
3881 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3882 for (i = UI_CURRENT; i < UI_SIZE; i++)
3883 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3884 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3885 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3887 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3888 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3889 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3890 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3891 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3893 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3894 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
3896 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3897 sector = mdev->ldev->md.md_offset;
3899 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3900 /* this was a try anyways ... */
3901 dev_err(DEV, "meta data update failed!\n");
3902 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
3905 /* Update mdev->ldev->md.la_size_sect,
3906 * since we updated it on metadata. */
3907 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3909 drbd_md_put_buffer(mdev);
3915 * drbd_md_read() - Reads in the meta data super block
3916 * @mdev: DRBD device.
3917 * @bdev: Device from which the meta data should be read in.
3919 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3920 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3922 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3924 struct meta_data_on_disk *buffer;
3925 int i, rv = NO_ERROR;
3927 if (!get_ldev_if_state(mdev, D_ATTACHING))
3928 return ERR_IO_MD_DISK;
3930 buffer = drbd_md_get_buffer(mdev);
3934 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3935 /* NOTE: can't do normal error processing here as this is
3936 called BEFORE disk is attached */
3937 dev_err(DEV, "Error while reading metadata.\n");
3938 rv = ERR_IO_MD_DISK;
3942 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3943 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3944 rv = ERR_MD_INVALID;
3947 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3948 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3949 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3950 rv = ERR_MD_INVALID;
3953 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3954 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3955 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3956 rv = ERR_MD_INVALID;
3959 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3960 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3961 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3962 rv = ERR_MD_INVALID;
3966 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3967 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3968 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3969 rv = ERR_MD_INVALID;
3973 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3974 for (i = UI_CURRENT; i < UI_SIZE; i++)
3975 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3976 bdev->md.flags = be32_to_cpu(buffer->flags);
3977 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3978 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3980 spin_lock_irq(&mdev->req_lock);
3981 if (mdev->state.conn < C_CONNECTED) {
3983 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3984 peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
3985 mdev->peer_max_bio_size = peer;
3987 spin_unlock_irq(&mdev->req_lock);
3989 if (mdev->sync_conf.al_extents < 7)
3990 mdev->sync_conf.al_extents = 127;
3993 drbd_md_put_buffer(mdev);
4001 * drbd_md_mark_dirty() - Mark meta data super block as dirty
4002 * @mdev: DRBD device.
4004 * Call this function if you change anything that should be written to
4005 * the meta-data super block. This function sets MD_DIRTY, and starts a
4006  * timer that ensures that drbd_md_sync() gets called within five seconds.
4009 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
4011 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
4012 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
4013 mdev->last_md_mark_dirty.line = line;
4014 mdev->last_md_mark_dirty.func = func;
4018 void drbd_md_mark_dirty(struct drbd_conf *mdev)
4020 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
4021 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
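/* Note the asymmetry between the two variants: the debug variant above
 * flushes after one second and records the last dirtying call site, while
 * the regular one waits up to five seconds before the worker runs
 * drbd_md_sync(). */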
4025 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
4029 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
4030 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
4033 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4035 if (idx == UI_CURRENT) {
4036 if (mdev->state.role == R_PRIMARY)
4041 drbd_set_ed_uuid(mdev, val);
4044 mdev->ldev->md.uuid[idx] = val;
4045 drbd_md_mark_dirty(mdev);
4049 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4051 if (mdev->ldev->md.uuid[idx]) {
4052 drbd_uuid_move_history(mdev);
4053 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
4055 _drbd_uuid_set(mdev, idx, val);
4059 * drbd_uuid_new_current() - Creates a new current UUID
4060 * @mdev: DRBD device.
4062 * Creates a new current UUID, and rotates the old current UUID into
4063 * the bitmap slot. Causes an incremental resync upon next connect.
4065 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
4068 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4071 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
4073 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
4075 get_random_bytes(&val, sizeof(u64));
4076 _drbd_uuid_set(mdev, UI_CURRENT, val);
4077 drbd_print_uuids(mdev, "new current UUID");
4078 /* get it to stable storage _now_ */
4082 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
4084 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
4088 drbd_uuid_move_history(mdev);
4089 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
4090 mdev->ldev->md.uuid[UI_BITMAP] = 0;
4092 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4094 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
4096 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
4098 drbd_md_mark_dirty(mdev);
4102 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4103 * @mdev: DRBD device.
4105 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
4107 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
4111 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4112 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
4114 drbd_bm_set_all(mdev);
4116 rv = drbd_bm_write(mdev);
4119 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
4130 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4131 * @mdev: DRBD device.
4133 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
4135 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
4139 drbd_resume_al(mdev);
4140 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4141 drbd_bm_clear_all(mdev);
4142 rv = drbd_bm_write(mdev);
4149 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4151 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
4154 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
4156 if (get_ldev(mdev)) {
4157 drbd_bm_lock(mdev, work->why, work->flags);
4158 rv = work->io_fn(mdev);
4159 drbd_bm_unlock(mdev);
4163 clear_bit(BITMAP_IO, &mdev->flags);
4164 smp_mb__after_clear_bit();
4165 wake_up(&mdev->misc_wait);
4168 work->done(mdev, rv);
4170 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
4177 void drbd_ldev_destroy(struct drbd_conf *mdev)
4179 lc_destroy(mdev->resync);
4180 mdev->resync = NULL;
4181 lc_destroy(mdev->act_log);
4182 mdev->act_log = NULL;
4184 drbd_free_bc(mdev->ldev);
4185 		mdev->ldev = NULL;
4187 if (mdev->md_io_tmpp) {
4188 __free_page(mdev->md_io_tmpp);
4189 mdev->md_io_tmpp = NULL;
4191 clear_bit(GO_DISKLESS, &mdev->flags);
4194 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4196 D_ASSERT(mdev->state.disk == D_FAILED);
4197 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4198 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
4199 * the protected members anymore, though, so once put_ldev reaches zero
4200 * again, it will be safe to free them. */
4201 drbd_force_state(mdev, NS(disk, D_DISKLESS));
4205 void drbd_go_diskless(struct drbd_conf *mdev)
4207 D_ASSERT(mdev->state.disk == D_FAILED);
4208 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
4209 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
4213 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4214 * @mdev: DRBD device.
4215 * @io_fn: IO callback to be called when bitmap IO is possible
4216 * @done: callback to be called after the bitmap IO was performed
4217 * @why: Descriptive text of the reason for doing the IO
4219  * While IO on the bitmap is in progress, we freeze application IO, thus ensuring
4220  * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4221 * called from worker context. It MUST NOT be used while a previous such
4222 * work is still pending!
4224 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4225 int (*io_fn)(struct drbd_conf *),
4226 void (*done)(struct drbd_conf *, int),
4227 char *why, enum bm_flag flags)
4229 D_ASSERT(current == mdev->worker.task);
4231 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4232 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4233 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4234 if (mdev->bm_io_work.why)
4235 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4236 why, mdev->bm_io_work.why);
4238 mdev->bm_io_work.io_fn = io_fn;
4239 mdev->bm_io_work.done = done;
4240 mdev->bm_io_work.why = why;
4241 mdev->bm_io_work.flags = flags;
4243 spin_lock_irq(&mdev->req_lock);
4244 set_bit(BITMAP_IO, &mdev->flags);
4245 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
4246 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
4247 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
4249 spin_unlock_irq(&mdev->req_lock);
4253 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4254 * @mdev: DRBD device.
4255 * @io_fn: IO callback to be called when bitmap IO is possible
4256 * @why: Descriptive text of the reason for doing the IO
4258  * Freezes application IO while the actual IO operation runs. This
4259  * function MAY NOT be called from worker context.
4261 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4262 char *why, enum bm_flag flags)
4266 D_ASSERT(current != mdev->worker.task);
4268 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4269 drbd_suspend_io(mdev);
4271 drbd_bm_lock(mdev, why, flags);
4273 drbd_bm_unlock(mdev);
4275 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4276 drbd_resume_io(mdev);
4281 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4283 if ((mdev->ldev->md.flags & flag) != flag) {
4284 drbd_md_mark_dirty(mdev);
4285 mdev->ldev->md.flags |= flag;
4289 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4291 if ((mdev->ldev->md.flags & flag) != 0) {
4292 drbd_md_mark_dirty(mdev);
4293 mdev->ldev->md.flags &= ~flag;
4296 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4298 return (bdev->md.flags & flag) != 0;
4301 static void md_sync_timer_fn(unsigned long data)
4303 struct drbd_conf *mdev = (struct drbd_conf *) data;
4305 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4308 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4310 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
4312 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4313 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4319 #ifdef CONFIG_DRBD_FAULT_INJECTION
4320 /* Fault insertion support including random number generator shamelessly
4321 * stolen from kernel/rcutorture.c */
4322 struct fault_random_state {
4323 unsigned long state;
4324 unsigned long count;
4327 #define FAULT_RANDOM_MULT 39916801 /* prime */
4328 #define FAULT_RANDOM_ADD 479001701 /* prime */
4329 #define FAULT_RANDOM_REFRESH 10000
4332 * Crude but fast random-number generator. Uses a linear congruential
4333 * generator, with occasional help from get_random_bytes().
4335 static unsigned long
4336 _drbd_fault_random(struct fault_random_state *rsp)
4340 if (!rsp->count--) {
4341 get_random_bytes(&refresh, sizeof(refresh));
4342 rsp->state += refresh;
4343 rsp->count = FAULT_RANDOM_REFRESH;
4345 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4346 return swahw32(rsp->state);
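/* The halfword swap above moves the LCG's well-mixed high-order bits into
 * the low half, which matters because callers reduce the result modulo a
 * small number.  A minimal usage sketch (compare _drbd_insert_fault()
 * below); "rsp" is the caller's private generator state: */
static inline int drbd_fault_hit_sketch(struct fault_random_state *rsp,
					unsigned int rate)
{
	/* roughly a "rate" percent chance of injecting a fault */
	return ((_drbd_fault_random(rsp) % 100) + 1) <= rate;
}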
4350 _drbd_fault_str(unsigned int type) {
4351 static char *_faults[] = {
4352 [DRBD_FAULT_MD_WR] = "Meta-data write",
4353 [DRBD_FAULT_MD_RD] = "Meta-data read",
4354 [DRBD_FAULT_RS_WR] = "Resync write",
4355 [DRBD_FAULT_RS_RD] = "Resync read",
4356 [DRBD_FAULT_DT_WR] = "Data write",
4357 [DRBD_FAULT_DT_RD] = "Data read",
4358 [DRBD_FAULT_DT_RA] = "Data read ahead",
4359 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
4360 [DRBD_FAULT_AL_EE] = "EE allocation",
4361 [DRBD_FAULT_RECEIVE] = "receive data corruption",
4364 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4368 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4370 static struct fault_random_state rrs = {0, 0};
4372 unsigned int ret = (
4374 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4375 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4380 if (__ratelimit(&drbd_ratelimit_state))
4381 dev_warn(DEV, "***Simulating %s failure\n",
4382 _drbd_fault_str(type));
4389 const char *drbd_buildtag(void)
4391 	/* When DRBD is built from external sources, this holds a reference
4392 	   to the git hash of the source code. */
4394 static char buildtag[38] = "\0uilt-in";
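	/* The leading NUL doubles as an "uninitialized" marker: the first
	 * call either fills in the srcversion, or patches byte 0 so the
	 * string reads "built-in". */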
4396 if (buildtag[0] == 0) {
4398 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4407 module_init(drbd_init)
4408 module_exit(drbd_cleanup)
4410 EXPORT_SYMBOL(drbd_conn_str);
4411 EXPORT_SYMBOL(drbd_role_str);
4412 EXPORT_SYMBOL(drbd_disk_str);
4413 EXPORT_SYMBOL(drbd_set_st_err_str);