fs/xfs/xfs_log.c
1 /*
2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_types.h"
21 #include "xfs_bit.h"
22 #include "xfs_log.h"
23 #include "xfs_inum.h"
24 #include "xfs_trans.h"
25 #include "xfs_sb.h"
26 #include "xfs_ag.h"
27 #include "xfs_mount.h"
28 #include "xfs_error.h"
29 #include "xfs_log_priv.h"
30 #include "xfs_buf_item.h"
31 #include "xfs_bmap_btree.h"
32 #include "xfs_alloc_btree.h"
33 #include "xfs_ialloc_btree.h"
34 #include "xfs_log_recover.h"
35 #include "xfs_trans_priv.h"
36 #include "xfs_dinode.h"
37 #include "xfs_inode.h"
38 #include "xfs_rw.h"
39 #include "xfs_trace.h"
40
41 kmem_zone_t     *xfs_log_ticket_zone;
42
43 /* Local miscellaneous function prototypes */
44 STATIC int       xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
45                                     xlog_in_core_t **, xfs_lsn_t *);
46 STATIC xlog_t *  xlog_alloc_log(xfs_mount_t     *mp,
47                                 xfs_buftarg_t   *log_target,
48                                 xfs_daddr_t     blk_offset,
49                                 int             num_bblks);
50 STATIC int       xlog_space_left(struct log *log, atomic64_t *head);
51 STATIC int       xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52 STATIC void      xlog_dealloc_log(xlog_t *log);
53
54 /* local state machine functions */
55 STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
56 STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog);
57 STATIC int  xlog_state_get_iclog_space(xlog_t           *log,
58                                        int              len,
59                                        xlog_in_core_t   **iclog,
60                                        xlog_ticket_t    *ticket,
61                                        int              *continued_write,
62                                        int              *logoffsetp);
63 STATIC int  xlog_state_release_iclog(xlog_t             *log,
64                                      xlog_in_core_t     *iclog);
65 STATIC void xlog_state_switch_iclogs(xlog_t             *log,
66                                      xlog_in_core_t *iclog,
67                                      int                eventual_size);
68 STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
69
70 STATIC void xlog_grant_push_ail(struct log      *log,
71                                 int             need_bytes);
72 STATIC void xlog_regrant_reserve_log_space(xlog_t        *log,
73                                            xlog_ticket_t *ticket);
74 STATIC void xlog_ungrant_log_space(xlog_t        *log,
75                                    xlog_ticket_t *ticket);
76
77 #if defined(DEBUG)
78 STATIC void     xlog_verify_dest_ptr(xlog_t *log, char *ptr);
79 STATIC void     xlog_verify_grant_tail(struct log *log);
80 STATIC void     xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
81                                   int count, boolean_t syncing);
82 STATIC void     xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
83                                      xfs_lsn_t tail_lsn);
84 #else
85 #define xlog_verify_dest_ptr(a,b)
86 #define xlog_verify_grant_tail(a)
87 #define xlog_verify_iclog(a,b,c,d)
88 #define xlog_verify_tail_lsn(a,b,c)
89 #endif
90
91 STATIC int      xlog_iclogs_empty(xlog_t *log);
92
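/*
 * Grant head manipulation.
 *
 * Each grant head packs a { cycle, space } pair into a single 64-bit value so
 * it can be updated locklessly with atomic64_cmpxchg(); the
 * xlog_crack_grant_head_val()/xlog_assign_grant_head_val() helpers (see
 * xfs_log_priv.h) split and rebuild that value.  "space" is a byte offset
 * into the log and "cycle" counts how many times the head has wrapped the
 * physical log.
 *
 * Worked example (hypothetical numbers): with l_logsize = 1000 bytes and a
 * head of { cycle = 3, space = 200 }, subtracting 300 bytes underflows the
 * space, so we add back l_logsize and drop the cycle, giving
 * { cycle = 2, space = 900 }.  Adding 900 bytes to that head overflows the
 * 100 bytes left in the cycle, giving { cycle = 3, space = 800 }.
 */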
93 static void
94 xlog_grant_sub_space(
95         struct log      *log,
96         atomic64_t      *head,
97         int             bytes)
98 {
99         int64_t head_val = atomic64_read(head);
100         int64_t new, old;
101
102         do {
103                 int     cycle, space;
104
105                 xlog_crack_grant_head_val(head_val, &cycle, &space);
106
107                 space -= bytes;
108                 if (space < 0) {
109                         space += log->l_logsize;
110                         cycle--;
111                 }
112
113                 old = head_val;
114                 new = xlog_assign_grant_head_val(cycle, space);
115                 head_val = atomic64_cmpxchg(head, old, new);
116         } while (head_val != old);
117 }
118
119 static void
120 xlog_grant_add_space(
121         struct log      *log,
122         atomic64_t      *head,
123         int             bytes)
124 {
125         int64_t head_val = atomic64_read(head);
126         int64_t new, old;
127
128         do {
129                 int             tmp;
130                 int             cycle, space;
131
132                 xlog_crack_grant_head_val(head_val, &cycle, &space);
133
134                 tmp = log->l_logsize - space;
135                 if (tmp > bytes)
136                         space += bytes;
137                 else {
138                         space = bytes - tmp;
139                         cycle++;
140                 }
141
142                 old = head_val;
143                 new = xlog_assign_grant_head_val(cycle, space);
144                 head_val = atomic64_cmpxchg(head, old, new);
145         } while (head_val != old);
146 }
147
148 STATIC void
149 xlog_grant_head_init(
150         struct xlog_grant_head  *head)
151 {
152         xlog_assign_grant_head(&head->grant, 1, 0);
153         INIT_LIST_HEAD(&head->waiters);
154         spin_lock_init(&head->lock);
155 }
156
157 STATIC void
158 xlog_grant_head_wake_all(
159         struct xlog_grant_head  *head)
160 {
161         struct xlog_ticket      *tic;
162
163         spin_lock(&head->lock);
164         list_for_each_entry(tic, &head->waiters, t_queue)
165                 wake_up_process(tic->t_task);
166         spin_unlock(&head->lock);
167 }
168
169 static inline int
170 xlog_ticket_reservation(
171         struct log              *log,
172         struct xlog_grant_head  *head,
173         struct xlog_ticket      *tic)
174 {
175         if (head == &log->l_write_head) {
176                 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
177                 return tic->t_unit_res;
178         } else {
179                 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
180                         return tic->t_unit_res * tic->t_cnt;
181                 else
182                         return tic->t_unit_res;
183         }
184 }
185
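/*
 * Wake queued waiters, in order, while there is enough free grant space to
 * satisfy each of them.  Returns false as soon as a waiter's reservation
 * cannot be met (with *free_bytes already reduced by what the earlier waiters
 * consumed), which tells the caller that it must queue up and sleep behind
 * the existing waiters.
 */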
186 STATIC bool
187 xlog_grant_head_wake(
188         struct log              *log,
189         struct xlog_grant_head  *head,
190         int                     *free_bytes)
191 {
192         struct xlog_ticket      *tic;
193         int                     need_bytes;
194
195         list_for_each_entry(tic, &head->waiters, t_queue) {
196                 need_bytes = xlog_ticket_reservation(log, head, tic);
197                 if (*free_bytes < need_bytes)
198                         return false;
199
200                 *free_bytes -= need_bytes;
201                 trace_xfs_log_grant_wake_up(log, tic);
202                 wake_up_process(tic->t_task);
203         }
204
205         return true;
206 }
207
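/*
 * Sleep on the grant head's wait queue until enough space becomes available
 * or the log is shut down.  Called and returns with head->lock held; the lock
 * is dropped around schedule() so that the wake side can take it.
 */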
208 STATIC int
209 xlog_grant_head_wait(
210         struct log              *log,
211         struct xlog_grant_head  *head,
212         struct xlog_ticket      *tic,
213         int                     need_bytes)
214 {
215         list_add_tail(&tic->t_queue, &head->waiters);
216
217         do {
218                 if (XLOG_FORCED_SHUTDOWN(log))
219                         goto shutdown;
220                 xlog_grant_push_ail(log, need_bytes);
221
222                 __set_current_state(TASK_UNINTERRUPTIBLE);
223                 spin_unlock(&head->lock);
224
225                 XFS_STATS_INC(xs_sleep_logspace);
226
227                 trace_xfs_log_grant_sleep(log, tic);
228                 schedule();
229                 trace_xfs_log_grant_wake(log, tic);
230
231                 spin_lock(&head->lock);
232                 if (XLOG_FORCED_SHUTDOWN(log))
233                         goto shutdown;
234         } while (xlog_space_left(log, &head->grant) < need_bytes);
235
236         list_del_init(&tic->t_queue);
237         return 0;
238 shutdown:
239         list_del_init(&tic->t_queue);
240         return XFS_ERROR(EIO);
241 }
242
243 /*
244  * Atomically get the log space required for a log ticket.
245  *
246  * Once a ticket gets put onto head->waiters, it will only return after the
247  * needed reservation is satisfied.
248  *
249  * This function is structured so that it has a lock free fast path. This is
250  * necessary because every new transaction reservation will come through this
251  * path. Hence any lock will be globally hot if we take it unconditionally on
252  * every pass.
253  *
254  * As tickets are only ever moved on and off head->waiters under head->lock, we
255  * only need to take that lock if we are going to add the ticket to the queue
256  * and sleep. We can avoid taking the lock if the ticket was never added to
257  * head->waiters because the t_queue list head will be empty and we hold the
258  * only reference to it so it can safely be checked unlocked.
259  */
260 STATIC int
261 xlog_grant_head_check(
262         struct log              *log,
263         struct xlog_grant_head  *head,
264         struct xlog_ticket      *tic,
265         int                     *need_bytes)
266 {
267         int                     free_bytes;
268         int                     error = 0;
269
270         ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
271
272         /*
273          * If there are other waiters on the queue then give them a chance at
274          * logspace before us.  Wake up the first waiters, if we do not wake
275          * up all the waiters then go to sleep waiting for more free space,
276          * otherwise try to get some space for this transaction.
277          */
278         *need_bytes = xlog_ticket_reservation(log, head, tic);
279         free_bytes = xlog_space_left(log, &head->grant);
280         if (!list_empty_careful(&head->waiters)) {
281                 spin_lock(&head->lock);
282                 if (!xlog_grant_head_wake(log, head, &free_bytes) ||
283                     free_bytes < *need_bytes) {
284                         error = xlog_grant_head_wait(log, head, tic,
285                                                      *need_bytes);
286                 }
287                 spin_unlock(&head->lock);
288         } else if (free_bytes < *need_bytes) {
289                 spin_lock(&head->lock);
290                 error = xlog_grant_head_wait(log, head, tic, *need_bytes);
291                 spin_unlock(&head->lock);
292         }
293
294         return error;
295 }
296
297 static void
298 xlog_tic_reset_res(xlog_ticket_t *tic)
299 {
300         tic->t_res_num = 0;
301         tic->t_res_arr_sum = 0;
302         tic->t_res_num_ophdrs = 0;
303 }
304
305 static void
306 xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
307 {
308         if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
309                 /* add to overflow and start again */
310                 tic->t_res_o_flow += tic->t_res_arr_sum;
311                 tic->t_res_num = 0;
312                 tic->t_res_arr_sum = 0;
313         }
314
315         tic->t_res_arr[tic->t_res_num].r_len = len;
316         tic->t_res_arr[tic->t_res_num].r_type = type;
317         tic->t_res_arr_sum += len;
318         tic->t_res_num++;
319 }
320
321 /*
322  * Replenish the byte reservation required by moving the grant write head.
323  */
324 int
325 xfs_log_regrant(
326         struct xfs_mount        *mp,
327         struct xlog_ticket      *tic)
328 {
329         struct log              *log = mp->m_log;
330         int                     need_bytes;
331         int                     error = 0;
332
333         if (XLOG_FORCED_SHUTDOWN(log))
334                 return XFS_ERROR(EIO);
335
336         XFS_STATS_INC(xs_try_logspace);
337
338         /*
339          * This is a new transaction on the ticket, so we need to change the
340          * transaction ID so that the next transaction has a different TID in
341          * the log. Just add one to the existing tid so that we can see chains
342          * of rolling transactions in the log easily.
343          */
344         tic->t_tid++;
345
346         xlog_grant_push_ail(log, tic->t_unit_res);
347
348         tic->t_curr_res = tic->t_unit_res;
349         xlog_tic_reset_res(tic);
350
351         if (tic->t_cnt > 0)
352                 return 0;
353
354         trace_xfs_log_regrant(log, tic);
355
356         error = xlog_grant_head_check(log, &log->l_write_head, tic,
357                                       &need_bytes);
358         if (error)
359                 goto out_error;
360
361         xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
362         trace_xfs_log_regrant_exit(log, tic);
363         xlog_verify_grant_tail(log);
364         return 0;
365
366 out_error:
367         /*
368          * If we are failing, make sure the ticket doesn't have any current
369          * reservations.  We don't want to add this back when the ticket/
370          * transaction gets cancelled.
371          */
372         tic->t_curr_res = 0;
373         tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
374         return error;
375 }
376
377 /*
378  * Reserve log space and return a ticket corresponding the reservation.
379  *
380  * Each reservation is going to reserve extra space for a log record header.
381  * When writes happen to the on-disk log, we don't subtract the length of the
382  * log record header from any reservation.  By wasting space in each
383  * reservation, we prevent over allocation problems.
384  */
385 int
386 xfs_log_reserve(
387         struct xfs_mount        *mp,
388         int                     unit_bytes,
389         int                     cnt,
390         struct xlog_ticket      **ticp,
391         __uint8_t               client,
392         bool                    permanent,
393         uint                    t_type)
394 {
395         struct log              *log = mp->m_log;
396         struct xlog_ticket      *tic;
397         int                     need_bytes;
398         int                     error = 0;
399
400         ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
401
402         if (XLOG_FORCED_SHUTDOWN(log))
403                 return XFS_ERROR(EIO);
404
405         XFS_STATS_INC(xs_try_logspace);
406
407         ASSERT(*ticp == NULL);
408         tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
409                                 KM_SLEEP | KM_MAYFAIL);
410         if (!tic)
411                 return XFS_ERROR(ENOMEM);
412
413         tic->t_trans_type = t_type;
414         *ticp = tic;
415
416         xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
417
418         trace_xfs_log_reserve(log, tic);
419
420         error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
421                                       &need_bytes);
422         if (error)
423                 goto out_error;
424
425         xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
426         xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
427         trace_xfs_log_reserve_exit(log, tic);
428         xlog_verify_grant_tail(log);
429         return 0;
430
431 out_error:
432         /*
433          * If we are failing, make sure the ticket doesn't have any current
434          * reservations.  We don't want to add this back when the ticket/
435          * transaction gets cancelled.
436          */
437         tic->t_curr_res = 0;
438         tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
439         return error;
440 }
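/*
 * Illustrative reservation life cycle (a sketch, not a verbatim caller; the
 * real callers live in the transaction code, e.g. xfs_trans.c):
 *
 *	struct xlog_ticket *tic = NULL;
 *
 *	error = xfs_log_reserve(mp, unit_bytes, cnt, &tic,
 *				XFS_TRANSACTION, permanent, t_type);
 *	if (error)
 *		return error;
 *	... log writes consume tic->t_curr_res ...
 *	lsn = xfs_log_done(mp, tic, &iclog, flags);
 *
 * A permanent (rolling) reservation keeps the ticket across commits and
 * replenishes it with xfs_log_regrant() instead of having xfs_log_done()
 * release it.
 */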
441
442
443 /*
444  * NOTES:
445  *
446  *      1. currblock field gets updated at startup and after in-core logs
447  *              are marked with WANT_SYNC.
448  */
449
450 /*
451  * This routine is called when a user of a log manager ticket is done with
452  * the reservation.  If the ticket was ever used, then a commit record for
453  * the associated transaction is written out as a log operation header with
454  * no data.  The flag XLOG_TIC_INITED is set when the first write occurs with
455  * a given ticket.  If the ticket was one with a permanent reservation, then
456  * a few operations are done differently.  Permanent reservation tickets by
457  * default don't release the reservation.  They just commit the current
458  * transaction with the belief that the reservation is still needed.  A flag
459  * must be passed in before permanent reservations are actually released.
460  * When these types of tickets are not released, they need to be set into
461  * the inited state again.  By doing this, a start record will be written
462  * out when the next write occurs.
463  */
464 xfs_lsn_t
465 xfs_log_done(
466         struct xfs_mount        *mp,
467         struct xlog_ticket      *ticket,
468         struct xlog_in_core     **iclog,
469         uint                    flags)
470 {
471         struct log              *log = mp->m_log;
472         xfs_lsn_t               lsn = 0;
473
474         if (XLOG_FORCED_SHUTDOWN(log) ||
475             /*
476              * If nothing was ever written, don't write out commit record.
477              * If we get an error, just continue and give back the log ticket.
478              */
479             (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
480              (xlog_commit_record(log, ticket, iclog, &lsn)))) {
481                 lsn = (xfs_lsn_t) -1;
482                 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
483                         flags |= XFS_LOG_REL_PERM_RESERV;
484                 }
485         }
486
487
488         if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
489             (flags & XFS_LOG_REL_PERM_RESERV)) {
490                 trace_xfs_log_done_nonperm(log, ticket);
491
492                 /*
493                  * Release ticket if not permanent reservation or a specific
494                  * request has been made to release a permanent reservation.
495                  */
496                 xlog_ungrant_log_space(log, ticket);
497                 xfs_log_ticket_put(ticket);
498         } else {
499                 trace_xfs_log_done_perm(log, ticket);
500
501                 xlog_regrant_reserve_log_space(log, ticket);
502                 /* If this ticket was a permanent reservation and we aren't
503                  * trying to release it, reset the inited flags; so next time
504                  * we write, a start record will be written out.
505                  */
506                 ticket->t_flags |= XLOG_TIC_INITED;
507         }
508
509         return lsn;
510 }
511
512 /*
513  * Attaches a new iclog I/O completion callback routine during
514  * transaction commit.  If the log is in error state, a non-zero
515  * return code is handed back and the caller is responsible for
516  * executing the callback at an appropriate time.
517  */
518 int
519 xfs_log_notify(
520         struct xfs_mount        *mp,
521         struct xlog_in_core     *iclog,
522         xfs_log_callback_t      *cb)
523 {
524         int     abortflg;
525
526         spin_lock(&iclog->ic_callback_lock);
527         abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
528         if (!abortflg) {
529                 ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
530                               (iclog->ic_state == XLOG_STATE_WANT_SYNC));
531                 cb->cb_next = NULL;
532                 *(iclog->ic_callback_tail) = cb;
533                 iclog->ic_callback_tail = &(cb->cb_next);
534         }
535         spin_unlock(&iclog->ic_callback_lock);
536         return abortflg;
537 }
538
539 int
540 xfs_log_release_iclog(
541         struct xfs_mount        *mp,
542         struct xlog_in_core     *iclog)
543 {
544         if (xlog_state_release_iclog(mp->m_log, iclog)) {
545                 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
546                 return EIO;
547         }
548
549         return 0;
550 }
551
552 /*
553  * Mount the log of a filesystem
554  *
555  * mp           - ubiquitous xfs mount point structure
556  * log_target   - buftarg of on-disk log device
557  * blk_offset   - Start block # where block size is 512 bytes (BBSIZE)
558  * num_bblocks  - Number of BBSIZE blocks in on-disk log
559  *
560  * Return error or zero.
561  */
562 int
563 xfs_log_mount(
564         xfs_mount_t     *mp,
565         xfs_buftarg_t   *log_target,
566         xfs_daddr_t     blk_offset,
567         int             num_bblks)
568 {
569         int             error;
570
571         if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
572                 xfs_notice(mp, "Mounting Filesystem");
573         else {
574                 xfs_notice(mp,
575 "Mounting filesystem in no-recovery mode.  Filesystem will be inconsistent.");
576                 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
577         }
578
579         mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
580         if (IS_ERR(mp->m_log)) {
581                 error = -PTR_ERR(mp->m_log);
582                 goto out;
583         }
584
585         /*
586          * Initialize the AIL now we have a log.
587          */
588         error = xfs_trans_ail_init(mp);
589         if (error) {
590                 xfs_warn(mp, "AIL initialisation failed: error %d", error);
591                 goto out_free_log;
592         }
593         mp->m_log->l_ailp = mp->m_ail;
594
595         /*
596          * skip log recovery on a norecovery mount.  pretend it all
597          * just worked.
598          */
599         if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
600                 int     readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
601
602                 if (readonly)
603                         mp->m_flags &= ~XFS_MOUNT_RDONLY;
604
605                 error = xlog_recover(mp->m_log);
606
607                 if (readonly)
608                         mp->m_flags |= XFS_MOUNT_RDONLY;
609                 if (error) {
610                         xfs_warn(mp, "log mount/recovery failed: error %d",
611                                 error);
612                         goto out_destroy_ail;
613                 }
614         }
615
616         /* Normal transactions can now occur */
617         mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
618
619         /*
620  * Now the log has been fully initialised and we know where our
621          * space grant counters are, we can initialise the permanent ticket
622          * needed for delayed logging to work.
623          */
624         xlog_cil_init_post_recovery(mp->m_log);
625
626         return 0;
627
628 out_destroy_ail:
629         xfs_trans_ail_destroy(mp);
630 out_free_log:
631         xlog_dealloc_log(mp->m_log);
632 out:
633         return error;
634 }
635
636 /*
637  * Finish the recovery of the file system.  This is separate from
638  * the xfs_log_mount() call, because it depends on the code in
639  * xfs_mountfs() to read in the root and real-time bitmap inodes
640  * between calling xfs_log_mount() and here.
641  *
642  * mp           - ubiquitous xfs mount point structure
643  */
644 int
645 xfs_log_mount_finish(xfs_mount_t *mp)
646 {
647         int     error;
648
649         if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
650                 error = xlog_recover_finish(mp->m_log);
651         else {
652                 error = 0;
653                 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
654         }
655
656         return error;
657 }
658
659 /*
660  * Final log writes as part of unmount.
661  *
662  * Mark the filesystem clean as unmount happens.  Note that during relocation
663  * this routine needs to be executed as part of source-bag while the
664  * deallocation must not be done until source-end.
665  */
666
667 /*
668  * Unmount record used to have a string "Unmount filesystem--" in the
669  * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
670  * We just write the magic number now since that particular field isn't
671  * currently architecture converted and a byteswapped "nUmount" would look odd.
672  * As far as I know, there weren't any dependencies on the old behaviour.
673  */
674
675 int
676 xfs_log_unmount_write(xfs_mount_t *mp)
677 {
678         xlog_t           *log = mp->m_log;
679         xlog_in_core_t   *iclog;
680 #ifdef DEBUG
681         xlog_in_core_t   *first_iclog;
682 #endif
683         xlog_ticket_t   *tic = NULL;
684         xfs_lsn_t        lsn;
685         int              error;
686
687         /*
688          * Don't write out unmount record on read-only mounts.
689          * Or, if we are doing a forced umount (typically because of IO errors).
690          */
691         if (mp->m_flags & XFS_MOUNT_RDONLY)
692                 return 0;
693
694         error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
695         ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
696
697 #ifdef DEBUG
698         first_iclog = iclog = log->l_iclog;
699         do {
700                 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
701                         ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
702                         ASSERT(iclog->ic_offset == 0);
703                 }
704                 iclog = iclog->ic_next;
705         } while (iclog != first_iclog);
706 #endif
707         if (!XLOG_FORCED_SHUTDOWN(log)) {
708                 error = xfs_log_reserve(mp, 600, 1, &tic,
709                                         XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
710                 if (!error) {
711                         /* the data section must be 32 bit size aligned */
712                         struct {
713                             __uint16_t magic;
714                             __uint16_t pad1;
715                             __uint32_t pad2; /* may as well make it 64 bits */
716                         } magic = {
717                                 .magic = XLOG_UNMOUNT_TYPE,
718                         };
719                         struct xfs_log_iovec reg = {
720                                 .i_addr = &magic,
721                                 .i_len = sizeof(magic),
722                                 .i_type = XLOG_REG_TYPE_UNMOUNT,
723                         };
724                         struct xfs_log_vec vec = {
725                                 .lv_niovecs = 1,
726                                 .lv_iovecp = &reg,
727                         };
728
729                         /* remove inited flag, and account for space used */
730                         tic->t_flags = 0;
731                         tic->t_curr_res -= sizeof(magic);
732                         error = xlog_write(log, &vec, tic, &lsn,
733                                            NULL, XLOG_UNMOUNT_TRANS);
734                         /*
735                          * At this point, we're umounting anyway,
736                          * so there's no point in transitioning log state
737                          * to IOERROR. Just continue...
738                          */
739                 }
740
741                 if (error)
742                         xfs_alert(mp, "%s: unmount record failed", __func__);
743
744
745                 spin_lock(&log->l_icloglock);
746                 iclog = log->l_iclog;
747                 atomic_inc(&iclog->ic_refcnt);
748                 xlog_state_want_sync(log, iclog);
749                 spin_unlock(&log->l_icloglock);
750                 error = xlog_state_release_iclog(log, iclog);
751
752                 spin_lock(&log->l_icloglock);
753                 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
754                       iclog->ic_state == XLOG_STATE_DIRTY)) {
755                         if (!XLOG_FORCED_SHUTDOWN(log)) {
756                                 xlog_wait(&iclog->ic_force_wait,
757                                                         &log->l_icloglock);
758                         } else {
759                                 spin_unlock(&log->l_icloglock);
760                         }
761                 } else {
762                         spin_unlock(&log->l_icloglock);
763                 }
764                 if (tic) {
765                         trace_xfs_log_umount_write(log, tic);
766                         xlog_ungrant_log_space(log, tic);
767                         xfs_log_ticket_put(tic);
768                 }
769         } else {
770                 /*
771                  * We're already in forced_shutdown mode, couldn't
772                  * even attempt to write out the unmount transaction.
773                  *
774                  * Go through the motions of sync'ing and releasing
775                  * the iclog, even though no I/O will actually happen,
776                  * we need to wait for other log I/Os that may already
777                  * be in progress.  Do this as a separate section of
778                  * code so we'll know if we ever get stuck here that
779                  * we're in this odd situation of trying to unmount
780                  * a file system that went into forced_shutdown as
781                  * the result of an unmount..
782                  */
783                 spin_lock(&log->l_icloglock);
784                 iclog = log->l_iclog;
785                 atomic_inc(&iclog->ic_refcnt);
786
787                 xlog_state_want_sync(log, iclog);
788                 spin_unlock(&log->l_icloglock);
789                 error =  xlog_state_release_iclog(log, iclog);
790
791                 spin_lock(&log->l_icloglock);
792
793         if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
794               iclog->ic_state == XLOG_STATE_DIRTY ||
795               iclog->ic_state == XLOG_STATE_IOERROR)) {
796
797                                 xlog_wait(&iclog->ic_force_wait,
798                                                         &log->l_icloglock);
799                 } else {
800                         spin_unlock(&log->l_icloglock);
801                 }
802         }
803
804         return error;
805 }       /* xfs_log_unmount_write */
806
807 /*
808  * Deallocate log structures for unmount/relocation.
809  *
810  * We need to stop the aild from running before we destroy
811  * and deallocate the log as the aild references the log.
812  */
813 void
814 xfs_log_unmount(xfs_mount_t *mp)
815 {
816         xfs_trans_ail_destroy(mp);
817         xlog_dealloc_log(mp->m_log);
818 }
819
820 void
821 xfs_log_item_init(
822         struct xfs_mount        *mp,
823         struct xfs_log_item     *item,
824         int                     type,
825         const struct xfs_item_ops *ops)
826 {
827         item->li_mountp = mp;
828         item->li_ailp = mp->m_ail;
829         item->li_type = type;
830         item->li_ops = ops;
831         item->li_lv = NULL;
832
833         INIT_LIST_HEAD(&item->li_ail);
834         INIT_LIST_HEAD(&item->li_cil);
835 }
836
837 /*
838  * Wake up processes waiting for log space after we have moved the log tail.
839  */
840 void
841 xfs_log_space_wake(
842         struct xfs_mount        *mp)
843 {
844         struct log              *log = mp->m_log;
845         int                     free_bytes;
846
847         if (XLOG_FORCED_SHUTDOWN(log))
848                 return;
849
850         if (!list_empty_careful(&log->l_write_head.waiters)) {
851                 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
852
853                 spin_lock(&log->l_write_head.lock);
854                 free_bytes = xlog_space_left(log, &log->l_write_head.grant);
855                 xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
856                 spin_unlock(&log->l_write_head.lock);
857         }
858
859         if (!list_empty_careful(&log->l_reserve_head.waiters)) {
860                 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
861
862                 spin_lock(&log->l_reserve_head.lock);
863                 free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
864                 xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
865                 spin_unlock(&log->l_reserve_head.lock);
866         }
867 }
868
869 /*
870  * Determine if we have a transaction that has gone to disk
871  * that needs to be covered. To begin the transition to the idle state
872  * firstly the log needs to be idle (no AIL and nothing in the iclogs).
873  * If we are then in a state where covering is needed, the caller is informed
874  * that dummy transactions are required to move the log into the idle state.
875  *
876  * Because this is called as part of the sync process, we should also indicate
877  * that dummy transactions should be issued in anything but the covered or
878  * idle states. This ensures that the log tail is accurately reflected in
879  * the log at the end of the sync; hence if a crash occurs, replay of
880  * transactions whose metadata is already on disk is avoided.
881  */
882 int
883 xfs_log_need_covered(xfs_mount_t *mp)
884 {
885         int             needed = 0;
886         xlog_t          *log = mp->m_log;
887
888         if (!xfs_fs_writable(mp))
889                 return 0;
890
891         spin_lock(&log->l_icloglock);
892         switch (log->l_covered_state) {
893         case XLOG_STATE_COVER_DONE:
894         case XLOG_STATE_COVER_DONE2:
895         case XLOG_STATE_COVER_IDLE:
896                 break;
897         case XLOG_STATE_COVER_NEED:
898         case XLOG_STATE_COVER_NEED2:
899                 if (!xfs_ail_min_lsn(log->l_ailp) &&
900                     xlog_iclogs_empty(log)) {
901                         if (log->l_covered_state == XLOG_STATE_COVER_NEED)
902                                 log->l_covered_state = XLOG_STATE_COVER_DONE;
903                         else
904                                 log->l_covered_state = XLOG_STATE_COVER_DONE2;
905                 }
906                 /* FALLTHRU */
907         default:
908                 needed = 1;
909                 break;
910         }
911         spin_unlock(&log->l_icloglock);
912         return needed;
913 }
914
915 /*
916  * We may be holding the log iclog lock upon entering this routine.
917  */
918 xfs_lsn_t
919 xlog_assign_tail_lsn(
920         struct xfs_mount        *mp)
921 {
922         xfs_lsn_t               tail_lsn;
923         struct log              *log = mp->m_log;
924
925         /*
926          * To make sure we always have a valid LSN for the log tail we keep
927          * track of the last LSN which was committed in log->l_last_sync_lsn,
928          * and use that when the AIL was empty and xfs_ail_min_lsn returns 0.
929          *
930          * If the AIL has been emptied we also need to wake any process
931          * waiting for this condition.
932          */
933         tail_lsn = xfs_ail_min_lsn(mp->m_ail);
934         if (!tail_lsn)
935                 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
936         atomic64_set(&log->l_tail_lsn, tail_lsn);
937         return tail_lsn;
938 }
939
940 /*
941  * Return the space in the log between the tail and the head.  The head
942  * is passed in the cycle/bytes formal parms.  In the special case where
943  * the reserve head has wrapped past the tail, this calculation is no
944  * longer valid.  In this case, just return 0 which means there is no space
945  * in the log.  This works for all places where this function is called
946  * with the reserve head.  Of course, if the write head were to ever
947  * wrap the tail, we should blow up.  Rather than catch this case here,
948  * we depend on other ASSERTions in other parts of the code.   XXXmiken
949  *
950  * This code also handles the case where the reservation head is behind
951  * the tail.  The details of this case are described below, but the end
952  * result is that we return the size of the log as the amount of space left.
953  */
954 STATIC int
955 xlog_space_left(
956         struct log      *log,
957         atomic64_t      *head)
958 {
959         int             free_bytes;
960         int             tail_bytes;
961         int             tail_cycle;
962         int             head_cycle;
963         int             head_bytes;
964
965         xlog_crack_grant_head(head, &head_cycle, &head_bytes);
966         xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
967         tail_bytes = BBTOB(tail_bytes);
968         if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
969                 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
970         else if (tail_cycle + 1 < head_cycle)
971                 return 0;
972         else if (tail_cycle < head_cycle) {
973                 ASSERT(tail_cycle == (head_cycle - 1));
974                 free_bytes = tail_bytes - head_bytes;
975         } else {
976                 /*
977                  * The reservation head is behind the tail.
978                  * In this case we just want to return the size of the
979                  * log as the amount of space left.
980                  */
981                 xfs_alert(log->l_mp,
982                         "xlog_space_left: head behind tail\n"
983                         "  tail_cycle = %d, tail_bytes = %d\n"
984                         "  GH   cycle = %d, GH   bytes = %d",
985                         tail_cycle, tail_bytes, head_cycle, head_bytes);
986                 ASSERT(0);
987                 free_bytes = log->l_logsize;
988         }
989         return free_bytes;
990 }
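/*
 * Worked example for xlog_space_left() (hypothetical numbers): with a
 * 1000-byte log, a tail at { cycle 5, byte offset 100 } (after the BBTOB()
 * conversion) and a grant head at { cycle 5, byte offset 700 }, the
 * same-cycle case applies and free_bytes = 1000 - (700 - 100) = 400.  If the
 * head has wrapped once to { cycle 6, byte offset 40 }, the
 * tail_cycle < head_cycle case applies and free_bytes = 100 - 40 = 60.
 */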
991
992
993 /*
994  * Log function which is called when an io completes.
995  *
996  * The log manager needs its own routine, in order to control what
997  * happens with the buffer after the write completes.
998  */
999 void
1000 xlog_iodone(xfs_buf_t *bp)
1001 {
1002         xlog_in_core_t  *iclog = bp->b_fspriv;
1003         xlog_t          *l = iclog->ic_log;
1004         int             aborted = 0;
1005
1006         /*
1007          * Race to shutdown the filesystem if we see an error.
1008          */
1009         if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
1010                         XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
1011                 xfs_buf_ioerror_alert(bp, __func__);
1012                 xfs_buf_stale(bp);
1013                 xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
1014                 /*
1015                  * This flag will be propagated to the trans-committed
1016                  * callback routines to let them know that the log-commit
1017                  * didn't succeed.
1018                  */
1019                 aborted = XFS_LI_ABORTED;
1020         } else if (iclog->ic_state & XLOG_STATE_IOERROR) {
1021                 aborted = XFS_LI_ABORTED;
1022         }
1023
1024         /* log I/O is always issued ASYNC */
1025         ASSERT(XFS_BUF_ISASYNC(bp));
1026         xlog_state_done_syncing(iclog, aborted);
1027         /*
1028          * do not reference the buffer (bp) here as we could race
1029          * with it being freed after writing the unmount record to the
1030          * log.
1031          */
1032
1033 }       /* xlog_iodone */
1034
1035 /*
1036  * Return size of each in-core log record buffer.
1037  *
1038  * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
1039  *
1040  * If the filesystem blocksize is too large, we may need to choose a
1041  * larger size since the directory code currently logs entire blocks.
1042  */
1043
1044 STATIC void
1045 xlog_get_iclog_buffer_size(xfs_mount_t  *mp,
1046                            xlog_t       *log)
1047 {
1048         int size;
1049         int xhdrs;
1050
1051         if (mp->m_logbufs <= 0)
1052                 log->l_iclog_bufs = XLOG_MAX_ICLOGS;
1053         else
1054                 log->l_iclog_bufs = mp->m_logbufs;
1055
1056         /*
1057          * Buffer size passed in from mount system call.
1058          */
1059         if (mp->m_logbsize > 0) {
1060                 size = log->l_iclog_size = mp->m_logbsize;
1061                 log->l_iclog_size_log = 0;
1062                 while (size != 1) {
1063                         log->l_iclog_size_log++;
1064                         size >>= 1;
1065                 }
1066
1067                 if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1068                         /* # headers = size / 32k
1069                          * one header holds cycles from 32k of data
1070                          */
1071
1072                         xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
1073                         if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
1074                                 xhdrs++;
1075                         log->l_iclog_hsize = xhdrs << BBSHIFT;
1076                         log->l_iclog_heads = xhdrs;
1077                 } else {
1078                         ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
1079                         log->l_iclog_hsize = BBSIZE;
1080                         log->l_iclog_heads = 1;
1081                 }
1082                 goto done;
1083         }
1084
1085         /* All machines use 32kB buffers by default. */
1086         log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
1087         log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1088
1089         /* the default log size is 16k or 32k which is one header sector */
1090         log->l_iclog_hsize = BBSIZE;
1091         log->l_iclog_heads = 1;
1092
1093 done:
1094         /* are we being asked to make the sizes selected above visible? */
1095         if (mp->m_logbufs == 0)
1096                 mp->m_logbufs = log->l_iclog_bufs;
1097         if (mp->m_logbsize == 0)
1098                 mp->m_logbsize = log->l_iclog_size;
1099 }       /* xlog_get_iclog_buffer_size */
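/*
 * Example of the v2 header sizing above (hypothetical mount options): with
 * logbsize=256k, xhdrs = 262144 / 32768 = 8, so l_iclog_hsize becomes
 * 8 << BBSHIFT = 4096 bytes and l_iclog_heads = 8.  With the default 32k
 * buffers a single BBSIZE (512 byte) header is sufficient.
 */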
1100
1101
1102 /*
1103  * This routine initializes some of the log structure for a given mount point.
1104  * Its primary purpose is to fill in enough, so recovery can occur.  However,
1105  * some other stuff may be filled in too.
1106  */
1107 STATIC xlog_t *
1108 xlog_alloc_log(xfs_mount_t      *mp,
1109                xfs_buftarg_t    *log_target,
1110                xfs_daddr_t      blk_offset,
1111                int              num_bblks)
1112 {
1113         xlog_t                  *log;
1114         xlog_rec_header_t       *head;
1115         xlog_in_core_t          **iclogp;
1116         xlog_in_core_t          *iclog, *prev_iclog=NULL;
1117         xfs_buf_t               *bp;
1118         int                     i;
1119         int                     error = ENOMEM;
1120         uint                    log2_size = 0;
1121
1122         log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1123         if (!log) {
1124                 xfs_warn(mp, "Log allocation failed: No memory!");
1125                 goto out;
1126         }
1127
1128         log->l_mp          = mp;
1129         log->l_targ        = log_target;
1130         log->l_logsize     = BBTOB(num_bblks);
1131         log->l_logBBstart  = blk_offset;
1132         log->l_logBBsize   = num_bblks;
1133         log->l_covered_state = XLOG_STATE_COVER_IDLE;
1134         log->l_flags       |= XLOG_ACTIVE_RECOVERY;
1135
1136         log->l_prev_block  = -1;
1137         /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1138         xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1139         xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1140         log->l_curr_cycle  = 1;     /* 0 is bad since this is initial value */
1141
1142         xlog_grant_head_init(&log->l_reserve_head);
1143         xlog_grant_head_init(&log->l_write_head);
1144
1145         error = EFSCORRUPTED;
1146         if (xfs_sb_version_hassector(&mp->m_sb)) {
1147                 log2_size = mp->m_sb.sb_logsectlog;
1148                 if (log2_size < BBSHIFT) {
1149                         xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1150                                 log2_size, BBSHIFT);
1151                         goto out_free_log;
1152                 }
1153
1154                 log2_size -= BBSHIFT;
1155                 if (log2_size > mp->m_sectbb_log) {
1156                         xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1157                                 log2_size, mp->m_sectbb_log);
1158                         goto out_free_log;
1159                 }
1160
1161                 /* for larger sector sizes, must have v2 or external log */
1162                 if (log2_size && log->l_logBBstart > 0 &&
1163                             !xfs_sb_version_haslogv2(&mp->m_sb)) {
1164                         xfs_warn(mp,
1165                 "log sector size (0x%x) invalid for configuration.",
1166                                 log2_size);
1167                         goto out_free_log;
1168                 }
1169         }
1170         log->l_sectBBsize = 1 << log2_size;
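        /*
         * Example (hypothetical superblock): sb_logsectlog = 12 describes
         * 4096-byte log sectors; after subtracting BBSHIFT (9) above,
         * log2_size = 3 and l_sectBBsize = 1 << 3 = 8 basic blocks.
         */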
1171
1172         xlog_get_iclog_buffer_size(mp, log);
1173
1174         error = ENOMEM;
1175         bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0);
1176         if (!bp)
1177                 goto out_free_log;
1178         bp->b_iodone = xlog_iodone;
1179         ASSERT(xfs_buf_islocked(bp));
1180         log->l_xbuf = bp;
1181
1182         spin_lock_init(&log->l_icloglock);
1183         init_waitqueue_head(&log->l_flush_wait);
1184
1185         /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1186         ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
1187
1188         iclogp = &log->l_iclog;
1189         /*
1190          * The amount of memory to allocate for the iclog structure is
1191          * rather funky due to the way the structure is defined.  It is
1192          * done this way so that we can use different sizes for machines
1193          * with different amounts of memory.  See the definition of
1194          * xlog_in_core_t in xfs_log_priv.h for details.
1195          */
1196         ASSERT(log->l_iclog_size >= 4096);
1197         for (i=0; i < log->l_iclog_bufs; i++) {
1198                 *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
1199                 if (!*iclogp)
1200                         goto out_free_iclog;
1201
1202                 iclog = *iclogp;
1203                 iclog->ic_prev = prev_iclog;
1204                 prev_iclog = iclog;
1205
1206                 bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1207                                                 log->l_iclog_size, 0);
1208                 if (!bp)
1209                         goto out_free_iclog;
1210
1211                 bp->b_iodone = xlog_iodone;
1212                 iclog->ic_bp = bp;
1213                 iclog->ic_data = bp->b_addr;
1214 #ifdef DEBUG
1215                 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
1216 #endif
1217                 head = &iclog->ic_header;
1218                 memset(head, 0, sizeof(xlog_rec_header_t));
1219                 head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1220                 head->h_version = cpu_to_be32(
1221                         xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1222                 head->h_size = cpu_to_be32(log->l_iclog_size);
1223                 /* new fields */
1224                 head->h_fmt = cpu_to_be32(XLOG_FMT);
1225                 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
1226
1227                 iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
1228                 iclog->ic_state = XLOG_STATE_ACTIVE;
1229                 iclog->ic_log = log;
1230                 atomic_set(&iclog->ic_refcnt, 0);
1231                 spin_lock_init(&iclog->ic_callback_lock);
1232                 iclog->ic_callback_tail = &(iclog->ic_callback);
1233                 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
1234
1235                 ASSERT(xfs_buf_islocked(iclog->ic_bp));
1236                 init_waitqueue_head(&iclog->ic_force_wait);
1237                 init_waitqueue_head(&iclog->ic_write_wait);
1238
1239                 iclogp = &iclog->ic_next;
1240         }
1241         *iclogp = log->l_iclog;                 /* complete ring */
1242         log->l_iclog->ic_prev = prev_iclog;     /* re-write 1st prev ptr */
1243
1244         error = xlog_cil_init(log);
1245         if (error)
1246                 goto out_free_iclog;
1247         return log;
1248
1249 out_free_iclog:
1250         for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1251                 prev_iclog = iclog->ic_next;
1252                 if (iclog->ic_bp)
1253                         xfs_buf_free(iclog->ic_bp);
1254                 kmem_free(iclog);
1255         }
1256         spinlock_destroy(&log->l_icloglock);
1257         xfs_buf_free(log->l_xbuf);
1258 out_free_log:
1259         kmem_free(log);
1260 out:
1261         return ERR_PTR(-error);
1262 }       /* xlog_alloc_log */
1263
1264
1265 /*
1266  * Write out the commit record of a transaction associated with the given
1267  * ticket.  Return the lsn of the commit record.
1268  */
1269 STATIC int
1270 xlog_commit_record(
1271         struct log              *log,
1272         struct xlog_ticket      *ticket,
1273         struct xlog_in_core     **iclog,
1274         xfs_lsn_t               *commitlsnp)
1275 {
1276         struct xfs_mount *mp = log->l_mp;
1277         int     error;
1278         struct xfs_log_iovec reg = {
1279                 .i_addr = NULL,
1280                 .i_len = 0,
1281                 .i_type = XLOG_REG_TYPE_COMMIT,
1282         };
1283         struct xfs_log_vec vec = {
1284                 .lv_niovecs = 1,
1285                 .lv_iovecp = &reg,
1286         };
1287
1288         ASSERT_ALWAYS(iclog);
1289         error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
1290                                         XLOG_COMMIT_TRANS);
1291         if (error)
1292                 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1293         return error;
1294 }
1295
1296 /*
1297  * Push on the buffer cache code if we ever use more than 75% of the on-disk
1298  * log space.  This code pushes on the lsn which would supposedly free up
1299  * the 25% which we want to leave free.  We may need to adopt a policy which
1300  * pushes on an lsn which is further along in the log once we reach the high
1301  * water mark.  In this manner, we would be creating a low water mark.
1302  */
1303 STATIC void
1304 xlog_grant_push_ail(
1305         struct log      *log,
1306         int             need_bytes)
1307 {
1308         xfs_lsn_t       threshold_lsn = 0;
1309         xfs_lsn_t       last_sync_lsn;
1310         int             free_blocks;
1311         int             free_bytes;
1312         int             threshold_block;
1313         int             threshold_cycle;
1314         int             free_threshold;
1315
1316         ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1317
1318         free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
1319         free_blocks = BTOBBT(free_bytes);
1320
1321         /*
1322          * Set the threshold for the minimum number of free blocks in the
1323          * log to the maximum of what the caller needs, one quarter of the
1324          * log, and 256 blocks.
1325          */
1326         free_threshold = BTOBB(need_bytes);
1327         free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1328         free_threshold = MAX(free_threshold, 256);
1329         if (free_blocks >= free_threshold)
1330                 return;
1331
1332         xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1333                                                 &threshold_block);
1334         threshold_block += free_threshold;
1335         if (threshold_block >= log->l_logBBsize) {
1336                 threshold_block -= log->l_logBBsize;
1337                 threshold_cycle += 1;
1338         }
1339         threshold_lsn = xlog_assign_lsn(threshold_cycle,
1340                                         threshold_block);
1341         /*
1342          * Don't pass in an lsn greater than the lsn of the last
1343          * log record known to be on disk. Use a snapshot of the last sync lsn
1344          * so that it doesn't change between the compare and the set.
1345          */
1346         last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1347         if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1348                 threshold_lsn = last_sync_lsn;
1349
1350         /*
1351          * Get the transaction layer to kick the dirty buffers out to
1352          * disk asynchronously. No point in trying to do this if
1353          * the filesystem is shutting down.
1354          */
1355         if (!XLOG_FORCED_SHUTDOWN(log))
1356                 xfs_ail_push(log->l_ailp, threshold_lsn);
1357 }
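/*
 * Example of the push threshold above (hypothetical sizes): for a 100MB log
 * (204800 basic blocks) and need_bytes = 64k, free_threshold is
 * max(BTOBB(65536) = 128, 204800 >> 2 = 51200, 256) = 51200 blocks, i.e. the
 * AIL is pushed whenever less than a quarter of the log is free.
 */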
1358
1359 /*
1360  * The bdstrat callback function for log bufs. This gives us a central
1361  * place to trap bufs in case we get hit by a log I/O error and need to
1362  * shutdown. Actually, in practice, even when we didn't get a log error,
1363  * we transition the iclogs to IOERROR state *after* flushing all existing
1364  * iclogs to disk. This is because we don't want anymore new transactions to be
1365  * iclogs to disk. This is because we don't want any more new transactions to be
1366  */
1367 STATIC int
1368 xlog_bdstrat(
1369         struct xfs_buf          *bp)
1370 {
1371         struct xlog_in_core     *iclog = bp->b_fspriv;
1372
1373         if (iclog->ic_state & XLOG_STATE_IOERROR) {
1374                 xfs_buf_ioerror(bp, EIO);
1375                 xfs_buf_stale(bp);
1376                 xfs_buf_ioend(bp, 0);
1377                 /*
1378                  * It would seem logical to return EIO here, but we rely on
1379                  * the log state machine to propagate I/O errors instead of
1380                  * doing it here.
1381                  */
1382                 return 0;
1383         }
1384
1385         xfs_buf_iorequest(bp);
1386         return 0;
1387 }
1388
1389 /*
1390  * Flush out the in-core log (iclog) to the on-disk log in an asynchronous 
1391  * fashion.  Previously, we should have moved the current iclog
1392  * ptr in the log to point to the next available iclog.  This allows further
1393  * write to continue while this code syncs out an iclog ready to go.
1394  * Before an in-core log can be written out, the data section must be scanned
1395  * to save away the 1st word of each BBSIZE block into the header.  We replace
1396  * it with the current cycle count.  Each BBSIZE block is tagged with the
1397  * cycle count because there is an implicit assumption that drives will
1398  * guarantee that entire 512 byte blocks get written at once.  In other words,
1399  * we can't have part of a 512 byte block written and part not written.  By
1400  * tagging each block, we will know which blocks are valid when recovering
1401  * after an unclean shutdown.
1402  *
1403  * This routine is single threaded on the iclog.  No other thread can be in
1404  * this routine with the same iclog.  Changing contents of iclog can there-
1405  * fore be done without grabbing the state machine lock.  Updating the global
1406  * log will require grabbing the lock though.
1407  *
1408  * The entire log manager uses a logical block numbering scheme.  Only
1409  * log_sync (and then only bwrite()) know about the fact that the log may
1410  * not start with block zero on a given device.  The log block start offset
1411  * is added immediately before calling bwrite().
1412  */
1413
1414 STATIC int
1415 xlog_sync(xlog_t                *log,
1416           xlog_in_core_t        *iclog)
1417 {
1418         xfs_caddr_t     dptr;           /* pointer to byte sized element */
1419         xfs_buf_t       *bp;
1420         int             i;
1421         uint            count;          /* byte count of bwrite */
1422         uint            count_init;     /* initial count before roundup */
1423         int             roundoff;       /* roundoff to BB or stripe */
1424         int             split = 0;      /* split write into two regions */
1425         int             error;
1426         int             v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
1427
1428         XFS_STATS_INC(xs_log_writes);
1429         ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
1430
1431         /* Add for LR header */
1432         count_init = log->l_iclog_hsize + iclog->ic_offset;
1433
1434         /* Round out the log write size */
1435         if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
1436                 /* we have a v2 stripe unit to use */
1437                 count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
1438         } else {
1439                 count = BBTOB(BTOBB(count_init));
1440         }
1441         roundoff = count - count_init;
1442         ASSERT(roundoff >= 0);
1443         ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && 
1444                 roundoff < log->l_mp->m_sb.sb_logsunit)
1445                 || 
1446                 (log->l_mp->m_sb.sb_logsunit <= 1 && 
1447                  roundoff < BBTOB(1)));
1448
1449         /* move grant heads by roundoff in sync */
1450         xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
1451         xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
1452
1453         /* put cycle number in every block */
1454         xlog_pack_data(log, iclog, roundoff); 
1455
1456         /* real byte length */
1457         if (v2) {
1458                 iclog->ic_header.h_len =
1459                         cpu_to_be32(iclog->ic_offset + roundoff);
1460         } else {
1461                 iclog->ic_header.h_len =
1462                         cpu_to_be32(iclog->ic_offset);
1463         }
1464
1465         bp = iclog->ic_bp;
1466         XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
1467
1468         XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
1469
1470         /* Do we need to split this write into 2 parts? */
1471         if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
1472                 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
1473                 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
1474                 iclog->ic_bwritecnt = 2;        /* split into 2 writes */
1475         } else {
1476                 iclog->ic_bwritecnt = 1;
1477         }
1478         XFS_BUF_SET_COUNT(bp, count);
1479         bp->b_fspriv = iclog;
1480         XFS_BUF_ZEROFLAGS(bp);
1481         XFS_BUF_ASYNC(bp);
1482         bp->b_flags |= XBF_SYNCIO;
1483
1484         if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
1485                 bp->b_flags |= XBF_FUA;
1486
1487                 /*
1488                  * Flush the data device before flushing the log to make
1489                  * sure all meta data written back from the AIL actually made
1490                  * it to disk before stamping the new log tail LSN into the
1491                  * log buffer.  For an external log we need to issue the
1492                  * flush explicitly, and unfortunately synchronously here;
1493                  * for an internal log we can simply use the block layer
1494                  * state machine for preflushes.
1495                  */
1496                 if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
1497                         xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
1498                 else
1499                         bp->b_flags |= XBF_FLUSH;
1500         }
1501
1502         ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1503         ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
1504
1505         xlog_verify_iclog(log, iclog, count, B_TRUE);
1506
1507         /* account for log which doesn't start at block #0 */
1508         XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1509         /*
1510          * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
1511          * is shutting down.
1512          */
1513         XFS_BUF_WRITE(bp);
1514
1515         error = xlog_bdstrat(bp);
1516         if (error) {
1517                 xfs_buf_ioerror_alert(bp, "xlog_sync");
1518                 return error;
1519         }
1520         if (split) {
1521                 bp = iclog->ic_log->l_xbuf;
1522                 XFS_BUF_SET_ADDR(bp, 0);             /* logical 0 */
1523                 xfs_buf_associate_memory(bp,
1524                                 (char *)&iclog->ic_header + count, split);
1525                 bp->b_fspriv = iclog;
1526                 XFS_BUF_ZEROFLAGS(bp);
1527                 XFS_BUF_ASYNC(bp);
1528                 bp->b_flags |= XBF_SYNCIO;
1529                 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1530                         bp->b_flags |= XBF_FUA;
1531                 dptr = bp->b_addr;
1532                 /*
1533                  * Bump the cycle numbers at the start of each block
1534                  * since this part of the buffer is at the start of
1535                  * a new cycle.  Watch out for the header magic number
1536                  * case, though.
1537                  */
1538                 for (i = 0; i < split; i += BBSIZE) {
1539                         be32_add_cpu((__be32 *)dptr, 1);
1540                         if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
1541                                 be32_add_cpu((__be32 *)dptr, 1);
1542                         dptr += BBSIZE;
1543                 }
1544
1545                 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1546                 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
1547
1548                 /* account for internal log which doesn't start at block #0 */
1549                 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1550                 XFS_BUF_WRITE(bp);
1551                 error = xlog_bdstrat(bp);
1552                 if (error) {
1553                         xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
1554                         return error;
1555                 }
1556         }
1557         return 0;
1558 }       /* xlog_sync */
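
/*
 * A stand-alone sketch (not the kernel's xlog_pack_data()/xlog_unpack_data())
 * of the block stamping described in the comment above xlog_sync(): the first
 * 32-bit word of every 512-byte basic block is saved in the record header and
 * overwritten with the cycle number, so recovery can tell which blocks of a
 * record were fully written.  The sizes and struct layout are invented for
 * illustration.
 */
#include <assert.h>
#include <stdint.h>

#define BLK_WORDS	(512 / sizeof(uint32_t))	/* words per BBSIZE block */
#define NBLKS		8

struct rec_header {
	uint32_t	cycle;			/* current log cycle number */
	uint32_t	cycle_data[NBLKS];	/* saved first word of each block */
};

static void pack_data(struct rec_header *hdr, uint32_t *data)
{
	for (int i = 0; i < NBLKS; i++) {
		uint32_t *word = &data[i * BLK_WORDS];

		hdr->cycle_data[i] = *word;	/* stash the real first word */
		*word = hdr->cycle;		/* stamp the cycle number */
	}
}

static void unpack_data(const struct rec_header *hdr, uint32_t *data)
{
	for (int i = 0; i < NBLKS; i++) {
		uint32_t *word = &data[i * BLK_WORDS];

		/* a different cycle here would indicate a torn/partial write */
		assert(*word == hdr->cycle);
		*word = hdr->cycle_data[i];	/* restore the original word */
	}
}

int main(void)
{
	static uint32_t buf[NBLKS * BLK_WORDS];
	struct rec_header hdr = { .cycle = 42 };

	for (uint32_t i = 0; i < NBLKS * BLK_WORDS; i++)
		buf[i] = i + 1;
	pack_data(&hdr, buf);
	unpack_data(&hdr, buf);
	assert(buf[0] == 1 && buf[BLK_WORDS] == BLK_WORDS + 1);	/* round-trips */
	return 0;
}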
1559
1560
1561 /*
1562  * Deallocate a log structure
1563  */
1564 STATIC void
1565 xlog_dealloc_log(xlog_t *log)
1566 {
1567         xlog_in_core_t  *iclog, *next_iclog;
1568         int             i;
1569
1570         xlog_cil_destroy(log);
1571
1572         /*
1573          * always need to ensure that the extra buffer does not point to memory
1574          * owned by another log buffer before we free it.
1575          */
1576         xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
1577         xfs_buf_free(log->l_xbuf);
1578
1579         iclog = log->l_iclog;
1580         for (i=0; i<log->l_iclog_bufs; i++) {
1581                 xfs_buf_free(iclog->ic_bp);
1582                 next_iclog = iclog->ic_next;
1583                 kmem_free(iclog);
1584                 iclog = next_iclog;
1585         }
1586         spinlock_destroy(&log->l_icloglock);
1587
1588         log->l_mp->m_log = NULL;
1589         kmem_free(log);
1590 }       /* xlog_dealloc_log */
1591
1592 /*
1593  * Update counters atomically now that memcpy is done.
1594  */
1595 /* ARGSUSED */
1596 static inline void
1597 xlog_state_finish_copy(xlog_t           *log,
1598                        xlog_in_core_t   *iclog,
1599                        int              record_cnt,
1600                        int              copy_bytes)
1601 {
1602         spin_lock(&log->l_icloglock);
1603
1604         be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
1605         iclog->ic_offset += copy_bytes;
1606
1607         spin_unlock(&log->l_icloglock);
1608 }       /* xlog_state_finish_copy */
1609
1610
1611
1612
1613 /*
1614  * print out info relating to regions written which consume
1615  * the reservation
1616  */
1617 void
1618 xlog_print_tic_res(
1619         struct xfs_mount        *mp,
1620         struct xlog_ticket      *ticket)
1621 {
1622         uint i;
1623         uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
1624
1625         /* match with XLOG_REG_TYPE_* in xfs_log.h */
1626         static char *res_type_str[XLOG_REG_TYPE_MAX] = {
1627             "bformat",
1628             "bchunk",
1629             "efi_format",
1630             "efd_format",
1631             "iformat",
1632             "icore",
1633             "iext",
1634             "ibroot",
1635             "ilocal",
1636             "iattr_ext",
1637             "iattr_broot",
1638             "iattr_local",
1639             "qformat",
1640             "dquot",
1641             "quotaoff",
1642             "LR header",
1643             "unmount",
1644             "commit",
1645             "trans header"
1646         };
1647         static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
1648             "SETATTR_NOT_SIZE",
1649             "SETATTR_SIZE",
1650             "INACTIVE",
1651             "CREATE",
1652             "CREATE_TRUNC",
1653             "TRUNCATE_FILE",
1654             "REMOVE",
1655             "LINK",
1656             "RENAME",
1657             "MKDIR",
1658             "RMDIR",
1659             "SYMLINK",
1660             "SET_DMATTRS",
1661             "GROWFS",
1662             "STRAT_WRITE",
1663             "DIOSTRAT",
1664             "WRITE_SYNC",
1665             "WRITEID",
1666             "ADDAFORK",
1667             "ATTRINVAL",
1668             "ATRUNCATE",
1669             "ATTR_SET",
1670             "ATTR_RM",
1671             "ATTR_FLAG",
1672             "CLEAR_AGI_BUCKET",
1673             "QM_SBCHANGE",
1674             "DUMMY1",
1675             "DUMMY2",
1676             "QM_QUOTAOFF",
1677             "QM_DQALLOC",
1678             "QM_SETQLIM",
1679             "QM_DQCLUSTER",
1680             "QM_QINOCREATE",
1681             "QM_QUOTAOFF_END",
1682             "SB_UNIT",
1683             "FSYNC_TS",
1684             "GROWFSRT_ALLOC",
1685             "GROWFSRT_ZERO",
1686             "GROWFSRT_FREE",
1687             "SWAPEXT"
1688         };
1689
1690         xfs_warn(mp,
1691                 "xlog_write: reservation summary:\n"
1692                 "  trans type  = %s (%u)\n"
1693                 "  unit res    = %d bytes\n"
1694                 "  current res = %d bytes\n"
1695                 "  total reg   = %u bytes (o/flow = %u bytes)\n"
1696                 "  ophdrs      = %u (ophdr space = %u bytes)\n"
1697                 "  ophdr + reg = %u bytes\n"
1698                 "  num regions = %u\n",
1699                 ((ticket->t_trans_type <= 0 ||
1700                   ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
1701                   "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
1702                 ticket->t_trans_type,
1703                 ticket->t_unit_res,
1704                 ticket->t_curr_res,
1705                 ticket->t_res_arr_sum, ticket->t_res_o_flow,
1706                 ticket->t_res_num_ophdrs, ophdr_spc,
1707                 ticket->t_res_arr_sum +
1708                 ticket->t_res_o_flow + ophdr_spc,
1709                 ticket->t_res_num);
1710
1711         for (i = 0; i < ticket->t_res_num; i++) {
1712                 uint r_type = ticket->t_res_arr[i].r_type;
1713                 xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
1714                             ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
1715                             "bad-rtype" : res_type_str[r_type-1]),
1716                             ticket->t_res_arr[i].r_len);
1717         }
1718
1719         xfs_alert_tag(mp, XFS_PTAG_LOGRES,
1720                 "xlog_write: reservation ran out. Need to up reservation");
1721         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1722 }
1723
1724 /*
1725  * Calculate the potential space needed by the log vector.  Each region gets
1726  * its own xlog_op_header_t and may need to be double word aligned.
1727  */
1728 static int
1729 xlog_write_calc_vec_length(
1730         struct xlog_ticket      *ticket,
1731         struct xfs_log_vec      *log_vector)
1732 {
1733         struct xfs_log_vec      *lv;
1734         int                     headers = 0;
1735         int                     len = 0;
1736         int                     i;
1737
1738         /* acct for start rec of xact */
1739         if (ticket->t_flags & XLOG_TIC_INITED)
1740                 headers++;
1741
1742         for (lv = log_vector; lv; lv = lv->lv_next) {
1743                 headers += lv->lv_niovecs;
1744
1745                 for (i = 0; i < lv->lv_niovecs; i++) {
1746                         struct xfs_log_iovec    *vecp = &lv->lv_iovecp[i];
1747
1748                         len += vecp->i_len;
1749                         xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
1750                 }
1751         }
1752
1753         ticket->t_res_num_ophdrs += headers;
1754         len += headers * sizeof(struct xlog_op_header);
1755
1756         return len;
1757 }
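
/*
 * Sketch of the space calculation xlog_write_calc_vec_length() performs:
 * every iovec in the chain of log vectors gets its own op header, and a
 * transaction that still needs its start record gets one more.  The struct
 * names and the op header size below are simplified stand-ins, not the
 * real xfs_log_vec/xfs_log_iovec definitions.
 */
#include <stdio.h>

#define OP_HDR_SIZE	16	/* stand-in for sizeof(struct xlog_op_header) */

struct iovec_stub { int len; };
struct logvec_stub {
	struct logvec_stub	*next;
	int			niovecs;
	struct iovec_stub	*iov;
};

static int calc_vec_length(const struct logvec_stub *lv, int need_start_rec)
{
	int headers = need_start_rec ? 1 : 0;
	int len = 0;

	for (; lv; lv = lv->next) {
		headers += lv->niovecs;
		for (int i = 0; i < lv->niovecs; i++)
			len += lv->iov[i].len;
	}
	return len + headers * OP_HDR_SIZE;
}

int main(void)
{
	struct iovec_stub a[2] = { { 128 }, { 256 } };
	struct iovec_stub b[1] = { { 64 } };
	struct logvec_stub lv2 = { NULL, 1, b };
	struct logvec_stub lv1 = { &lv2, 2, a };

	/* 448 payload bytes + 4 op headers (3 regions + start record) = 512 */
	printf("%d\n", calc_vec_length(&lv1, 1));
	return 0;
}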
1758
1759 /*
1760  * If first write for transaction, insert start record.  We can't be trying to
1761  * commit if we are inited.  We can't have any "partial_copy" if we are inited.
1762  */
1763 static int
1764 xlog_write_start_rec(
1765         struct xlog_op_header   *ophdr,
1766         struct xlog_ticket      *ticket)
1767 {
1768         if (!(ticket->t_flags & XLOG_TIC_INITED))
1769                 return 0;
1770
1771         ophdr->oh_tid   = cpu_to_be32(ticket->t_tid);
1772         ophdr->oh_clientid = ticket->t_clientid;
1773         ophdr->oh_len = 0;
1774         ophdr->oh_flags = XLOG_START_TRANS;
1775         ophdr->oh_res2 = 0;
1776
1777         ticket->t_flags &= ~XLOG_TIC_INITED;
1778
1779         return sizeof(struct xlog_op_header);
1780 }
1781
1782 static xlog_op_header_t *
1783 xlog_write_setup_ophdr(
1784         struct log              *log,
1785         struct xlog_op_header   *ophdr,
1786         struct xlog_ticket      *ticket,
1787         uint                    flags)
1788 {
1789         ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1790         ophdr->oh_clientid = ticket->t_clientid;
1791         ophdr->oh_res2 = 0;
1792
1793         /* are we copying a commit or unmount record? */
1794         ophdr->oh_flags = flags;
1795
1796         /*
1797          * We've seen logs corrupted with bad transaction client ids.  This
1798  * makes sure that XFS doesn't generate them.  Turn this into an EIO
1799          * and shut down the filesystem.
1800          */
1801         switch (ophdr->oh_clientid)  {
1802         case XFS_TRANSACTION:
1803         case XFS_VOLUME:
1804         case XFS_LOG:
1805                 break;
1806         default:
1807                 xfs_warn(log->l_mp,
1808                         "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1809                         ophdr->oh_clientid, ticket);
1810                 return NULL;
1811         }
1812
1813         return ophdr;
1814 }
1815
1816 /*
1817  * Set up the parameters of the region copy into the log. This has
1818  * to handle a region write split across multiple log buffers - this
1819  * state is kept external to this function so that this code can
1820  * be written in an obvious, self-documenting manner.
1821  */
1822 static int
1823 xlog_write_setup_copy(
1824         struct xlog_ticket      *ticket,
1825         struct xlog_op_header   *ophdr,
1826         int                     space_available,
1827         int                     space_required,
1828         int                     *copy_off,
1829         int                     *copy_len,
1830         int                     *last_was_partial_copy,
1831         int                     *bytes_consumed)
1832 {
1833         int                     still_to_copy;
1834
1835         still_to_copy = space_required - *bytes_consumed;
1836         *copy_off = *bytes_consumed;
1837
1838         if (still_to_copy <= space_available) {
1839                 /* write of region completes here */
1840                 *copy_len = still_to_copy;
1841                 ophdr->oh_len = cpu_to_be32(*copy_len);
1842                 if (*last_was_partial_copy)
1843                         ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1844                 *last_was_partial_copy = 0;
1845                 *bytes_consumed = 0;
1846                 return 0;
1847         }
1848
1849         /* partial write of region, needs extra log op header reservation */
1850         *copy_len = space_available;
1851         ophdr->oh_len = cpu_to_be32(*copy_len);
1852         ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
1853         if (*last_was_partial_copy)
1854                 ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
1855         *bytes_consumed += *copy_len;
1856         (*last_was_partial_copy)++;
1857
1858         /* account for new log op header */
1859         ticket->t_curr_res -= sizeof(struct xlog_op_header);
1860         ticket->t_res_num_ophdrs++;
1861
1862         return sizeof(struct xlog_op_header);
1863 }
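
/*
 * Sketch of how a single region is copied across multiple fixed-size
 * in-core buffers, mirroring the state xlog_write_setup_copy() keeps in
 * *bytes_consumed: each pass copies what fits, and every partial copy
 * costs one extra op header of reservation.  The buffer size and header
 * size are made up for the example.
 */
#include <assert.h>
#include <stdio.h>

#define ICLOG_SPACE	200	/* free bytes in each in-core buffer */
#define OP_HDR_SIZE	16

/* returns extra reservation consumed (0 or one op header) */
static int setup_copy(int space_available, int space_required,
		      int *copy_off, int *copy_len, int *bytes_consumed)
{
	int still_to_copy = space_required - *bytes_consumed;

	*copy_off = *bytes_consumed;
	if (still_to_copy <= space_available) {
		*copy_len = still_to_copy;	/* region finishes here */
		*bytes_consumed = 0;
		return 0;
	}
	*copy_len = space_available;		/* partial copy, continue later */
	*bytes_consumed += *copy_len;
	return OP_HDR_SIZE;
}

int main(void)
{
	int region_len = 500, consumed = 0, extra = 0, passes = 0;

	for (;;) {
		int off, len;

		extra += setup_copy(ICLOG_SPACE, region_len, &off, &len, &consumed);
		passes++;
		printf("pass %d: copy %d bytes at offset %d\n", passes, len, off);
		if (consumed == 0)
			break;			/* region fully written */
	}
	assert(passes == 3 && extra == 2 * OP_HDR_SIZE);
	return 0;
}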
1864
1865 static int
1866 xlog_write_copy_finish(
1867         struct log              *log,
1868         struct xlog_in_core     *iclog,
1869         uint                    flags,
1870         int                     *record_cnt,
1871         int                     *data_cnt,
1872         int                     *partial_copy,
1873         int                     *partial_copy_len,
1874         int                     log_offset,
1875         struct xlog_in_core     **commit_iclog)
1876 {
1877         if (*partial_copy) {
1878                 /*
1879                  * This iclog has already been marked WANT_SYNC by
1880                  * xlog_state_get_iclog_space.
1881                  */
1882                 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1883                 *record_cnt = 0;
1884                 *data_cnt = 0;
1885                 return xlog_state_release_iclog(log, iclog);
1886         }
1887
1888         *partial_copy = 0;
1889         *partial_copy_len = 0;
1890
1891         if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1892                 /* no more space in this iclog - push it. */
1893                 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1894                 *record_cnt = 0;
1895                 *data_cnt = 0;
1896
1897                 spin_lock(&log->l_icloglock);
1898                 xlog_state_want_sync(log, iclog);
1899                 spin_unlock(&log->l_icloglock);
1900
1901                 if (!commit_iclog)
1902                         return xlog_state_release_iclog(log, iclog);
1903                 ASSERT(flags & XLOG_COMMIT_TRANS);
1904                 *commit_iclog = iclog;
1905         }
1906
1907         return 0;
1908 }
1909
1910 /*
1911  * Write some region out to in-core log
1912  *
1913  * This will be called when writing externally provided regions or when
1914  * writing out a commit record for a given transaction.
1915  *
1916  * General algorithm:
1917  *      1. Find total length of this write.  This may include adding to the
1918  *              lengths passed in.
1919  *      2. Check whether we violate the ticket's reservation.
1920  *      3. While writing to this iclog
1921  *          A. Reserve as much space in this iclog as we can get
1922  *          B. If this is first write, save away start lsn
1923  *          C. While writing this region:
1924  *              1. If first write of transaction, write start record
1925  *              2. Write log operation header (header per region)
1926  *              3. Find out if we can fit entire region into this iclog
1927  *              4. Potentially, verify destination memcpy ptr
1928  *              5. Memcpy (partial) region
1929  *              6. If partial copy, release iclog; otherwise, continue
1930  *                      copying more regions into current iclog
1931  *      4. Mark want sync bit (in simulation mode)
1932  *      5. Release iclog for potential flush to on-disk log.
1933  *
1934  * ERRORS:
1935  * 1.   Panic if reservation is overrun.  This should never happen since
1936  *      reservation amounts are generated internal to the filesystem.
1937  * NOTES:
1938  * 1. Tickets are single threaded data structures.
1939  * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
1940  *      syncing routine.  When a single log_write region needs to span
1941  *      multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
1942  *      on all log operation writes which don't contain the end of the
1943  *      region.  The XLOG_END_TRANS bit is used for the in-core log
1944  *      operation which contains the end of the continued log_write region.
1945  * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
1946  *      we don't really know exactly how much space will be used.  As a result,
1947  *      we don't update ic_offset until the end when we know exactly how many
1948  *      bytes have been written out.
1949  */
1950 int
1951 xlog_write(
1952         struct log              *log,
1953         struct xfs_log_vec      *log_vector,
1954         struct xlog_ticket      *ticket,
1955         xfs_lsn_t               *start_lsn,
1956         struct xlog_in_core     **commit_iclog,
1957         uint                    flags)
1958 {
1959         struct xlog_in_core     *iclog = NULL;
1960         struct xfs_log_iovec    *vecp;
1961         struct xfs_log_vec      *lv;
1962         int                     len;
1963         int                     index;
1964         int                     partial_copy = 0;
1965         int                     partial_copy_len = 0;
1966         int                     contwr = 0;
1967         int                     record_cnt = 0;
1968         int                     data_cnt = 0;
1969         int                     error;
1970
1971         *start_lsn = 0;
1972
1973         len = xlog_write_calc_vec_length(ticket, log_vector);
1974
1975         /*
1976          * Region headers and bytes are already accounted for.
1977          * We only need to take into account start records and
1978          * split regions in this function.
1979          */
1980         if (ticket->t_flags & XLOG_TIC_INITED)
1981                 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1982
1983         /*
1984          * Commit record headers need to be accounted for. These
1985          * come in as separate writes so are easy to detect.
1986          */
1987         if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
1988                 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1989
1990         if (ticket->t_curr_res < 0)
1991                 xlog_print_tic_res(log->l_mp, ticket);
1992
1993         index = 0;
1994         lv = log_vector;
1995         vecp = lv->lv_iovecp;
1996         while (lv && index < lv->lv_niovecs) {
1997                 void            *ptr;
1998                 int             log_offset;
1999
2000                 error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
2001                                                    &contwr, &log_offset);
2002                 if (error)
2003                         return error;
2004
2005                 ASSERT(log_offset <= iclog->ic_size - 1);
2006                 ptr = iclog->ic_datap + log_offset;
2007
2008                 /* start_lsn is the first lsn written to. That's all we need. */
2009                 if (!*start_lsn)
2010                         *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2011
2012                 /*
2013                  * This loop writes out as many regions as can fit in the amount
2014                  * of space which was allocated by xlog_state_get_iclog_space().
2015                  */
2016                 while (lv && index < lv->lv_niovecs) {
2017                         struct xfs_log_iovec    *reg = &vecp[index];
2018                         struct xlog_op_header   *ophdr;
2019                         int                     start_rec_copy;
2020                         int                     copy_len;
2021                         int                     copy_off;
2022
2023                         ASSERT(reg->i_len % sizeof(__int32_t) == 0);
2024                         ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
2025
2026                         start_rec_copy = xlog_write_start_rec(ptr, ticket);
2027                         if (start_rec_copy) {
2028                                 record_cnt++;
2029                                 xlog_write_adv_cnt(&ptr, &len, &log_offset,
2030                                                    start_rec_copy);
2031                         }
2032
2033                         ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
2034                         if (!ophdr)
2035                                 return XFS_ERROR(EIO);
2036
2037                         xlog_write_adv_cnt(&ptr, &len, &log_offset,
2038                                            sizeof(struct xlog_op_header));
2039
2040                         len += xlog_write_setup_copy(ticket, ophdr,
2041                                                      iclog->ic_size-log_offset,
2042                                                      reg->i_len,
2043                                                      &copy_off, &copy_len,
2044                                                      &partial_copy,
2045                                                      &partial_copy_len);
2046                         xlog_verify_dest_ptr(log, ptr);
2047
2048                         /* copy region */
2049                         ASSERT(copy_len >= 0);
2050                         memcpy(ptr, reg->i_addr + copy_off, copy_len);
2051                         xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
2052
2053                         copy_len += start_rec_copy + sizeof(xlog_op_header_t);
2054                         record_cnt++;
2055                         data_cnt += contwr ? copy_len : 0;
2056
2057                         error = xlog_write_copy_finish(log, iclog, flags,
2058                                                        &record_cnt, &data_cnt,
2059                                                        &partial_copy,
2060                                                        &partial_copy_len,
2061                                                        log_offset,
2062                                                        commit_iclog);
2063                         if (error)
2064                                 return error;
2065
2066                         /*
2067                          * if we had a partial copy, we need to get more iclog
2068                          * space but we don't want to increment the region
2069                          * index because there is still more in this region to
2070                          * write.
2071                          *
2072                          * If we completed writing this region, and we flushed
2073                          * the iclog (indicated by resetting of the record
2074                          * count), then we also need to get more log space. If
2075                          * this was the last record, though, we are done and
2076                          * can just return.
2077                          */
2078                         if (partial_copy)
2079                                 break;
2080
2081                         if (++index == lv->lv_niovecs) {
2082                                 lv = lv->lv_next;
2083                                 index = 0;
2084                                 if (lv)
2085                                         vecp = lv->lv_iovecp;
2086                         }
2087                         if (record_cnt == 0) {
2088                                 if (!lv)
2089                                         return 0;
2090                                 break;
2091                         }
2092                 }
2093         }
2094
2095         ASSERT(len == 0);
2096
2097         xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
2098         if (!commit_iclog)
2099                 return xlog_state_release_iclog(log, iclog);
2100
2101         ASSERT(flags & XLOG_COMMIT_TRANS);
2102         *commit_iclog = iclog;
2103         return 0;
2104 }
2105
2106
2107 /*****************************************************************************
2108  *
2109  *              State Machine functions
2110  *
2111  *****************************************************************************
2112  */
2113
2114 /* Clean iclogs starting from the head.  This ordering must be
2115  * maintained, so an iclog doesn't become ACTIVE beyond one that
2116  * is SYNCING.  This is also required to maintain the notion that we use
2117  * an ordered wait queue to hold off would-be writers to the log when every
2118  * iclog is trying to sync to disk.
2119  *
2120  * State Change: DIRTY -> ACTIVE
2121  */
2122 STATIC void
2123 xlog_state_clean_log(xlog_t *log)
2124 {
2125         xlog_in_core_t  *iclog;
2126         int changed = 0;
2127
2128         iclog = log->l_iclog;
2129         do {
2130                 if (iclog->ic_state == XLOG_STATE_DIRTY) {
2131                         iclog->ic_state = XLOG_STATE_ACTIVE;
2132                         iclog->ic_offset       = 0;
2133                         ASSERT(iclog->ic_callback == NULL);
2134                         /*
2135                          * If the number of ops in this iclog indicates it just
2136                          * contains the dummy transaction, we can
2137                          * change state into IDLE (the second time around).
2138                          * Otherwise we should change the state into
2139                          * NEED a dummy.
2140                          * We don't need to cover the dummy.
2141                          */
2142                         if (!changed &&
2143                            (be32_to_cpu(iclog->ic_header.h_num_logops) ==
2144                                         XLOG_COVER_OPS)) {
2145                                 changed = 1;
2146                         } else {
2147                                 /*
2148                                  * We have two dirty iclogs so start over.
2149                                  * This could also mean that the num of ops
2150                                  * indicates this is not the dummy going out.
2151                                  */
2152                                 changed = 2;
2153                         }
2154                         iclog->ic_header.h_num_logops = 0;
2155                         memset(iclog->ic_header.h_cycle_data, 0,
2156                               sizeof(iclog->ic_header.h_cycle_data));
2157                         iclog->ic_header.h_lsn = 0;
2158                 } else if (iclog->ic_state == XLOG_STATE_ACTIVE)
2159                         /* do nothing */;
2160                 else
2161                         break;  /* stop cleaning */
2162                 iclog = iclog->ic_next;
2163         } while (iclog != log->l_iclog);
2164
2165         /* log is locked when we are called */
2166         /*
2167          * Change state for the dummy log recording.
2168          * We usually go to NEED. But we go to NEED2 if changed indicates
2169          * we are done writing the dummy record.
2170          * If we are done with the second dummy record (DONE2), then
2171          * we go to IDLE.
2172          */
2173         if (changed) {
2174                 switch (log->l_covered_state) {
2175                 case XLOG_STATE_COVER_IDLE:
2176                 case XLOG_STATE_COVER_NEED:
2177                 case XLOG_STATE_COVER_NEED2:
2178                         log->l_covered_state = XLOG_STATE_COVER_NEED;
2179                         break;
2180
2181                 case XLOG_STATE_COVER_DONE:
2182                         if (changed == 1)
2183                                 log->l_covered_state = XLOG_STATE_COVER_NEED2;
2184                         else
2185                                 log->l_covered_state = XLOG_STATE_COVER_NEED;
2186                         break;
2187
2188                 case XLOG_STATE_COVER_DONE2:
2189                         if (changed == 1)
2190                                 log->l_covered_state = XLOG_STATE_COVER_IDLE;
2191                         else
2192                                 log->l_covered_state = XLOG_STATE_COVER_NEED;
2193                         break;
2194
2195                 default:
2196                         ASSERT(0);
2197                 }
2198         }
2199 }       /* xlog_state_clean_log */
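
/*
 * Simplified stand-alone model of the covering-state transitions described
 * in the comments inside xlog_state_clean_log(): "changed == 1" means the
 * iclog just cleaned held only the dummy record; "changed == 2" means it
 * held real transactions, so covering has to start over.  The enum values
 * here are illustrative, not the kernel's XLOG_STATE_COVER_* constants.
 */
#include <stdio.h>

enum cover_state { COVER_IDLE, COVER_NEED, COVER_DONE, COVER_NEED2, COVER_DONE2 };

static enum cover_state advance(enum cover_state s, int changed)
{
	switch (s) {
	case COVER_IDLE:
	case COVER_NEED:
	case COVER_NEED2:
		return COVER_NEED;
	case COVER_DONE:
		return changed == 1 ? COVER_NEED2 : COVER_NEED;
	case COVER_DONE2:
		return changed == 1 ? COVER_IDLE : COVER_NEED;
	}
	return COVER_NEED;
}

int main(void)
{
	/* two dummy records in a row walk DONE -> NEED2 and DONE2 -> IDLE */
	printf("%d\n", advance(COVER_DONE, 1));		/* COVER_NEED2 */
	printf("%d\n", advance(COVER_DONE2, 1));	/* COVER_IDLE  */
	/* any real transaction drops covering back to COVER_NEED */
	printf("%d\n", advance(COVER_DONE2, 2));	/* COVER_NEED  */
	return 0;
}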
2200
2201 STATIC xfs_lsn_t
2202 xlog_get_lowest_lsn(
2203         xlog_t          *log)
2204 {
2205         xlog_in_core_t  *lsn_log;
2206         xfs_lsn_t       lowest_lsn, lsn;
2207
2208         lsn_log = log->l_iclog;
2209         lowest_lsn = 0;
2210         do {
2211             if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
2212                 lsn = be64_to_cpu(lsn_log->ic_header.h_lsn);
2213                 if ((lsn && !lowest_lsn) ||
2214                     (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
2215                         lowest_lsn = lsn;
2216                 }
2217             }
2218             lsn_log = lsn_log->ic_next;
2219         } while (lsn_log != log->l_iclog);
2220         return lowest_lsn;
2221 }
2222
2223
2224 STATIC void
2225 xlog_state_do_callback(
2226         xlog_t          *log,
2227         int             aborted,
2228         xlog_in_core_t  *ciclog)
2229 {
2230         xlog_in_core_t     *iclog;
2231         xlog_in_core_t     *first_iclog;        /* used to know when we've
2232                                                  * processed all iclogs once */
2233         xfs_log_callback_t *cb, *cb_next;
2234         int                flushcnt = 0;
2235         xfs_lsn_t          lowest_lsn;
2236         int                ioerrors;    /* counter: iclogs with errors */
2237         int                loopdidcallbacks; /* flag: inner loop did callbacks*/
2238         int                funcdidcallbacks; /* flag: function did callbacks */
2239         int                repeats;     /* for issuing console warnings if
2240                                          * looping too many times */
2241         int                wake = 0;
2242
2243         spin_lock(&log->l_icloglock);
2244         first_iclog = iclog = log->l_iclog;
2245         ioerrors = 0;
2246         funcdidcallbacks = 0;
2247         repeats = 0;
2248
2249         do {
2250                 /*
2251                  * Scan all iclogs starting with the one pointed to by the
2252                  * log.  Reset this starting point each time the log is
2253                  * unlocked (during callbacks).
2254                  *
2255                  * Keep looping through iclogs until one full pass is made
2256                  * without running any callbacks.
2257                  */
2258                 first_iclog = log->l_iclog;
2259                 iclog = log->l_iclog;
2260                 loopdidcallbacks = 0;
2261                 repeats++;
2262
2263                 do {
2264
2265                         /* skip all iclogs in the ACTIVE & DIRTY states */
2266                         if (iclog->ic_state &
2267                             (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
2268                                 iclog = iclog->ic_next;
2269                                 continue;
2270                         }
2271
2272                         /*
2273                          * Between marking a filesystem SHUTDOWN and stopping
2274                          * the log, we do flush all iclogs to disk (if there
2275                          * wasn't a log I/O error). So, we do want things to
2276                          * go smoothly in case of just a SHUTDOWN  w/o a
2277                          * LOG_IO_ERROR.
2278                          */
2279                         if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
2280                                 /*
2281                                  * Can only perform callbacks in order.  Since
2282                                  * this iclog is not in the DONE_SYNC/
2283                                  * DO_CALLBACK state, we skip the rest and
2284                                  * just try to clean up.  If we set our iclog
2285                                  * to DO_CALLBACK, we will not process it when
2286                                  * we retry since a previous iclog is in the
2287                                  * CALLBACK and the state cannot change since
2288                                  * we are holding the l_icloglock.
2289                                  */
2290                                 if (!(iclog->ic_state &
2291                                         (XLOG_STATE_DONE_SYNC |
2292                                                  XLOG_STATE_DO_CALLBACK))) {
2293                                         if (ciclog && (ciclog->ic_state ==
2294                                                         XLOG_STATE_DONE_SYNC)) {
2295                                                 ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
2296                                         }
2297                                         break;
2298                                 }
2299                                 /*
2300                                  * We now have an iclog that is in either the
2301                                  * DO_CALLBACK or DONE_SYNC states. The other
2302                                  * states (WANT_SYNC, SYNCING, or CALLBACK) were
2303                                  * caught by the above if and are going to be
2304                                  * cleaned (i.e. we aren't doing their callbacks);
2305                                  * see the above if.
2306                                  */
2307
2308                                 /*
2309                                  * We will do one more check here to see if we
2310                                  * have chased our tail around.
2311                                  */
2312
2313                                 lowest_lsn = xlog_get_lowest_lsn(log);
2314                                 if (lowest_lsn &&
2315                                     XFS_LSN_CMP(lowest_lsn,
2316                                                 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2317                                         iclog = iclog->ic_next;
2318                                         continue; /* Leave this iclog for
2319                                                    * another thread */
2320                                 }
2321
2322                                 iclog->ic_state = XLOG_STATE_CALLBACK;
2323
2324
2325                                 /*
2326                                  * update the last_sync_lsn before we drop the
2327                                  * icloglock to ensure we are the only one that
2328                                  * can update it.
2329                                  */
2330                                 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2331                                         be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2332                                 atomic64_set(&log->l_last_sync_lsn,
2333                                         be64_to_cpu(iclog->ic_header.h_lsn));
2334
2335                         } else
2336                                 ioerrors++;
2337
2338                         spin_unlock(&log->l_icloglock);
2339
2340                         /*
2341                          * Keep processing entries in the callback list until
2342                          * we come around and it is empty.  We need to
2343                          * atomically see that the list is empty and change the
2344                          * state to DIRTY so that we don't miss any more
2345                          * callbacks being added.
2346                          */
2347                         spin_lock(&iclog->ic_callback_lock);
2348                         cb = iclog->ic_callback;
2349                         while (cb) {
2350                                 iclog->ic_callback_tail = &(iclog->ic_callback);
2351                                 iclog->ic_callback = NULL;
2352                                 spin_unlock(&iclog->ic_callback_lock);
2353
2354                                 /* perform callbacks in the order given */
2355                                 for (; cb; cb = cb_next) {
2356                                         cb_next = cb->cb_next;
2357                                         cb->cb_func(cb->cb_arg, aborted);
2358                                 }
2359                                 spin_lock(&iclog->ic_callback_lock);
2360                                 cb = iclog->ic_callback;
2361                         }
2362
2363                         loopdidcallbacks++;
2364                         funcdidcallbacks++;
2365
2366                         spin_lock(&log->l_icloglock);
2367                         ASSERT(iclog->ic_callback == NULL);
2368                         spin_unlock(&iclog->ic_callback_lock);
2369                         if (!(iclog->ic_state & XLOG_STATE_IOERROR))
2370                                 iclog->ic_state = XLOG_STATE_DIRTY;
2371
2372                         /*
2373                          * Transition from DIRTY to ACTIVE if applicable.
2374                          * NOP if STATE_IOERROR.
2375                          */
2376                         xlog_state_clean_log(log);
2377
2378                         /* wake up threads waiting in xfs_log_force() */
2379                         wake_up_all(&iclog->ic_force_wait);
2380
2381                         iclog = iclog->ic_next;
2382                 } while (first_iclog != iclog);
2383
2384                 if (repeats > 5000) {
2385                         flushcnt += repeats;
2386                         repeats = 0;
2387                         xfs_warn(log->l_mp,
2388                                 "%s: possible infinite loop (%d iterations)",
2389                                 __func__, flushcnt);
2390                 }
2391         } while (!ioerrors && loopdidcallbacks);
2392
2393         /*
2394          * make one last gasp attempt to see if iclogs are being left in
2395          * limbo..
2396          */
2397 #ifdef DEBUG
2398         if (funcdidcallbacks) {
2399                 first_iclog = iclog = log->l_iclog;
2400                 do {
2401                         ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2402                         /*
2403                          * Terminate the loop if iclogs are found in states
2404                          * which will cause other threads to clean up iclogs.
2405                          *
2406                          * SYNCING - i/o completion will go through logs
2407                          * DONE_SYNC - interrupt thread should be waiting for
2408                          *              l_icloglock
2409                          * IOERROR - give up hope all ye who enter here
2410                          */
2411                         if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2412                             iclog->ic_state == XLOG_STATE_SYNCING ||
2413                             iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2414                             iclog->ic_state == XLOG_STATE_IOERROR )
2415                                 break;
2416                         iclog = iclog->ic_next;
2417                 } while (first_iclog != iclog);
2418         }
2419 #endif
2420
2421         if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
2422                 wake = 1;
2423         spin_unlock(&log->l_icloglock);
2424
2425         if (wake)
2426                 wake_up_all(&log->l_flush_wait);
2427 }
2428
2429
2430 /*
2431  * Finish transitioning this iclog to the dirty state.
2432  *
2433  * Make sure that we completely execute this routine only when this is
2434  * the last call to the iclog.  There is a good chance that iclog flushes,
2435  * when we reach the end of the physical log, get turned into 2 separate
2436  * calls to bwrite.  Hence, one iclog flush could generate two calls to this
2437  * routine.  By using the reference count bwritecnt, we guarantee that only
2438  * the second completion goes through.
2439  *
2440  * Callbacks could take time, so they are done outside the scope of the
2441  * global state machine log lock.
2442  */
2443 STATIC void
2444 xlog_state_done_syncing(
2445         xlog_in_core_t  *iclog,
2446         int             aborted)
2447 {
2448         xlog_t             *log = iclog->ic_log;
2449
2450         spin_lock(&log->l_icloglock);
2451
2452         ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
2453                iclog->ic_state == XLOG_STATE_IOERROR);
2454         ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
2455         ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
2456
2457
2458         /*
2459          * If we got an error, either on the first buffer, or in the case of
2460          * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
2461          * and none should ever be attempted to be written to disk
2462          * again.
2463          */
2464         if (iclog->ic_state != XLOG_STATE_IOERROR) {
2465                 if (--iclog->ic_bwritecnt == 1) {
2466                         spin_unlock(&log->l_icloglock);
2467                         return;
2468                 }
2469                 iclog->ic_state = XLOG_STATE_DONE_SYNC;
2470         }
2471
2472         /*
2473          * Someone could be sleeping prior to writing out the next
2474          * iclog buffer; we wake them all.  One will get to do the
2475          * I/O, the others get to wait for the result.
2476          */
2477         wake_up_all(&iclog->ic_write_wait);
2478         spin_unlock(&log->l_icloglock);
2479         xlog_state_do_callback(log, aborted, iclog);    /* also cleans log */
2480 }       /* xlog_state_done_syncing */
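
/*
 * Sketch of the "only the last completion proceeds" pattern described in
 * the comment above xlog_state_done_syncing(): ic_bwritecnt is 2 when one
 * iclog flush was issued as two bwrite calls (a split write), and the
 * completion handler only moves the state machine forward on the final
 * completion.  The struct and field names are illustrative.
 */
#include <assert.h>
#include <stdio.h>

struct fake_iclog {
	int	bwritecnt;	/* 1 for a single write, 2 for a split write */
	int	done;		/* set once all outstanding writes completed */
};

static void done_syncing(struct fake_iclog *ic)
{
	if (--ic->bwritecnt >= 1)
		return;		/* first half of a split write: wait for the other */
	ic->done = 1;		/* last completion: run callbacks, clean the log, ... */
}

int main(void)
{
	struct fake_iclog split = { .bwritecnt = 2 };

	done_syncing(&split);
	assert(!split.done);	/* only one of the two writes has finished */
	done_syncing(&split);
	assert(split.done);	/* second completion drives the state machine */
	printf("ok\n");
	return 0;
}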
2481
2482
2483 /*
2484  * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
2485  * sleep.  We wait on the flush queue on the head iclog as that should be
2486  * the first iclog to complete flushing. Hence if all iclogs are syncing,
2487  * we will wait here and all new writes will sleep until a sync completes.
2488  *
2489  * The in-core logs are used in a circular fashion. They are not used
2490  * out-of-order even when an iclog past the head is free.
2491  *
2492  * return:
2493  *      * log_offset where xlog_write() can start writing into the in-core
2494  *              log's data space.
2495  *      * in-core log pointer to which xlog_write() should write.
2496  *      * boolean indicating this is a continued write to an in-core log.
2497  *              If this is the last write, then the in-core log's offset field
2498  *              needs to be incremented, depending on the amount of data which
2499  *              is copied.
2500  */
2501 STATIC int
2502 xlog_state_get_iclog_space(xlog_t         *log,
2503                            int            len,
2504                            xlog_in_core_t **iclogp,
2505                            xlog_ticket_t  *ticket,
2506                            int            *continued_write,
2507                            int            *logoffsetp)
2508 {
2509         int               log_offset;
2510         xlog_rec_header_t *head;
2511         xlog_in_core_t    *iclog;
2512         int               error;
2513
2514 restart:
2515         spin_lock(&log->l_icloglock);
2516         if (XLOG_FORCED_SHUTDOWN(log)) {
2517                 spin_unlock(&log->l_icloglock);
2518                 return XFS_ERROR(EIO);
2519         }
2520
2521         iclog = log->l_iclog;
2522         if (iclog->ic_state != XLOG_STATE_ACTIVE) {
2523                 XFS_STATS_INC(xs_log_noiclogs);
2524
2525                 /* Wait for log writes to have flushed */
2526                 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2527                 goto restart;
2528         }
2529
2530         head = &iclog->ic_header;
2531
2532         atomic_inc(&iclog->ic_refcnt);  /* prevents sync */
2533         log_offset = iclog->ic_offset;
2534
2535         /* On the 1st write to an iclog, figure out lsn.  This works
2536          * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
2537          * committing to.  If the offset is set, that's how many blocks
2538          * must be written.
2539          */
2540         if (log_offset == 0) {
2541                 ticket->t_curr_res -= log->l_iclog_hsize;
2542                 xlog_tic_add_region(ticket,
2543                                     log->l_iclog_hsize,
2544                                     XLOG_REG_TYPE_LRHEADER);
2545                 head->h_cycle = cpu_to_be32(log->l_curr_cycle);
2546                 head->h_lsn = cpu_to_be64(
2547                         xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
2548                 ASSERT(log->l_curr_block >= 0);
2549         }
2550
2551         /* If there is enough room to write everything, then do it.  Otherwise,
2552          * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
2553          * bit is on, so this will get flushed out.  Don't update ic_offset
2554          * until you know exactly how many bytes get copied.  Therefore, wait
2555          * until later to update ic_offset.
2556          *
2557          * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
2558          * can fit into remaining data section.
2559          */
2560         if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
2561                 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2562
2563                 /*
2564                  * If I'm the only one writing to this iclog, sync it to disk.
2565                  * We need to do an atomic compare and decrement here to avoid
2566                  * racing with concurrent atomic_dec_and_lock() calls in
2567                  * xlog_state_release_iclog() when there is more than one
2568                  * reference to the iclog.
2569                  */
2570                 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
2571                         /* we are the only one */
2572                         spin_unlock(&log->l_icloglock);
2573                         error = xlog_state_release_iclog(log, iclog);
2574                         if (error)
2575                                 return error;
2576                 } else {
2577                         spin_unlock(&log->l_icloglock);
2578                 }
2579                 goto restart;
2580         }
2581
2582         /* Do we have enough room to write the full amount in the remainder
2583          * of this iclog?  Or must we continue a write on the next iclog and
2584          * mark this iclog as completely taken?  In the case where we switch
2585          * iclogs (to mark it taken), this particular iclog will release/sync
2586          * to disk in xlog_write().
2587          */
2588         if (len <= iclog->ic_size - iclog->ic_offset) {
2589                 *continued_write = 0;
2590                 iclog->ic_offset += len;
2591         } else {
2592                 *continued_write = 1;
2593                 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2594         }
2595         *iclogp = iclog;
2596
2597         ASSERT(iclog->ic_offset <= iclog->ic_size);
2598         spin_unlock(&log->l_icloglock);
2599
2600         *logoffsetp = log_offset;
2601         return 0;
2602 }       /* xlog_state_get_iclog_space */
2603
2604 /* The first cnt-1 times through here we don't need to
2605  * move the grant write head because the permanent
2606  * reservation has reserved cnt times the unit amount.
2607  * Release part of current permanent unit reservation and
2608  * reset current reservation to be one unit's worth.  Also
2609  * move grant reservation head forward.
2610  */
2611 STATIC void
2612 xlog_regrant_reserve_log_space(xlog_t        *log,
2613                                xlog_ticket_t *ticket)
2614 {
2615         trace_xfs_log_regrant_reserve_enter(log, ticket);
2616
2617         if (ticket->t_cnt > 0)
2618                 ticket->t_cnt--;
2619
2620         xlog_grant_sub_space(log, &log->l_reserve_head.grant,
2621                                         ticket->t_curr_res);
2622         xlog_grant_sub_space(log, &log->l_write_head.grant,
2623                                         ticket->t_curr_res);
2624         ticket->t_curr_res = ticket->t_unit_res;
2625         xlog_tic_reset_res(ticket);
2626
2627         trace_xfs_log_regrant_reserve_sub(log, ticket);
2628
2629         /* just return if we still have some of the pre-reserved space */
2630         if (ticket->t_cnt > 0)
2631                 return;
2632
2633         xlog_grant_add_space(log, &log->l_reserve_head.grant,
2634                                         ticket->t_unit_res);
2635
2636         trace_xfs_log_regrant_reserve_exit(log, ticket);
2637
2638         ticket->t_curr_res = ticket->t_unit_res;
2639         xlog_tic_reset_res(ticket);
2640 }       /* xlog_regrant_reserve_log_space */
2641
2642
2643 /*
2644  * Give back the space left from a reservation.
2645  *
2646  * All the information we need to make a correct determination of space left
2647  * is present.  For non-permanent reservations, things are quite easy.  The
2648  * count should have been decremented to zero.  We only need to deal with the
2649  * space remaining in the current reservation part of the ticket.  If the
2650  * ticket contains a permanent reservation, there may be left over space which
2651  * needs to be released.  A count of N means that N-1 refills of the current
2652  * reservation can be done before we need to ask for more space.  The first
2653  * one goes to fill up the first current reservation.  Once we run out of
2654  * space, the count will stay at zero and the only space remaining will be
2655  * in the current reservation field.
2656  */
2657 STATIC void
2658 xlog_ungrant_log_space(xlog_t        *log,
2659                        xlog_ticket_t *ticket)
2660 {
2661         int     bytes;
2662
2663         if (ticket->t_cnt > 0)
2664                 ticket->t_cnt--;
2665
2666         trace_xfs_log_ungrant_enter(log, ticket);
2667         trace_xfs_log_ungrant_sub(log, ticket);
2668
2669         /*
2670          * If this is a permanent reservation ticket, we may be able to free
2671          * up more space based on the remaining count.
2672          */
2673         bytes = ticket->t_curr_res;
2674         if (ticket->t_cnt > 0) {
2675                 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2676                 bytes += ticket->t_unit_res*ticket->t_cnt;
2677         }
2678
2679         xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
2680         xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
2681
2682         trace_xfs_log_ungrant_exit(log, ticket);
2683
2684         xfs_log_space_wake(log->l_mp);
2685 }
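
/*
 * Worked example of the arithmetic in xlog_ungrant_log_space(): after a
 * permanent-reservation ticket is done, the space handed back is its
 * remaining current reservation plus one full unit for every unused
 * refill still counted in t_cnt.  The numbers below are invented.
 */
#include <stdio.h>

int main(void)
{
	int unit_res = 4096;	/* bytes reserved per transaction "unit"  */
	int curr_res = 1000;	/* unused bytes left in the current unit  */
	int cnt      = 3;	/* t_cnt after the initial decrement      */

	/* matches: bytes = t_curr_res + t_unit_res * t_cnt */
	int bytes = curr_res + unit_res * cnt;

	printf("give back %d bytes to both grant heads\n", bytes);	/* 13288 */
	return 0;
}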
2686
2687 /*
2688  * Flush iclog to disk if this is the last reference to the given iclog and
2689  * the WANT_SYNC bit is set.
2690  *
2691  * When this function is entered, the iclog is not necessarily in the
2692  * WANT_SYNC state.  It may be sitting around waiting to get filled.
2693  *
2694  *
2695  */
2696 STATIC int
2697 xlog_state_release_iclog(
2698         xlog_t          *log,
2699         xlog_in_core_t  *iclog)
2700 {
2701         int             sync = 0;       /* do we sync? */
2702
2703         if (iclog->ic_state & XLOG_STATE_IOERROR)
2704                 return XFS_ERROR(EIO);
2705
2706         ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
2707         if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
2708                 return 0;
2709
2710         if (iclog->ic_state & XLOG_STATE_IOERROR) {
2711                 spin_unlock(&log->l_icloglock);
2712                 return XFS_ERROR(EIO);
2713         }
2714         ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
2715                iclog->ic_state == XLOG_STATE_WANT_SYNC);
2716
2717         if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2718                 /* update tail before writing to iclog */
2719                 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2720                 sync++;
2721                 iclog->ic_state = XLOG_STATE_SYNCING;
2722                 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2723                 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2724                 /* cycle incremented when incrementing curr_block */
2725         }
2726         spin_unlock(&log->l_icloglock);
2727
2728         /*
2729          * We let the log lock go, so it's possible that we hit a log I/O
2730          * error or some other SHUTDOWN condition that marks the iclog
2731          * as XLOG_STATE_IOERROR before the bwrite. However, we know that
2732          * this iclog has consistent data, so we ignore IOERROR
2733          * flags after this point.
2734          */
2735         if (sync)
2736                 return xlog_sync(log, iclog);
2737         return 0;
2738 }       /* xlog_state_release_iclog */
2739
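/*
 * Note on the atomic_dec_and_lock() idiom used above (explanatory comment,
 * not part of the original source): it decrements ic_refcnt and only if the
 * count reaches zero does it take l_icloglock and return true; otherwise the
 * function returns early without touching the lock, so only the final
 * release of an iclog pays for the lock round trip.
 */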
2740
2741 /*
2742  * This routine will mark the current iclog in the ring as WANT_SYNC
2743  * and move the current iclog pointer to the next iclog in the ring.
2744  * When this routine is called from xlog_state_get_iclog_space(), the
2745  * exact size of the iclog has not yet been determined.  All we know is
2746  * that we have run out of space in this log record.
2747  */
2748 STATIC void
2749 xlog_state_switch_iclogs(xlog_t         *log,
2750                          xlog_in_core_t *iclog,
2751                          int            eventual_size)
2752 {
2753         ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
2754         if (!eventual_size)
2755                 eventual_size = iclog->ic_offset;
2756         iclog->ic_state = XLOG_STATE_WANT_SYNC;
2757         iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block);
2758         log->l_prev_block = log->l_curr_block;
2759         log->l_prev_cycle = log->l_curr_cycle;
2760
2761         /* roll log?: ic_offset changed later */
2762         log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
2763
2764         /* Round up to next log-sunit */
2765         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
2766             log->l_mp->m_sb.sb_logsunit > 1) {
2767                 __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
2768                 log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
2769         }
2770
2771         if (log->l_curr_block >= log->l_logBBsize) {
2772                 log->l_curr_cycle++;
2773                 if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
2774                         log->l_curr_cycle++;
2775                 log->l_curr_block -= log->l_logBBsize;
2776                 ASSERT(log->l_curr_block >= 0);
2777         }
2778         ASSERT(iclog == log->l_iclog);
2779         log->l_iclog = iclog->ic_next;
2780 }       /* xlog_state_switch_iclogs */
2781
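/*
 * Worked example (an illustration, not from the original source): assuming a
 * 32KB iclog payload (eventual_size = 32768), a 512 byte iclog header and a
 * 4KB log stripe unit, the head advances by BTOBB(32768) + BTOBB(512) =
 * 64 + 1 = 65 basic blocks, and the new l_curr_block is then rounded up to
 * the next multiple of BTOBB(4096) = 8 (e.g. 65 becomes 72).  Should the
 * result land at or beyond l_logBBsize, the cycle number is bumped (skipping
 * XLOG_HEADER_MAGIC_NUM) and the block count wraps around.
 */
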
2782 /*
2783  * Write out all data in the in-core log as of this exact moment in time.
2784  *
2785  * Data may be written to the in-core log during this call.  However,
2786  * we don't guarantee this data will be written out.  A change from past
2787  * implementation means this routine will *not* write out zero length LRs.
2788  *
2789  * Basically, we try and perform an intelligent scan of the in-core logs.
2790  * If we determine there is no flushable data, we just return.  There is no
2791  * flushable data if:
2792  *
2793  *      1. the current iclog is active and has no data; the previous iclog
2794  *              is in the active or dirty state.
2795  *      2. the current iclog is dirty, and the previous iclog is in the
2796  *              active or dirty state.
2797  *
2798  * We may sleep if:
2799  *
2800  *      1. the current iclog is not in the active nor dirty state.
2801  *      2. the current iclog is dirty, and the previous iclog is not in the
2802  *              active nor dirty state.
2803  *      3. the current iclog is active, and there is another thread writing
2804  *              to this particular iclog.
2805  *      4. a) the current iclog is active and has no other writers
2806  *         b) when we return from flushing out this iclog, it is still
2807  *              not in the active nor dirty state.
2808  */
2809 int
2810 _xfs_log_force(
2811         struct xfs_mount        *mp,
2812         uint                    flags,
2813         int                     *log_flushed)
2814 {
2815         struct log              *log = mp->m_log;
2816         struct xlog_in_core     *iclog;
2817         xfs_lsn_t               lsn;
2818
2819         XFS_STATS_INC(xs_log_force);
2820
2821         xlog_cil_force(log);
2822
2823         spin_lock(&log->l_icloglock);
2824
2825         iclog = log->l_iclog;
2826         if (iclog->ic_state & XLOG_STATE_IOERROR) {
2827                 spin_unlock(&log->l_icloglock);
2828                 return XFS_ERROR(EIO);
2829         }
2830
2831         /* If the head iclog is not active nor dirty, we just attach
2832          * ourselves to the head and go to sleep.
2833          */
2834         if (iclog->ic_state == XLOG_STATE_ACTIVE ||
2835             iclog->ic_state == XLOG_STATE_DIRTY) {
2836                 /*
2837                  * If the head is dirty or (active and empty), then
2838                  * we need to look at the previous iclog.  If the previous
2839                  * iclog is active or dirty we are done.  There is nothing
2840                  * to sync out.  Otherwise, we attach ourselves to the
2841                  * previous iclog and go to sleep.
2842                  */
2843                 if (iclog->ic_state == XLOG_STATE_DIRTY ||
2844                     (atomic_read(&iclog->ic_refcnt) == 0
2845                      && iclog->ic_offset == 0)) {
2846                         iclog = iclog->ic_prev;
2847                         if (iclog->ic_state == XLOG_STATE_ACTIVE ||
2848                             iclog->ic_state == XLOG_STATE_DIRTY)
2849                                 goto no_sleep;
2850                         else
2851                                 goto maybe_sleep;
2852                 } else {
2853                         if (atomic_read(&iclog->ic_refcnt) == 0) {
2854                                 /* We are the only one with access to this
2855                                  * iclog.  Flush it out now.  There should
2856                                  * be a roundoff of zero to show that someone
2857                                  * has already taken care of the roundoff from
2858                                  * the previous sync.
2859                                  */
2860                                 atomic_inc(&iclog->ic_refcnt);
2861                                 lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2862                                 xlog_state_switch_iclogs(log, iclog, 0);
2863                                 spin_unlock(&log->l_icloglock);
2864
2865                                 if (xlog_state_release_iclog(log, iclog))
2866                                         return XFS_ERROR(EIO);
2867
2868                                 if (log_flushed)
2869                                         *log_flushed = 1;
2870                                 spin_lock(&log->l_icloglock);
2871                                 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
2872                                     iclog->ic_state != XLOG_STATE_DIRTY)
2873                                         goto maybe_sleep;
2874                                 else
2875                                         goto no_sleep;
2876                         } else {
2877                                 /* Someone else is writing to this iclog.
2878                                  * Use its call to flush out the data.  However,
2879                                  * the other thread may not force out this LR,
2880                                  * so we mark it WANT_SYNC.
2881                                  */
2882                                 xlog_state_switch_iclogs(log, iclog, 0);
2883                                 goto maybe_sleep;
2884                         }
2885                 }
2886         }
2887
2888         /* By the time we come around again, the iclog could've been filled
2889          * which would give it another lsn.  If we have a new lsn, just
2890          * return because the relevant data has been flushed.
2891          */
2892 maybe_sleep:
2893         if (flags & XFS_LOG_SYNC) {
2894                 /*
2895                  * We must check if we're shutting down here, before
2896                  * we wait, while we're holding the l_icloglock.
2897                  * Then we check again after waking up, in case our
2898  * sleep was disturbed by bad news.
2899                  */
2900                 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2901                         spin_unlock(&log->l_icloglock);
2902                         return XFS_ERROR(EIO);
2903                 }
2904                 XFS_STATS_INC(xs_log_force_sleep);
2905                 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
2906                 /*
2907                  * No need to grab the log lock here since we're
2908                  * only deciding whether or not to return EIO
2909                  * and the memory read should be atomic.
2910                  */
2911                 if (iclog->ic_state & XLOG_STATE_IOERROR)
2912                         return XFS_ERROR(EIO);
2913                 if (log_flushed)
2914                         *log_flushed = 1;
2915         } else {
2916
2917 no_sleep:
2918                 spin_unlock(&log->l_icloglock);
2919         }
2920         return 0;
2921 }
2922
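/*
 * A minimal sketch (illustration only; the helper name is hypothetical) of
 * the "no flushable data" test described in the comment above
 * _xfs_log_force(), taking the head iclog's state, whether it is empty, and
 * the previous iclog's state.
 */
static int example_has_flushable_data(int curr_state, int curr_empty,
                                      int prev_state)
{
        int quiescent = XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY;

        if ((curr_state == XLOG_STATE_DIRTY ||
             (curr_state == XLOG_STATE_ACTIVE && curr_empty)) &&
            (prev_state & quiescent))
                return 0;               /* nothing needs to be forced out */
        return 1;
}
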
2923 /*
2924  * Wrapper for _xfs_log_force(), to be used when caller doesn't care
2925  * about errors or whether the log was flushed or not. This is the normal
2926  * interface to use when trying to unpin items or move the log forward.
2927  */
2928 void
2929 xfs_log_force(
2930         xfs_mount_t     *mp,
2931         uint            flags)
2932 {
2933         int     error;
2934
2935         error = _xfs_log_force(mp, flags, NULL);
2936         if (error)
2937                 xfs_warn(mp, "%s: error %d returned.", __func__, error);
2938 }
2939
2940 /*
2941  * Force the in-core log to disk for a specific LSN.
2942  *
2943  * Find in-core log with lsn.
2944  *      If it is in the DIRTY state, just return.
2945  *      If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
2946  *              state and go to sleep or return.
2947  *      If it is in any other state, go to sleep or return.
2948  *
2949  * Synchronous forces are implemented with a signal variable. All callers
2950  * to force a given lsn to disk will wait on the sv attached to the
2951  * specific in-core log.  When the given in-core log finally completes its
2952  * write to disk, that thread will wake up all threads waiting on the
2953  * sv.
2954  */
2955 int
2956 _xfs_log_force_lsn(
2957         struct xfs_mount        *mp,
2958         xfs_lsn_t               lsn,
2959         uint                    flags,
2960         int                     *log_flushed)
2961 {
2962         struct log              *log = mp->m_log;
2963         struct xlog_in_core     *iclog;
2964         int                     already_slept = 0;
2965
2966         ASSERT(lsn != 0);
2967
2968         XFS_STATS_INC(xs_log_force);
2969
2970         lsn = xlog_cil_force_lsn(log, lsn);
2971         if (lsn == NULLCOMMITLSN)
2972                 return 0;
2973
2974 try_again:
2975         spin_lock(&log->l_icloglock);
2976         iclog = log->l_iclog;
2977         if (iclog->ic_state & XLOG_STATE_IOERROR) {
2978                 spin_unlock(&log->l_icloglock);
2979                 return XFS_ERROR(EIO);
2980         }
2981
2982         do {
2983                 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
2984                         iclog = iclog->ic_next;
2985                         continue;
2986                 }
2987
2988                 if (iclog->ic_state == XLOG_STATE_DIRTY) {
2989                         spin_unlock(&log->l_icloglock);
2990                         return 0;
2991                 }
2992
2993                 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
2994                         /*
2995                          * We sleep here if we haven't already slept (e.g.
2996                          * this is the first time we've looked at the correct
2997                          * iclog buf) and the buffer before us is going to
2998                          * be sync'ed. The reason for this is that if we
2999                          * are doing sync transactions here, by waiting for
3000                          * the previous I/O to complete, we can allow a few
3001                          * more transactions into this iclog before we close
3002                          * it down.
3003                          *
3004                          * Otherwise, we mark the buffer WANT_SYNC, and bump
3005                          * up the refcnt so we can release the log (which
3006                          * drops the ref count).  The state switch keeps new
3007                          * transaction commits from using this buffer.  When
3008                          * the current commits finish writing into the buffer,
3009                          * the refcount will drop to zero and the buffer will
3010                          * go out then.
3011                          */
3012                         if (!already_slept &&
3013                             (iclog->ic_prev->ic_state &
3014                              (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
3015                                 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
3016
3017                                 XFS_STATS_INC(xs_log_force_sleep);
3018
3019                                 xlog_wait(&iclog->ic_prev->ic_write_wait,
3020                                                         &log->l_icloglock);
3021                                 if (log_flushed)
3022                                         *log_flushed = 1;
3023                                 already_slept = 1;
3024                                 goto try_again;
3025                         }
3026                         atomic_inc(&iclog->ic_refcnt);
3027                         xlog_state_switch_iclogs(log, iclog, 0);
3028                         spin_unlock(&log->l_icloglock);
3029                         if (xlog_state_release_iclog(log, iclog))
3030                                 return XFS_ERROR(EIO);
3031                         if (log_flushed)
3032                                 *log_flushed = 1;
3033                         spin_lock(&log->l_icloglock);
3034                 }
3035
3036                 if ((flags & XFS_LOG_SYNC) && /* sleep */
3037                     !(iclog->ic_state &
3038                       (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
3039                         /*
3040                          * Don't wait on completion if we know that we've
3041                          * gotten a log write error.
3042                          */
3043                         if (iclog->ic_state & XLOG_STATE_IOERROR) {
3044                                 spin_unlock(&log->l_icloglock);
3045                                 return XFS_ERROR(EIO);
3046                         }
3047                         XFS_STATS_INC(xs_log_force_sleep);
3048                         xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3049                         /*
3050                          * No need to grab the log lock here since we're
3051                          * only deciding whether or not to return EIO
3052                          * and the memory read should be atomic.
3053                          */
3054                         if (iclog->ic_state & XLOG_STATE_IOERROR)
3055                                 return XFS_ERROR(EIO);
3056
3057                         if (log_flushed)
3058                                 *log_flushed = 1;
3059                 } else {                /* just return */
3060                         spin_unlock(&log->l_icloglock);
3061                 }
3062
3063                 return 0;
3064         } while (iclog != log->l_iclog);
3065
3066         spin_unlock(&log->l_icloglock);
3067         return 0;
3068 }
3069
3070 /*
3071  * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
3072  * about errors or whether the log was flushed or not. This is the normal
3073  * interface to use when trying to unpin items or move the log forward.
3074  */
3075 void
3076 xfs_log_force_lsn(
3077         xfs_mount_t     *mp,
3078         xfs_lsn_t       lsn,
3079         uint            flags)
3080 {
3081         int     error;
3082
3083         error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3084         if (error)
3085                 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3086 }
3087
3088 /*
3089  * Called when we want to mark the current iclog as being ready to sync to
3090  * disk.
3091  */
3092 STATIC void
3093 xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3094 {
3095         assert_spin_locked(&log->l_icloglock);
3096
3097         if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3098                 xlog_state_switch_iclogs(log, iclog, 0);
3099         } else {
3100                 ASSERT(iclog->ic_state &
3101                         (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
3102         }
3103 }
3104
3105
3106 /*****************************************************************************
3107  *
3108  *              TICKET functions
3109  *
3110  *****************************************************************************
3111  */
3112
3113 /*
3114  * Free a used ticket when its refcount falls to zero.
3115  */
3116 void
3117 xfs_log_ticket_put(
3118         xlog_ticket_t   *ticket)
3119 {
3120         ASSERT(atomic_read(&ticket->t_ref) > 0);
3121         if (atomic_dec_and_test(&ticket->t_ref))
3122                 kmem_zone_free(xfs_log_ticket_zone, ticket);
3123 }
3124
3125 xlog_ticket_t *
3126 xfs_log_ticket_get(
3127         xlog_ticket_t   *ticket)
3128 {
3129         ASSERT(atomic_read(&ticket->t_ref) > 0);
3130         atomic_inc(&ticket->t_ref);
3131         return ticket;
3132 }
3133
3134 /*
3135  * Allocate and initialise a new log ticket.
3136  */
3137 xlog_ticket_t *
3138 xlog_ticket_alloc(
3139         struct log      *log,
3140         int             unit_bytes,
3141         int             cnt,
3142         char            client,
3143         bool            permanent,
3144         int             alloc_flags)
3145 {
3146         struct xlog_ticket *tic;
3147         uint            num_headers;
3148         int             iclog_space;
3149
3150         tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3151         if (!tic)
3152                 return NULL;
3153
3154         /*
3155          * Permanent reservations have up to 'cnt'-1 active log operations
3156          * in the log.  A unit in this case is the amount of space for one
3157          * of these log operations.  Normal reservations have a cnt of 1
3158          * and their unit amount is the total amount of space required.
3159          *
3160          * The following lines of code account for non-transaction data
3161          * which occupy space in the on-disk log.
3162          *
3163          * Normal form of a transaction is:
3164          * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
3165          * and then there are LR hdrs, split-recs and roundoff at end of syncs.
3166          *
3167          * We need to account for all the leadup data and trailer data
3168          * around the transaction data.
3169          * And then we need to account for the worst case in terms of using
3170          * more space.
3171          * The worst case will happen if:
3172          * - the placement of the transaction happens to be such that the
3173          *   roundoff is at its maximum
3174          * - the transaction data is synced before the commit record is synced
3175          *   i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
3176          *   Therefore the commit record is in its own Log Record.
3177          *   This can happen as the commit record is called with its
3178          *   own region to xlog_write().
3179          *   This then means that in the worst case, roundoff can happen for
3180          *   the commit-rec as well.
3181          *   The commit-rec is smaller than padding in this scenario and so it is
3182          *   not added separately.
3183          */
3184
3185         /* for trans header */
3186         unit_bytes += sizeof(xlog_op_header_t);
3187         unit_bytes += sizeof(xfs_trans_header_t);
3188
3189         /* for start-rec */
3190         unit_bytes += sizeof(xlog_op_header_t);
3191
3192         /*
3193          * for LR headers - the space for data in an iclog is the size minus
3194          * the space used for the headers. If we use the iclog size, then we
3195          * undercalculate the number of headers required.
3196          *
3197          * Furthermore - the addition of op headers for split-recs might
3198          * increase the space required enough to require more log and op
3199          * headers, so take that into account too.
3200          *
3201          * IMPORTANT: This reservation makes the assumption that if this
3202          * transaction is the first in an iclog and hence has the LR headers
3203          * accounted to it, then the remaining space in the iclog is
3204          * exclusively for this transaction.  i.e. if the transaction is larger
3205          * than the iclog, it will be the only thing in that iclog.
3206          * Fundamentally, this means we must pass the entire log vector to
3207          * xlog_write to guarantee this.
3208          */
3209         iclog_space = log->l_iclog_size - log->l_iclog_hsize;
3210         num_headers = howmany(unit_bytes, iclog_space);
3211
3212         /* for split-recs - ophdrs added when data split over LRs */
3213         unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3214
3215         /* add extra header reservations if we overrun */
3216         while (!num_headers ||
3217                howmany(unit_bytes, iclog_space) > num_headers) {
3218                 unit_bytes += sizeof(xlog_op_header_t);
3219                 num_headers++;
3220         }
3221         unit_bytes += log->l_iclog_hsize * num_headers;
3222
3223         /* for commit-rec LR header - note: padding will subsume the ophdr */
3224         unit_bytes += log->l_iclog_hsize;
3225
3226         /* for roundoff padding for transaction data and one for commit record */
3227         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
3228             log->l_mp->m_sb.sb_logsunit > 1) {
3229                 /* log su roundoff */
3230                 unit_bytes += 2*log->l_mp->m_sb.sb_logsunit;
3231         } else {
3232                 /* BB roundoff */
3233                 unit_bytes += 2*BBSIZE;
3234         }
3235
3236         atomic_set(&tic->t_ref, 1);
3237         tic->t_task             = current;
3238         INIT_LIST_HEAD(&tic->t_queue);
3239         tic->t_unit_res         = unit_bytes;
3240         tic->t_curr_res         = unit_bytes;
3241         tic->t_cnt              = cnt;
3242         tic->t_ocnt             = cnt;
3243         tic->t_tid              = random32();
3244         tic->t_clientid         = client;
3245         tic->t_flags            = XLOG_TIC_INITED;
3246         tic->t_trans_type       = 0;
3247         if (permanent)
3248                 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3249
3250         xlog_tic_reset_res(tic);
3251
3252         return tic;
3253 }
3254
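/*
 * A rough sketch of the reservation sizing above (illustration only): the
 * function name and plain-int parameters are hypothetical; op_hdr and
 * trans_hdr stand in for sizeof(xlog_op_header_t) and
 * sizeof(xfs_trans_header_t), and sunit for sb_logsunit (pass 0 or 1 for a
 * v1 log so that the 512 byte basic-block roundoff is used instead).
 */
static int example_ticket_unit_bytes(int unit_bytes, int op_hdr, int trans_hdr,
                                     int iclog_size, int iclog_hsize, int sunit)
{
        int iclog_space = iclog_size - iclog_hsize;
        int num_headers;

        unit_bytes += op_hdr + trans_hdr;       /* transaction header + ophdr */
        unit_bytes += op_hdr;                   /* start record */

        /* round-up division, as the kernel's howmany() macro does */
        num_headers = (unit_bytes + iclog_space - 1) / iclog_space;
        unit_bytes += op_hdr * num_headers;     /* split-rec op headers */

        /* adding ophdrs may push us across another iclog boundary */
        while (num_headers == 0 ||
               (unit_bytes + iclog_space - 1) / iclog_space > num_headers) {
                unit_bytes += op_hdr;
                num_headers++;
        }
        unit_bytes += iclog_hsize * num_headers;        /* LR headers */
        unit_bytes += iclog_hsize;              /* commit record LR header */
        unit_bytes += 2 * (sunit > 1 ? sunit : 512);    /* roundoff padding */

        return unit_bytes;
}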
3255
3256 /******************************************************************************
3257  *
3258  *              Log debug routines
3259  *
3260  ******************************************************************************
3261  */
3262 #if defined(DEBUG)
3263 /*
3264  * Make sure that the destination ptr is within the valid data region of
3265  * one of the iclogs.  This uses backup pointers stored in a different
3266  * part of the log in case we trash the log structure.
3267  */
3268 void
3269 xlog_verify_dest_ptr(
3270         struct log      *log,
3271         char            *ptr)
3272 {
3273         int i;
3274         int good_ptr = 0;
3275
3276         for (i = 0; i < log->l_iclog_bufs; i++) {
3277                 if (ptr >= log->l_iclog_bak[i] &&
3278                     ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
3279                         good_ptr++;
3280         }
3281
3282         if (!good_ptr)
3283                 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3284 }
3285
3286 /*
3287  * Check to make sure the grant write head didn't just overlap the tail.  If
3288  * the cycles are the same, we can't be overlapping.  Otherwise, make sure that
3289  * the cycles differ by exactly one and check the byte count.
3290  *
3291  * This check is run unlocked, so can give false positives. Rather than assert
3292  * on failures, use a warn-once flag and a panic tag to allow the admin to
3293  * determine if they want to panic the machine when such an error occurs. For
3294  * debug kernels this will have the same effect as using an assert but, unlike
3295  * an assert, it can be turned off at runtime.
3296  */
3297 STATIC void
3298 xlog_verify_grant_tail(
3299         struct log      *log)
3300 {
3301         int             tail_cycle, tail_blocks;
3302         int             cycle, space;
3303
3304         xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space);
3305         xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3306         if (tail_cycle != cycle) {
3307                 if (cycle - 1 != tail_cycle &&
3308                     !(log->l_flags & XLOG_TAIL_WARN)) {
3309                         xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3310                                 "%s: cycle - 1 != tail_cycle", __func__);
3311                         log->l_flags |= XLOG_TAIL_WARN;
3312                 }
3313
3314                 if (space > BBTOB(tail_blocks) &&
3315                     !(log->l_flags & XLOG_TAIL_WARN)) {
3316                         xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3317                                 "%s: space > BBTOB(tail_blocks)", __func__);
3318                         log->l_flags |= XLOG_TAIL_WARN;
3319                 }
3320         }
3321 }
3322
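/*
 * Worked example (illustration only): suppose the write grant head cracks to
 * cycle 7, space 120000 bytes and the tail LSN cracks to cycle 6, block 100.
 * The cycles differ by exactly one, so the head may consume at most
 * BBTOB(100) = 51200 bytes of the new cycle before it would overwrite the
 * tail; since 120000 > 51200, the second XFS_PTAG_LOGRES alert above would
 * fire (once, as XLOG_TAIL_WARN is then set).
 */
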
3323 /* check if it will fit */
3324 STATIC void
3325 xlog_verify_tail_lsn(xlog_t         *log,
3326                      xlog_in_core_t *iclog,
3327                      xfs_lsn_t      tail_lsn)
3328 {
3329     int blocks;
3330
3331     if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
3332         blocks =
3333             log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3334         if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3335                 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3336     } else {
3337         ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3338
3339         if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3340                 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3341
3342         blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3343         if (blocks < BTOBB(iclog->ic_offset) + 1)
3344                 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3345     }
3346 }       /* xlog_verify_tail_lsn */
3347
3348 /*
3349  * Perform a number of checks on the iclog before writing to disk.
3350  *
3351  * 1. Make sure the iclogs are still circular
3352  * 2. Make sure we have a good magic number
3353  * 3. Make sure we don't have magic numbers in the data
3354  * 4. Check fields of each log operation header for:
3355  *      A. Valid client identifier
3356  *      B. tid ptr value falls in valid ptr space (user space code)
3357  *      C. Length in log record header is correct according to the
3358  *              individual operation headers within record.
3359  * 5. When a bwrite will occur within 5 blocks of the front of the physical
3360  *      log, check the preceding blocks of the physical log to make sure all
3361  *      the cycle numbers agree with the current cycle number.
3362  */
3363 STATIC void
3364 xlog_verify_iclog(xlog_t         *log,
3365                   xlog_in_core_t *iclog,
3366                   int            count,
3367                   boolean_t      syncing)
3368 {
3369         xlog_op_header_t        *ophead;
3370         xlog_in_core_t          *icptr;
3371         xlog_in_core_2_t        *xhdr;
3372         xfs_caddr_t             ptr;
3373         xfs_caddr_t             base_ptr;
3374         __psint_t               field_offset;
3375         __uint8_t               clientid;
3376         int                     len, i, j, k, op_len;
3377         int                     idx;
3378
3379         /* check validity of iclog pointers */
3380         spin_lock(&log->l_icloglock);
3381         icptr = log->l_iclog;
3382         for (i=0; i < log->l_iclog_bufs; i++) {
3383                 if (icptr == NULL)
3384                         xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3385                 icptr = icptr->ic_next;
3386         }
3387         if (icptr != log->l_iclog)
3388                 xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3389         spin_unlock(&log->l_icloglock);
3390
3391         /* check log magic numbers */
3392         if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
3393                 xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3394
3395         ptr = (xfs_caddr_t) &iclog->ic_header;
3396         for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
3397              ptr += BBSIZE) {
3398                 if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
3399                         xfs_emerg(log->l_mp, "%s: unexpected magic num",
3400                                 __func__);
3401         }
3402
3403         /* check fields */
3404         len = be32_to_cpu(iclog->ic_header.h_num_logops);
3405         ptr = iclog->ic_datap;
3406         base_ptr = ptr;
3407         ophead = (xlog_op_header_t *)ptr;
3408         xhdr = iclog->ic_data;
3409         for (i = 0; i < len; i++) {
3410                 ophead = (xlog_op_header_t *)ptr;
3411
3412                 /* clientid is only 1 byte */
3413                 field_offset = (__psint_t)
3414                                ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
3415                 if (syncing == B_FALSE || (field_offset & 0x1ff)) {
3416                         clientid = ophead->oh_clientid;
3417                 } else {
3418                         idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
3419                         if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3420                                 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3421                                 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3422                                 clientid = xlog_get_client_id(
3423                                         xhdr[j].hic_xheader.xh_cycle_data[k]);
3424                         } else {
3425                                 clientid = xlog_get_client_id(
3426                                         iclog->ic_header.h_cycle_data[idx]);
3427                         }
3428                 }
3429                 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3430                         xfs_warn(log->l_mp,
3431                                 "%s: invalid clientid %d op 0x%p offset 0x%lx",
3432                                 __func__, clientid, ophead,
3433                                 (unsigned long)field_offset);
3434
3435                 /* check length */
3436                 field_offset = (__psint_t)
3437                                ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
3438                 if (syncing == B_FALSE || (field_offset & 0x1ff)) {
3439                         op_len = be32_to_cpu(ophead->oh_len);
3440                 } else {
3441                         idx = BTOBBT((__psint_t)&ophead->oh_len -
3442                                     (__psint_t)iclog->ic_datap);
3443                         if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3444                                 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3445                                 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3446                                 op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]);
3447                         } else {
3448                                 op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
3449                         }
3450                 }
3451                 ptr += sizeof(xlog_op_header_t) + op_len;
3452         }
3453 }       /* xlog_verify_iclog */
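
/*
 * Worked example (illustration only) of the extended-header indexing used
 * above: with BBSIZE = 512 and XLOG_HEADER_CYCLE_SIZE = 32768, each log
 * record header covers 32768 / 512 = 64 basic blocks.  A block offset of
 * idx = 70 therefore gives j = 70 / 64 = 1 and k = 70 % 64 = 6, i.e. the
 * saved clientid/length word is found in slot 6 of the first extended
 * header, xhdr[1].
 */
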
3454 #endif
3455
3456 /*
3457  * Mark all iclogs IOERROR. l_icloglock is held by the caller.
3458  */
3459 STATIC int
3460 xlog_state_ioerror(
3461         xlog_t  *log)
3462 {
3463         xlog_in_core_t  *iclog, *ic;
3464
3465         iclog = log->l_iclog;
3466         if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
3467                 /*
3468                  * Mark all the incore logs IOERROR.
3469                  * From now on, no log flushes will result.
3470                  */
3471                 ic = iclog;
3472                 do {
3473                         ic->ic_state = XLOG_STATE_IOERROR;
3474                         ic = ic->ic_next;
3475                 } while (ic != iclog);
3476                 return 0;
3477         }
3478         /*
3479          * Return non-zero if the state transition has already happened.
3480          */
3481         return 1;
3482 }
3483
3484 /*
3485  * This is called from xfs_force_shutdown, when we're forcibly
3486  * shutting down the filesystem, typically because of an IO error.
3487  * Our main objectives here are to make sure that:
3488  *      a. the filesystem gets marked 'SHUTDOWN' for all interested
3489  *         parties to find out, 'atomically'.
3490  *      b. those who're sleeping on log reservations, pinned objects and
3491  *          other resources get woken up and are told the bad news.
3492  *      c. nothing new gets queued up after (a) and (b) are done.
3493  *      d. if !logerror, flush the iclogs to disk, then seal them off
3494  *         for business.
3495  *
3496  * Note: for delayed logging the !logerror case needs to flush the regions
3497  * held in memory out to the iclogs before flushing them to disk. This needs
3498  * to be done before the log is marked as shutdown, otherwise the flush to the
3499  * iclogs will fail.
3500  */
3501 int
3502 xfs_log_force_umount(
3503         struct xfs_mount        *mp,
3504         int                     logerror)
3505 {
3506         xlog_t          *log;
3507         int             retval;
3508
3509         log = mp->m_log;
3510
3511         /*
3512          * If this happens during log recovery, don't worry about
3513          * locking; the log isn't open for business yet.
3514          */
3515         if (!log ||
3516             log->l_flags & XLOG_ACTIVE_RECOVERY) {
3517                 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3518                 if (mp->m_sb_bp)
3519                         XFS_BUF_DONE(mp->m_sb_bp);
3520                 return 0;
3521         }
3522
3523         /*
3524          * Somebody could've already done the hard work for us.
3525          * No need to get locks for this.
3526          */
3527         if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
3528                 ASSERT(XLOG_FORCED_SHUTDOWN(log));
3529                 return 1;
3530         }
3531         retval = 0;
3532
3533         /*
3534          * Flush the in memory commit item list before marking the log as
3535          * being shut down. We need to do it in this order to ensure all the
3536          * completed transactions are flushed to disk with the xfs_log_force()
3537          * call below.
3538          */
3539         if (!logerror)
3540                 xlog_cil_force(log);
3541
3542         /*
3543  * mark the filesystem and the log as being in a shutdown state and wake
3544          * everybody up to tell them the bad news.
3545          */
3546         spin_lock(&log->l_icloglock);
3547         mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3548         if (mp->m_sb_bp)
3549                 XFS_BUF_DONE(mp->m_sb_bp);
3550
3551         /*
3552          * This flag is sort of redundant because of the mount flag, but
3553          * it's good to maintain the separation between the log and the rest
3554          * of XFS.
3555          */
3556         log->l_flags |= XLOG_IO_ERROR;
3557
3558         /*
3559          * If we hit a log error, we want to mark all the iclogs IOERROR
3560          * while we're still holding the loglock.
3561          */
3562         if (logerror)
3563                 retval = xlog_state_ioerror(log);
3564         spin_unlock(&log->l_icloglock);
3565
3566         /*
3567          * We don't want anybody waiting for log reservations after this. That
3568          * means we have to wake up everybody queued up on reserveq as well as
3569          * writeq.  In addition, we make sure in xlog_{re}grant_log_space that
3570          * we don't enqueue anything once the SHUTDOWN flag is set, and this
3571          * action is protected by the grant locks.
3572          */
3573         xlog_grant_head_wake_all(&log->l_reserve_head);
3574         xlog_grant_head_wake_all(&log->l_write_head);
3575
3576         if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3577                 ASSERT(!logerror);
3578                 /*
3579                  * Force the incore logs to disk before shutting the
3580                  * log down completely.
3581                  */
3582                 _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3583
3584                 spin_lock(&log->l_icloglock);
3585                 retval = xlog_state_ioerror(log);
3586                 spin_unlock(&log->l_icloglock);
3587         }
3588         /*
3589          * Wake up everybody waiting on xfs_log_force.
3590          * Callback all log item committed functions as if the
3591          * log writes were completed.
3592          */
3593         xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
3594
3595 #ifdef XFSERRORDEBUG
3596         {
3597                 xlog_in_core_t  *iclog;
3598
3599                 spin_lock(&log->l_icloglock);
3600                 iclog = log->l_iclog;
3601                 do {
3602                         ASSERT(iclog->ic_callback == 0);
3603                         iclog = iclog->ic_next;
3604                 } while (iclog != log->l_iclog);
3605                 spin_unlock(&log->l_icloglock);
3606         }
3607 #endif
3608         /* return non-zero if log IOERROR transition had already happened */
3609         return retval;
3610 }
3611
3612 STATIC int
3613 xlog_iclogs_empty(xlog_t *log)
3614 {
3615         xlog_in_core_t  *iclog;
3616
3617         iclog = log->l_iclog;
3618         do {
3619                 /* endianness does not matter here, zero is zero in
3620                  * any language.
3621                  */
3622                 if (iclog->ic_header.h_num_logops)
3623                         return 0;
3624                 iclog = iclog->ic_next;
3625         } while (iclog != log->l_iclog);
3626         return 1;
3627 }