c8c59a5b1a8fc292f079a8f6d379b09d2a8ae83b
[karo-tx-linux.git] / fs / nfs / blocklayout / extent_tree.c
1 /*
2  * Copyright (c) 2014 Christoph Hellwig.
3  */
4
5 #include "blocklayout.h"
6
7 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
8
9 static inline struct pnfs_block_extent *
10 ext_node(struct rb_node *node)
11 {
12         return rb_entry(node, struct pnfs_block_extent, be_node);
13 }
14
15 static struct pnfs_block_extent *
16 ext_tree_first(struct rb_root *root)
17 {
18         struct rb_node *node = rb_first(root);
19         return node ? ext_node(node) : NULL;
20 }
21
22 static struct pnfs_block_extent *
23 ext_tree_prev(struct pnfs_block_extent *be)
24 {
25         struct rb_node *node = rb_prev(&be->be_node);
26         return node ? ext_node(node) : NULL;
27 }
28
29 static struct pnfs_block_extent *
30 ext_tree_next(struct pnfs_block_extent *be)
31 {
32         struct rb_node *node = rb_next(&be->be_node);
33         return node ? ext_node(node) : NULL;
34 }
35
36 static inline sector_t
37 ext_f_end(struct pnfs_block_extent *be)
38 {
39         return be->be_f_offset + be->be_length;
40 }
41
42 static struct pnfs_block_extent *
43 __ext_tree_search(struct rb_root *root, sector_t start)
44 {
45         struct rb_node *node = root->rb_node;
46         struct pnfs_block_extent *be = NULL;
47
48         while (node) {
49                 be = ext_node(node);
50                 if (start < be->be_f_offset)
51                         node = node->rb_left;
52                 else if (start >= ext_f_end(be))
53                         node = node->rb_right;
54                 else
55                         return be;
56         }
57
58         if (be) {
59                 if (start < be->be_f_offset)
60                         return be;
61
62                 if (start >= ext_f_end(be))
63                         return ext_tree_next(be);
64         }
65
66         return NULL;
67 }
68
69 static bool
70 ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
71 {
72         if (be1->be_state != be2->be_state)
73                 return false;
74         if (be1->be_mdev != be2->be_mdev)
75                 return false;
76
77         if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
78                 return false;
79
80         if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
81             (be1->be_v_offset + be1->be_length != be2->be_v_offset))
82                 return false;
83
84         if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
85             be1->be_tag != be2->be_tag)
86                 return false;
87
88         return true;
89 }
90
91 static struct pnfs_block_extent *
92 ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
93 {
94         struct pnfs_block_extent *left = ext_tree_prev(be);
95
96         if (left && ext_can_merge(left, be)) {
97                 left->be_length += be->be_length;
98                 rb_erase(&be->be_node, root);
99                 kfree(be);
100                 return left;
101         }
102
103         return be;
104 }
105
106 static struct pnfs_block_extent *
107 ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
108 {
109         struct pnfs_block_extent *right = ext_tree_next(be);
110
111         if (right && ext_can_merge(be, right)) {
112                 be->be_length += right->be_length;
113                 rb_erase(&right->be_node, root);
114                 kfree(right);
115         }
116
117         return be;
118 }
119
120 static void
121 __ext_tree_insert(struct rb_root *root,
122                 struct pnfs_block_extent *new, bool merge_ok)
123 {
124         struct rb_node **p = &root->rb_node, *parent = NULL;
125         struct pnfs_block_extent *be;
126
127         while (*p) {
128                 parent = *p;
129                 be = ext_node(parent);
130
131                 if (new->be_f_offset < be->be_f_offset) {
132                         if (merge_ok && ext_can_merge(new, be)) {
133                                 be->be_f_offset = new->be_f_offset;
134                                 if (be->be_state != PNFS_BLOCK_NONE_DATA)
135                                         be->be_v_offset = new->be_v_offset;
136                                 be->be_length += new->be_length;
137                                 be = ext_try_to_merge_left(root, be);
138                                 kfree(new);
139                                 return;
140                         }
141                         p = &(*p)->rb_left;
142                 } else if (new->be_f_offset >= ext_f_end(be)) {
143                         if (merge_ok && ext_can_merge(be, new)) {
144                                 be->be_length += new->be_length;
145                                 be = ext_try_to_merge_right(root, be);
146                                 kfree(new);
147                                 return;
148                         }
149                         p = &(*p)->rb_right;
150                 } else {
151                         BUG();
152                 }
153         }
154
155         rb_link_node(&new->be_node, parent, p);
156         rb_insert_color(&new->be_node, root);
157 }
158
159 static int
160 __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
161 {
162         struct pnfs_block_extent *be;
163         sector_t len1 = 0, len2 = 0;
164         sector_t orig_f_offset;
165         sector_t orig_v_offset;
166         sector_t orig_len;
167
168         be = __ext_tree_search(root, start);
169         if (!be)
170                 return 0;
171         if (be->be_f_offset >= end)
172                 return 0;
173
174         orig_f_offset = be->be_f_offset;
175         orig_v_offset = be->be_v_offset;
176         orig_len = be->be_length;
177
178         if (start > be->be_f_offset)
179                 len1 = start - be->be_f_offset;
180         if (ext_f_end(be) > end)
181                 len2 = ext_f_end(be) - end;
182
183         if (len2 > 0) {
184                 if (len1 > 0) {
185                         struct pnfs_block_extent *new;
186
187                         new = kzalloc(sizeof(*new), GFP_ATOMIC);
188                         if (!new)
189                                 return -ENOMEM;
190
191                         be->be_length = len1;
192
193                         new->be_f_offset = end;
194                         if (be->be_state != PNFS_BLOCK_NONE_DATA) {
195                                 new->be_v_offset =
196                                         orig_v_offset + orig_len - len2;
197                         }
198                         new->be_length = len2;
199                         new->be_state = be->be_state;
200                         new->be_tag = be->be_tag;
201                         new->be_mdev = be->be_mdev;
202                         memcpy(&new->be_devid, &be->be_devid,
203                                 sizeof(struct nfs4_deviceid));
204
205                         __ext_tree_insert(root, new, true);
206                 } else {
207                         be->be_f_offset = end;
208                         if (be->be_state != PNFS_BLOCK_NONE_DATA) {
209                                 be->be_v_offset =
210                                         orig_v_offset + orig_len - len2;
211                         }
212                         be->be_length = len2;
213                 }
214         } else {
215                 if (len1 > 0) {
216                         be->be_length = len1;
217                         be = ext_tree_next(be);
218                 }
219
220                 while (be && ext_f_end(be) <= end) {
221                         struct pnfs_block_extent *next = ext_tree_next(be);
222
223                         rb_erase(&be->be_node, root);
224                         kfree(be);
225                         be = next;
226                 }
227
228                 if (be && be->be_f_offset < end) {
229                         len1 = ext_f_end(be) - end;
230                         be->be_f_offset = end;
231                         if (be->be_state != PNFS_BLOCK_NONE_DATA)
232                                 be->be_v_offset += be->be_length - len1;
233                         be->be_length = len1;
234                 }
235         }
236
237         return 0;
238 }
239
240 int
241 ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
242 {
243         struct pnfs_block_extent *be;
244         struct rb_root *root;
245         int err = 0;
246
247         switch (new->be_state) {
248         case PNFS_BLOCK_READWRITE_DATA:
249         case PNFS_BLOCK_INVALID_DATA:
250                 root = &bl->bl_ext_rw;
251                 break;
252         case PNFS_BLOCK_READ_DATA:
253         case PNFS_BLOCK_NONE_DATA:
254                 root = &bl->bl_ext_ro;
255                 break;
256         default:
257                 dprintk("invalid extent type\n");
258                 return -EINVAL;
259         }
260
261         spin_lock(&bl->bl_ext_lock);
262 retry:
263         be = __ext_tree_search(root, new->be_f_offset);
264         if (!be || be->be_f_offset >= ext_f_end(new)) {
265                 __ext_tree_insert(root, new, true);
266         } else if (new->be_f_offset >= be->be_f_offset) {
267                 if (ext_f_end(new) <= ext_f_end(be)) {
268                         kfree(new);
269                 } else {
270                         sector_t new_len = ext_f_end(new) - ext_f_end(be);
271                         sector_t diff = new->be_length - new_len;
272
273                         new->be_f_offset += diff;
274                         new->be_v_offset += diff;
275                         new->be_length = new_len;
276                         goto retry;
277                 }
278         } else if (ext_f_end(new) <= ext_f_end(be)) {
279                 new->be_length = be->be_f_offset - new->be_f_offset;
280                 __ext_tree_insert(root, new, true);
281         } else {
282                 struct pnfs_block_extent *split;
283                 sector_t new_len = ext_f_end(new) - ext_f_end(be);
284                 sector_t diff = new->be_length - new_len;
285
286                 split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
287                 if (!split) {
288                         err = -EINVAL;
289                         goto out;
290                 }
291
292                 split->be_length = be->be_f_offset - split->be_f_offset;
293                 __ext_tree_insert(root, split, true);
294
295                 new->be_f_offset += diff;
296                 new->be_v_offset += diff;
297                 new->be_length = new_len;
298                 goto retry;
299         }
300 out:
301         spin_unlock(&bl->bl_ext_lock);
302         return err;
303 }
304
305 static bool
306 __ext_tree_lookup(struct rb_root *root, sector_t isect,
307                 struct pnfs_block_extent *ret)
308 {
309         struct rb_node *node;
310         struct pnfs_block_extent *be;
311
312         node = root->rb_node;
313         while (node) {
314                 be = ext_node(node);
315                 if (isect < be->be_f_offset)
316                         node = node->rb_left;
317                 else if (isect >= ext_f_end(be))
318                         node = node->rb_right;
319                 else {
320                         *ret = *be;
321                         return true;
322                 }
323         }
324
325         return false;
326 }
327
328 bool
329 ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
330             struct pnfs_block_extent *ret, bool rw)
331 {
332         bool found = false;
333
334         spin_lock(&bl->bl_ext_lock);
335         if (!rw)
336                 found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
337         if (!found)
338                 found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
339         spin_unlock(&bl->bl_ext_lock);
340
341         return found;
342 }
343
344 int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
345                 sector_t start, sector_t end)
346 {
347         int err, err2;
348
349         spin_lock(&bl->bl_ext_lock);
350         err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
351         if (rw) {
352                 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
353                 if (!err)
354                         err = err2;
355         }
356         spin_unlock(&bl->bl_ext_lock);
357
358         return err;
359 }
360
361 static int
362 ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
363                 sector_t split)
364 {
365         struct pnfs_block_extent *new;
366         sector_t orig_len = be->be_length;
367
368         dprintk("%s: need split for 0x%lx:0x%lx at 0x%lx\n",
369                 __func__, be->be_f_offset, ext_f_end(be), split);
370
371         new = kzalloc(sizeof(*new), GFP_ATOMIC);
372         if (!new)
373                 return -ENOMEM;
374
375         be->be_length = split - be->be_f_offset;
376
377         new->be_f_offset = split;
378         if (be->be_state != PNFS_BLOCK_NONE_DATA)
379                 new->be_v_offset = be->be_v_offset + be->be_length;
380         new->be_length = orig_len - be->be_length;
381         new->be_state = be->be_state;
382         new->be_tag = be->be_tag;
383
384         new->be_mdev = be->be_mdev;
385         memcpy(&new->be_devid, &be->be_devid, sizeof(struct nfs4_deviceid));
386
387         dprintk("%s: got 0x%lx:0x%lx!\n",
388                 __func__, be->be_f_offset, ext_f_end(be));
389         dprintk("%s: got 0x%lx:0x%lx!\n",
390                 __func__, new->be_f_offset, ext_f_end(new));
391
392         __ext_tree_insert(root, new, false);
393         return 0;
394 }
395
396 int
397 ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
398                 sector_t len)
399 {
400         struct rb_root *root = &bl->bl_ext_rw;
401         sector_t end = start + len;
402         struct pnfs_block_extent *be;
403         int err = 0;
404
405         spin_lock(&bl->bl_ext_lock);
406         /*
407          * First remove all COW extents or holes from written to range.
408          */
409         err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
410         if (err)
411                 goto out;
412
413         /*
414          * Then mark all invalid extents in the range as written to.
415          */
416         for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
417                 if (be->be_f_offset >= end)
418                         break;
419
420                 if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
421                         continue;
422
423                 if (be->be_f_offset < start) {
424                         struct pnfs_block_extent *left = ext_tree_prev(be);
425
426                         if (left && ext_can_merge(left, be)) {
427                                 sector_t diff = start - be->be_f_offset;
428
429                                 left->be_length += diff;
430
431                                 be->be_f_offset += diff;
432                                 be->be_v_offset += diff;
433                                 be->be_length -= diff;
434                         } else {
435                                 err = ext_tree_split(root, be, start);
436                                 if (err)
437                                         goto out;
438                         }
439                 }
440
441                 if (ext_f_end(be) > end) {
442                         struct pnfs_block_extent *right = ext_tree_next(be);
443
444                         if (right && ext_can_merge(be, right)) {
445                                 sector_t diff = end - be->be_f_offset;
446
447                                 be->be_length -= diff;
448
449                                 right->be_f_offset -= diff;
450                                 right->be_v_offset -= diff;
451                                 right->be_length += diff;
452                         } else {
453                                 err = ext_tree_split(root, be, end);
454                                 if (err)
455                                         goto out;
456                         }
457                 }
458
459                 if (be->be_f_offset >= start && ext_f_end(be) <= end) {
460                         be->be_tag = EXTENT_WRITTEN;
461                         be = ext_try_to_merge_left(root, be);
462                         be = ext_try_to_merge_right(root, be);
463                 }
464         }
465 out:
466         spin_unlock(&bl->bl_ext_lock);
467         return err;
468 }
469
470 int
471 ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr)
472 {
473         struct pnfs_block_extent *be;
474         unsigned int count = 0;
475         __be32 *p, *xdr_start;
476         int ret = 0;
477
478         dprintk("%s enter\n", __func__);
479
480         xdr_start = xdr_reserve_space(xdr, 8);
481         if (!xdr_start)
482                 return -ENOSPC;
483
484         spin_lock(&bl->bl_ext_lock);
485         for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
486                 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
487                     be->be_tag != EXTENT_WRITTEN)
488                         continue;
489
490                 p = xdr_reserve_space(xdr, 7 * sizeof(__be32) +
491                                         NFS4_DEVICEID4_SIZE);
492                 if (!p) {
493                         printk("%s: out of space for extent list\n", __func__);
494                         ret = -ENOSPC;
495                         break;
496                 }
497
498                 p = xdr_encode_opaque_fixed(p, be->be_devid.data,
499                                 NFS4_DEVICEID4_SIZE);
500                 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
501                 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
502                 p = xdr_encode_hyper(p, 0LL);
503                 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
504
505                 be->be_tag = EXTENT_COMMITTING;
506                 count++;
507         }
508         spin_unlock(&bl->bl_ext_lock);
509
510         xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
511         xdr_start[1] = cpu_to_be32(count);
512
513         dprintk("%s found %i ranges\n", __func__, count);
514         return ret;
515 }
516
517 void
518 ext_tree_mark_committed(struct pnfs_block_layout *bl, int status)
519 {
520         struct rb_root *root = &bl->bl_ext_rw;
521         struct pnfs_block_extent *be;
522
523         dprintk("%s status %d\n", __func__, status);
524
525         spin_lock(&bl->bl_ext_lock);
526         for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
527                 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
528                     be->be_tag != EXTENT_COMMITTING)
529                         continue;
530
531                 if (status) {
532                         /*
533                          * Mark as written and try again.
534                          *
535                          * XXX: some real error handling here wouldn't hurt..
536                          */
537                         be->be_tag = EXTENT_WRITTEN;
538                 } else {
539                         be->be_state = PNFS_BLOCK_READWRITE_DATA;
540                         be->be_tag = 0;
541                 }
542
543                 be = ext_try_to_merge_left(root, be);
544                 be = ext_try_to_merge_right(root, be);
545         }
546         spin_unlock(&bl->bl_ext_lock);
547 }