1 #include <linux/errno.h>
2 #include <linux/numa.h>
3 #include <linux/slab.h>
4 #include <linux/rculist.h>
5 #include <linux/threads.h>
6 #include <linux/preempt.h>
7 #include <linux/irqflags.h>
8 #include <linux/vmalloc.h>
9 #include <linux/mm.h>
10 #include <linux/module.h>
11 #include <linux/device-mapper.h>
12
13 #include "dm.h"
14 #include "dm-stats.h"
15
16 #define DM_MSG_PREFIX "stats"
17
18 static int dm_stat_need_rcu_barrier;
19
20 /*
21  * Using 64-bit values to avoid overflow (which is a
22  * problem that block/genhd.c's IO accounting has).
23  */
24 struct dm_stat_percpu {
25         unsigned long long sectors[2];
26         unsigned long long ios[2];
27         unsigned long long merges[2];
28         unsigned long long ticks[2];
29         unsigned long long io_ticks[2];
30         unsigned long long io_ticks_total;
31         unsigned long long time_in_queue;
32         unsigned long long *histogram;
33 };
34
35 struct dm_stat_shared {
36         atomic_t in_flight[2];
37         unsigned long long stamp;
38         struct dm_stat_percpu tmp;
39 };
40
41 struct dm_stat {
42         struct list_head list_entry;
43         int id;
44         unsigned stat_flags;
45         size_t n_entries;
46         sector_t start;
47         sector_t end;
48         sector_t step;
49         unsigned n_histogram_entries;
50         unsigned long long *histogram_boundaries;
51         const char *program_id;
52         const char *aux_data;
53         struct rcu_head rcu_head;
54         size_t shared_alloc_size;
55         size_t percpu_alloc_size;
56         size_t histogram_alloc_size;
57         struct dm_stat_percpu *stat_percpu[NR_CPUS];
58         struct dm_stat_shared stat_shared[0];
59 };
60
61 #define STAT_PRECISE_TIMESTAMPS         1
62
63 struct dm_stats_last_position {
64         sector_t last_sector;
65         unsigned last_rw;
66 };
67
68 /*
69  * A typo on the command line could make the kernel run out of memory and
70  * crash.  To prevent that we account all the memory we allocate and fail
71  * if we would exhaust 1/4 of all memory or 1/2 of the vmalloc space.
72  */
73 #define DM_STATS_MEMORY_FACTOR          4
74 #define DM_STATS_VMALLOC_FACTOR         2
75
76 static DEFINE_SPINLOCK(shared_memory_lock);
77
78 static unsigned long shared_memory_amount;
79
80 static bool __check_shared_memory(size_t alloc_size)
81 {
82         size_t a;
83
84         a = shared_memory_amount + alloc_size;
85         if (a < shared_memory_amount)
86                 return false;
87         if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
88                 return false;
89 #ifdef CONFIG_MMU
90         if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
91                 return false;
92 #endif
93         return true;
94 }
95
96 static bool check_shared_memory(size_t alloc_size)
97 {
98         bool ret;
99
100         spin_lock_irq(&shared_memory_lock);
101
102         ret = __check_shared_memory(alloc_size);
103
104         spin_unlock_irq(&shared_memory_lock);
105
106         return ret;
107 }
108
109 static bool claim_shared_memory(size_t alloc_size)
110 {
111         spin_lock_irq(&shared_memory_lock);
112
113         if (!__check_shared_memory(alloc_size)) {
114                 spin_unlock_irq(&shared_memory_lock);
115                 return false;
116         }
117
118         shared_memory_amount += alloc_size;
119
120         spin_unlock_irq(&shared_memory_lock);
121
122         return true;
123 }
124
125 static void free_shared_memory(size_t alloc_size)
126 {
127         unsigned long flags;
128
129         spin_lock_irqsave(&shared_memory_lock, flags);
130
131         if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
132                 spin_unlock_irqrestore(&shared_memory_lock, flags);
133                 DMCRIT("Memory usage accounting bug.");
134                 return;
135         }
136
137         shared_memory_amount -= alloc_size;
138
139         spin_unlock_irqrestore(&shared_memory_lock, flags);
140 }
141
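/*
 * dm_kvzalloc - allocate zeroed, accounted memory.
 *
 * The allocation is charged against the shared-memory limit first; small
 * requests are tried with kzalloc_node(), and anything that fails or is
 * larger than KMALLOC_MAX_SIZE falls back to vzalloc_node().
 */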
142 static void *dm_kvzalloc(size_t alloc_size, int node)
143 {
144         void *p;
145
146         if (!claim_shared_memory(alloc_size))
147                 return NULL;
148
149         if (alloc_size <= KMALLOC_MAX_SIZE) {
150                 p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
151                 if (p)
152                         return p;
153         }
154         p = vzalloc_node(alloc_size, node);
155         if (p)
156                 return p;
157
158         free_shared_memory(alloc_size);
159
160         return NULL;
161 }
162
163 static void dm_kvfree(void *ptr, size_t alloc_size)
164 {
165         if (!ptr)
166                 return;
167
168         free_shared_memory(alloc_size);
169
170         kvfree(ptr);
171 }
172
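/*
 * Free a region and everything hanging off it (program_id, aux_data, the
 * per-cpu counter arrays and the histogram buffers).  Called directly or
 * as an RCU callback.
 */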
173 static void dm_stat_free(struct rcu_head *head)
174 {
175         int cpu;
176         struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
177
178         kfree(s->program_id);
179         kfree(s->aux_data);
180         for_each_possible_cpu(cpu) {
181                 dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
182                 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
183         }
184         dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
185         dm_kvfree(s, s->shared_alloc_size);
186 }
187
188 static int dm_stat_in_flight(struct dm_stat_shared *shared)
189 {
190         return atomic_read(&shared->in_flight[READ]) +
191                atomic_read(&shared->in_flight[WRITE]);
192 }
193
194 void dm_stats_init(struct dm_stats *stats)
195 {
196         int cpu;
197         struct dm_stats_last_position *last;
198
199         mutex_init(&stats->mutex);
200         INIT_LIST_HEAD(&stats->list);
201         stats->last = alloc_percpu(struct dm_stats_last_position);
202         for_each_possible_cpu(cpu) {
203                 last = per_cpu_ptr(stats->last, cpu);
204                 last->last_sector = (sector_t)ULLONG_MAX;
205                 last->last_rw = UINT_MAX;
206         }
207 }
208
209 void dm_stats_cleanup(struct dm_stats *stats)
210 {
211         size_t ni;
212         struct dm_stat *s;
213         struct dm_stat_shared *shared;
214
215         while (!list_empty(&stats->list)) {
216                 s = container_of(stats->list.next, struct dm_stat, list_entry);
217                 list_del(&s->list_entry);
218                 for (ni = 0; ni < s->n_entries; ni++) {
219                         shared = &s->stat_shared[ni];
220                         if (WARN_ON(dm_stat_in_flight(shared))) {
221                                 DMCRIT("leaked in-flight counter at index %lu "
222                                        "(start %llu, end %llu, step %llu): reads %d, writes %d",
223                                        (unsigned long)ni,
224                                        (unsigned long long)s->start,
225                                        (unsigned long long)s->end,
226                                        (unsigned long long)s->step,
227                                        atomic_read(&shared->in_flight[READ]),
228                                        atomic_read(&shared->in_flight[WRITE]));
229                         }
230                 }
231                 dm_stat_free(&s->rcu_head);
232         }
233         free_percpu(stats->last);
234 }
235
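/*
 * Create a new statistics region covering [start, end), split into areas of
 * "step" sectors.  All counters (shared, per-cpu and optional histograms) are
 * allocated up front; the device is then suspended and resumed so the new
 * region starts with no I/O in flight, and the lowest free id is assigned
 * under stats->mutex.
 */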
236 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
237                            sector_t step, unsigned stat_flags,
238                            unsigned n_histogram_entries,
239                            unsigned long long *histogram_boundaries,
240                            const char *program_id, const char *aux_data,
241                            void (*suspend_callback)(struct mapped_device *),
242                            void (*resume_callback)(struct mapped_device *),
243                            struct mapped_device *md)
244 {
245         struct list_head *l;
246         struct dm_stat *s, *tmp_s;
247         sector_t n_entries;
248         size_t ni;
249         size_t shared_alloc_size;
250         size_t percpu_alloc_size;
251         size_t histogram_alloc_size;
252         struct dm_stat_percpu *p;
253         int cpu;
254         int ret_id;
255         int r;
256
257         if (end < start || !step)
258                 return -EINVAL;
259
260         n_entries = end - start;
261         if (dm_sector_div64(n_entries, step))
262                 n_entries++;
263
264         if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
265                 return -EOVERFLOW;
266
267         shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
268         if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
269                 return -EOVERFLOW;
270
271         percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
272         if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
273                 return -EOVERFLOW;
274
275         histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
276         if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
277                 return -EOVERFLOW;
278
279         if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
280                                  num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
281                 return -ENOMEM;
282
283         s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
284         if (!s)
285                 return -ENOMEM;
286
287         s->stat_flags = stat_flags;
288         s->n_entries = n_entries;
289         s->start = start;
290         s->end = end;
291         s->step = step;
292         s->shared_alloc_size = shared_alloc_size;
293         s->percpu_alloc_size = percpu_alloc_size;
294         s->histogram_alloc_size = histogram_alloc_size;
295
296         s->n_histogram_entries = n_histogram_entries;
297         s->histogram_boundaries = kmemdup(histogram_boundaries,
298                                           s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
299         if (!s->histogram_boundaries) {
300                 r = -ENOMEM;
301                 goto out;
302         }
303
304         s->program_id = kstrdup(program_id, GFP_KERNEL);
305         if (!s->program_id) {
306                 r = -ENOMEM;
307                 goto out;
308         }
309         s->aux_data = kstrdup(aux_data, GFP_KERNEL);
310         if (!s->aux_data) {
311                 r = -ENOMEM;
312                 goto out;
313         }
314
315         for (ni = 0; ni < n_entries; ni++) {
316                 atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
317                 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
318         }
319
320         if (s->n_histogram_entries) {
321                 unsigned long long *hi;
322                 hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
323                 if (!hi) {
324                         r = -ENOMEM;
325                         goto out;
326                 }
327                 for (ni = 0; ni < n_entries; ni++) {
328                         s->stat_shared[ni].tmp.histogram = hi;
329                         hi += s->n_histogram_entries + 1;
330                 }
331         }
332
333         for_each_possible_cpu(cpu) {
334                 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
335                 if (!p) {
336                         r = -ENOMEM;
337                         goto out;
338                 }
339                 s->stat_percpu[cpu] = p;
340                 if (s->n_histogram_entries) {
341                         unsigned long long *hi;
342                         hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
343                         if (!hi) {
344                                 r = -ENOMEM;
345                                 goto out;
346                         }
347                         for (ni = 0; ni < n_entries; ni++) {
348                                 p[ni].histogram = hi;
349                                 hi += s->n_histogram_entries + 1;
350                         }
351                 }
352         }
353
354         /*
355          * Suspend/resume to make sure there is no i/o in flight,
356          * so that newly created statistics will be exact.
357          *
358          * (note: we couldn't suspend earlier because we must not
359          * allocate memory while suspended)
360          */
361         suspend_callback(md);
362
363         mutex_lock(&stats->mutex);
364         s->id = 0;
365         list_for_each(l, &stats->list) {
366                 tmp_s = container_of(l, struct dm_stat, list_entry);
367                 if (WARN_ON(tmp_s->id < s->id)) {
368                         r = -EINVAL;
369                         goto out_unlock_resume;
370                 }
371                 if (tmp_s->id > s->id)
372                         break;
373                 if (unlikely(s->id == INT_MAX)) {
374                         r = -ENFILE;
375                         goto out_unlock_resume;
376                 }
377                 s->id++;
378         }
379         ret_id = s->id;
380         list_add_tail_rcu(&s->list_entry, l);
381         mutex_unlock(&stats->mutex);
382
383         resume_callback(md);
384
385         return ret_id;
386
387 out_unlock_resume:
388         mutex_unlock(&stats->mutex);
389         resume_callback(md);
390 out:
391         dm_stat_free(&s->rcu_head);
392         return r;
393 }
394
395 static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
396 {
397         struct dm_stat *s;
398
399         list_for_each_entry(s, &stats->list, list_entry) {
400                 if (s->id > id)
401                         break;
402                 if (s->id == id)
403                         return s;
404         }
405
406         return NULL;
407 }
408
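/*
 * Remove a region from the list and free it.  If any of its memory came from
 * vmalloc, the free must be synchronous (vfree may not run from an RCU
 * callback); otherwise it is deferred with call_rcu().
 */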
409 static int dm_stats_delete(struct dm_stats *stats, int id)
410 {
411         struct dm_stat *s;
412         int cpu;
413
414         mutex_lock(&stats->mutex);
415
416         s = __dm_stats_find(stats, id);
417         if (!s) {
418                 mutex_unlock(&stats->mutex);
419                 return -ENOENT;
420         }
421
422         list_del_rcu(&s->list_entry);
423         mutex_unlock(&stats->mutex);
424
425         /* vfree can't be called from an RCU callback, so we must free
426          * synchronously if any of the allocations came from vmalloc.
427          */
428         for_each_possible_cpu(cpu)
429                 if (is_vmalloc_addr(s->stat_percpu) ||
430                     is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
431                         goto do_sync_free;
432         if (is_vmalloc_addr(s) ||
433             is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
434 do_sync_free:
435                 synchronize_rcu_expedited();
436                 dm_stat_free(&s->rcu_head);
437         } else {
438                 ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
439                 call_rcu(&s->rcu_head, dm_stat_free);
440         }
441         return 0;
442 }
443
444 static int dm_stats_list(struct dm_stats *stats, const char *program,
445                          char *result, unsigned maxlen)
446 {
447         struct dm_stat *s;
448         sector_t len;
449         unsigned sz = 0;
450
451         /*
452          * Output format:
453          *   <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
454          */
455
456         mutex_lock(&stats->mutex);
457         list_for_each_entry(s, &stats->list, list_entry) {
458                 if (!program || !strcmp(program, s->program_id)) {
459                         len = s->end - s->start;
460                         DMEMIT("%d: %llu+%llu %llu %s %s", s->id,
461                                 (unsigned long long)s->start,
462                                 (unsigned long long)len,
463                                 (unsigned long long)s->step,
464                                 s->program_id,
465                                 s->aux_data);
466                         if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
467                                 DMEMIT(" precise_timestamps");
468                         if (s->n_histogram_entries) {
469                                 unsigned i;
470                                 DMEMIT(" histogram:");
471                                 for (i = 0; i < s->n_histogram_entries; i++) {
472                                         if (i)
473                                                 DMEMIT(",");
474                                         DMEMIT("%llu", s->histogram_boundaries[i]);
475                                 }
476                         }
477                         DMEMIT("\n");
478                 }
479         }
480         mutex_unlock(&stats->mutex);
481
482         return 1;
483 }
484
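/*
 * Charge the time elapsed since the last stamp to the io_ticks,
 * io_ticks_total and time_in_queue counters, weighted by how many reads and
 * writes are currently in flight.  This is the dm-stats counterpart of
 * part_round_stats_single().
 */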
485 static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
486                           struct dm_stat_percpu *p)
487 {
488         /*
489          * This is racy, but so is part_round_stats_single.
490          */
491         unsigned long long now, difference;
492         unsigned in_flight_read, in_flight_write;
493
494         if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
495                 now = jiffies;
496         else
497                 now = ktime_to_ns(ktime_get());
498
499         difference = now - shared->stamp;
500         if (!difference)
501                 return;
502
503         in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
504         in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
505         if (in_flight_read)
506                 p->io_ticks[READ] += difference;
507         if (in_flight_write)
508                 p->io_ticks[WRITE] += difference;
509         if (in_flight_read + in_flight_write) {
510                 p->io_ticks_total += difference;
511                 p->time_in_queue += (in_flight_read + in_flight_write) * difference;
512         }
513         shared->stamp = now;
514 }
515
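/*
 * Account one bio fragment against a single area.  On submission only the
 * in-flight counter is raised; on completion the sector, io, merge and tick
 * counters are updated and, if a histogram is configured, the duration is
 * binned by binary search over the boundaries.
 */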
516 static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
517                               unsigned long bi_rw, sector_t len,
518                               struct dm_stats_aux *stats_aux, bool end,
519                               unsigned long duration_jiffies)
520 {
521         unsigned long idx = bi_rw & REQ_WRITE;
522         struct dm_stat_shared *shared = &s->stat_shared[entry];
523         struct dm_stat_percpu *p;
524
525         /*
526          * For strict correctness we should use local_irq_save/restore
527          * instead of preempt_disable/enable.
528          *
529          * preempt_disable/enable is racy if the driver finishes bios
530          * from non-interrupt context as well as from interrupt context,
531          * or from several different interrupts.
532          *
533          * On 64-bit architectures the race only results in some events
534          * not being counted, so it is acceptable.  On 32-bit architectures
535          * the race could cause a counter to be off by 2^32, so we need
536          * proper locking there.
537          *
538          * part_stat_lock()/part_stat_unlock() have this race too.
539          */
540 #if BITS_PER_LONG == 32
541         unsigned long flags;
542         local_irq_save(flags);
543 #else
544         preempt_disable();
545 #endif
546         p = &s->stat_percpu[smp_processor_id()][entry];
547
548         if (!end) {
549                 dm_stat_round(s, shared, p);
550                 atomic_inc(&shared->in_flight[idx]);
551         } else {
552                 unsigned long long duration;
553                 dm_stat_round(s, shared, p);
554                 atomic_dec(&shared->in_flight[idx]);
555                 p->sectors[idx] += len;
556                 p->ios[idx] += 1;
557                 p->merges[idx] += stats_aux->merged;
558                 if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
559                         p->ticks[idx] += duration_jiffies;
560                         duration = jiffies_to_msecs(duration_jiffies);
561                 } else {
562                         p->ticks[idx] += stats_aux->duration_ns;
563                         duration = stats_aux->duration_ns;
564                 }
565                 if (s->n_histogram_entries) {
566                         unsigned lo = 0, hi = s->n_histogram_entries + 1;
567                         while (lo + 1 < hi) {
568                                 unsigned mid = (lo + hi) / 2;
569                                 if (s->histogram_boundaries[mid - 1] > duration) {
570                                         hi = mid;
571                                 } else {
572                                         lo = mid;
573                                 }
575                         }
576                         p->histogram[lo]++;
577                 }
578         }
579
580 #if BITS_PER_LONG == 32
581         local_irq_restore(flags);
582 #else
583         preempt_enable();
584 #endif
585 }
586
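/*
 * Clip the bio to the region and walk it area by area, accounting each
 * fragment separately so a bio spanning several areas updates all of them.
 */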
587 static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
588                           sector_t bi_sector, sector_t end_sector,
589                           bool end, unsigned long duration_jiffies,
590                           struct dm_stats_aux *stats_aux)
591 {
592         sector_t rel_sector, offset, todo, fragment_len;
593         size_t entry;
594
595         if (end_sector <= s->start || bi_sector >= s->end)
596                 return;
597         if (unlikely(bi_sector < s->start)) {
598                 rel_sector = 0;
599                 todo = end_sector - s->start;
600         } else {
601                 rel_sector = bi_sector - s->start;
602                 todo = end_sector - bi_sector;
603         }
604         if (unlikely(end_sector > s->end))
605                 todo -= (end_sector - s->end);
606
607         offset = dm_sector_div64(rel_sector, s->step);
608         entry = rel_sector;
609         do {
610                 if (WARN_ON_ONCE(entry >= s->n_entries)) {
611                         DMCRIT("Invalid area access in region id %d", s->id);
612                         return;
613                 }
614                 fragment_len = todo;
615                 if (fragment_len > s->step - offset)
616                         fragment_len = s->step - offset;
617                 dm_stat_for_entry(s, entry, bi_rw, fragment_len,
618                                   stats_aux, end, duration_jiffies);
619                 todo -= fragment_len;
620                 entry++;
621                 offset = 0;
622         } while (unlikely(todo != 0));
623 }
624
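/*
 * Main accounting hook, called for each bio when it starts (end == false)
 * and when it completes (end == true).  The per-cpu last_sector/last_rw
 * values implement a cheap heuristic for the "merged" counter, and ktime is
 * read at most once per call even when several regions want precise
 * timestamps.
 */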
625 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
626                          sector_t bi_sector, unsigned bi_sectors, bool end,
627                          unsigned long duration_jiffies,
628                          struct dm_stats_aux *stats_aux)
629 {
630         struct dm_stat *s;
631         sector_t end_sector;
632         struct dm_stats_last_position *last;
633         bool got_precise_time;
634
635         if (unlikely(!bi_sectors))
636                 return;
637
638         end_sector = bi_sector + bi_sectors;
639
640         if (!end) {
641                 /*
642                  * A race condition can at worst result in the merged flag being
643                  * misrepresented, so we don't have to disable preemption here.
644                  */
645                 last = raw_cpu_ptr(stats->last);
646                 stats_aux->merged =
647                         (bi_sector == ACCESS_ONCE(last->last_sector) &&
648                          ((bi_rw & (REQ_WRITE | REQ_DISCARD)) ==
649                           (ACCESS_ONCE(last->last_rw) &
650                            (REQ_WRITE | REQ_DISCARD))));
651                 ACCESS_ONCE(last->last_sector) = end_sector;
652                 ACCESS_ONCE(last->last_rw) = bi_rw;
653         }
654
655         rcu_read_lock();
656
657         got_precise_time = false;
658         list_for_each_entry_rcu(s, &stats->list, list_entry) {
659                 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
660                         if (!end)
661                                 stats_aux->duration_ns = ktime_to_ns(ktime_get());
662                         else
663                                 stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
664                         got_precise_time = true;
665                 }
666                 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
667         }
668
669         rcu_read_unlock();
670 }
671
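/*
 * Round the in-flight time and sum the per-cpu counters of one area into
 * shared->tmp so that a consistent snapshot can be printed or cleared.
 */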
672 static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
673                                                    struct dm_stat *s, size_t x)
674 {
675         int cpu;
676         struct dm_stat_percpu *p;
677
678         local_irq_disable();
679         p = &s->stat_percpu[smp_processor_id()][x];
680         dm_stat_round(s, shared, p);
681         local_irq_enable();
682
683         shared->tmp.sectors[READ] = 0;
684         shared->tmp.sectors[WRITE] = 0;
685         shared->tmp.ios[READ] = 0;
686         shared->tmp.ios[WRITE] = 0;
687         shared->tmp.merges[READ] = 0;
688         shared->tmp.merges[WRITE] = 0;
689         shared->tmp.ticks[READ] = 0;
690         shared->tmp.ticks[WRITE] = 0;
691         shared->tmp.io_ticks[READ] = 0;
692         shared->tmp.io_ticks[WRITE] = 0;
693         shared->tmp.io_ticks_total = 0;
694         shared->tmp.time_in_queue = 0;
695
696         if (s->n_histogram_entries)
697                 memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
698
699         for_each_possible_cpu(cpu) {
700                 p = &s->stat_percpu[cpu][x];
701                 shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
702                 shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]);
703                 shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]);
704                 shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]);
705                 shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]);
706                 shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]);
707                 shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]);
708                 shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]);
709                 shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]);
710                 shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
711                 shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
712                 shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
713                 if (s->n_histogram_entries) {
714                         unsigned i;
715                         for (i = 0; i < s->n_histogram_entries + 1; i++)
716                                 shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
717                 }
718         }
719 }
720
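/*
 * Clear a range of areas by subtracting the snapshot in shared->tmp from the
 * local CPU's counters, so that totals summed across all CPUs read back as
 * zero.
 */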
721 static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
722                             bool init_tmp_percpu_totals)
723 {
724         size_t x;
725         struct dm_stat_shared *shared;
726         struct dm_stat_percpu *p;
727
728         for (x = idx_start; x < idx_end; x++) {
729                 shared = &s->stat_shared[x];
730                 if (init_tmp_percpu_totals)
731                         __dm_stat_init_temporary_percpu_totals(shared, s, x);
732                 local_irq_disable();
733                 p = &s->stat_percpu[smp_processor_id()][x];
734                 p->sectors[READ] -= shared->tmp.sectors[READ];
735                 p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
736                 p->ios[READ] -= shared->tmp.ios[READ];
737                 p->ios[WRITE] -= shared->tmp.ios[WRITE];
738                 p->merges[READ] -= shared->tmp.merges[READ];
739                 p->merges[WRITE] -= shared->tmp.merges[WRITE];
740                 p->ticks[READ] -= shared->tmp.ticks[READ];
741                 p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
742                 p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
743                 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
744                 p->io_ticks_total -= shared->tmp.io_ticks_total;
745                 p->time_in_queue -= shared->tmp.time_in_queue;
746                 local_irq_enable();
747                 if (s->n_histogram_entries) {
748                         unsigned i;
749                         for (i = 0; i < s->n_histogram_entries + 1; i++) {
750                                 local_irq_disable();
751                                 p = &s->stat_percpu[smp_processor_id()][x];
752                                 p->histogram[i] -= shared->tmp.histogram[i];
753                                 local_irq_enable();
754                         }
755                 }
756         }
757 }
758
759 static int dm_stats_clear(struct dm_stats *stats, int id)
760 {
761         struct dm_stat *s;
762
763         mutex_lock(&stats->mutex);
764
765         s = __dm_stats_find(stats, id);
766         if (!s) {
767                 mutex_unlock(&stats->mutex);
768                 return -ENOENT;
769         }
770
771         __dm_stat_clear(s, 0, s->n_entries, true);
772
773         mutex_unlock(&stats->mutex);
774
775         return 1;
776 }
777
778 /*
779  * This is like jiffies_to_msecs, but works for 64-bit values.
780  */
781 static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
782 {
783         unsigned long long result;
784         unsigned mult;
785
786         if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
787                 return j;
788
789         result = 0;
790         if (j)
791                 result = jiffies_to_msecs(j & 0x3fffff);
792         if (j >= 1 << 22) {
793                 mult = jiffies_to_msecs(1 << 22);
794                 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
795         }
796         if (j >= 1ULL << 44)
797                 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);
798
799         return result;
800 }
801
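/*
 * Print (and optionally clear) the counters for areas [idx_start, idx_end)
 * of a region, one line per area, stopping early if the result buffer
 * fills up.
 */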
802 static int dm_stats_print(struct dm_stats *stats, int id,
803                           size_t idx_start, size_t idx_len,
804                           bool clear, char *result, unsigned maxlen)
805 {
806         unsigned sz = 0;
807         struct dm_stat *s;
808         size_t x;
809         sector_t start, end, step;
810         size_t idx_end;
811         struct dm_stat_shared *shared;
812
813         /*
814          * Output format:
815          *   <start_sector>+<length> counters
816          */
817
818         mutex_lock(&stats->mutex);
819
820         s = __dm_stats_find(stats, id);
821         if (!s) {
822                 mutex_unlock(&stats->mutex);
823                 return -ENOENT;
824         }
825
826         idx_end = idx_start + idx_len;
827         if (idx_end < idx_start ||
828             idx_end > s->n_entries)
829                 idx_end = s->n_entries;
830
831         if (idx_start > idx_end)
832                 idx_start = idx_end;
833
834         step = s->step;
835         start = s->start + (step * idx_start);
836
837         for (x = idx_start; x < idx_end; x++, start = end) {
838                 shared = &s->stat_shared[x];
839                 end = start + step;
840                 if (unlikely(end > s->end))
841                         end = s->end;
842
843                 __dm_stat_init_temporary_percpu_totals(shared, s, x);
844
845                 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
846                        (unsigned long long)start,
847                        (unsigned long long)step,
848                        shared->tmp.ios[READ],
849                        shared->tmp.merges[READ],
850                        shared->tmp.sectors[READ],
851                        dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
852                        shared->tmp.ios[WRITE],
853                        shared->tmp.merges[WRITE],
854                        shared->tmp.sectors[WRITE],
855                        dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
856                        dm_stat_in_flight(shared),
857                        dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
858                        dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
859                        dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
860                        dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
861                 if (s->n_histogram_entries) {
862                         unsigned i;
863                         for (i = 0; i < s->n_histogram_entries + 1; i++) {
864                                 DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
865                         }
866                 }
867                 DMEMIT("\n");
868
869                 if (unlikely(sz + 1 >= maxlen))
870                         goto buffer_overflow;
871         }
872
873         if (clear)
874                 __dm_stat_clear(s, idx_start, idx_end, false);
875
876 buffer_overflow:
877         mutex_unlock(&stats->mutex);
878
879         return 1;
880 }
881
882 static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
883 {
884         struct dm_stat *s;
885         const char *new_aux_data;
886
887         mutex_lock(&stats->mutex);
888
889         s = __dm_stats_find(stats, id);
890         if (!s) {
891                 mutex_unlock(&stats->mutex);
892                 return -ENOENT;
893         }
894
895         new_aux_data = kstrdup(aux_data, GFP_KERNEL);
896         if (!new_aux_data) {
897                 mutex_unlock(&stats->mutex);
898                 return -ENOMEM;
899         }
900
901         kfree(s->aux_data);
902         s->aux_data = new_aux_data;
903
904         mutex_unlock(&stats->mutex);
905
906         return 0;
907 }
908
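/*
 * Parse the "histogram:n1,n2,..." argument: the boundaries must be strictly
 * increasing and are returned in a kmalloc'ed array that the caller frees.
 */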
909 static int parse_histogram(const char *h, unsigned *n_histogram_entries,
910                            unsigned long long **histogram_boundaries)
911 {
912         const char *q;
913         unsigned n;
914         unsigned long long last;
915
916         *n_histogram_entries = 1;
917         for (q = h; *q; q++)
918                 if (*q == ',')
919                         (*n_histogram_entries)++;
920
921         *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
922         if (!*histogram_boundaries)
923                 return -ENOMEM;
924
925         n = 0;
926         last = 0;
927         while (1) {
928                 unsigned long long hi;
929                 int s;
930                 char ch;
931                 s = sscanf(h, "%llu%c", &hi, &ch);
932                 if (!s || (s == 2 && ch != ','))
933                         return -EINVAL;
934                 if (hi <= last)
935                         return -EINVAL;
936                 last = hi;
937                 (*histogram_boundaries)[n] = hi;
938                 if (s == 1)
939                         return 0;
940                 h = strchr(h, ',') + 1;
941                 n++;
942         }
943 }
944
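/*
 * Handle the "@stats_create" message: parse the range, step, optional
 * feature arguments (precise_timestamps, histogram:...), program_id and
 * aux_data, then create the region and return its id in the result buffer.
 */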
945 static int message_stats_create(struct mapped_device *md,
946                                 unsigned argc, char **argv,
947                                 char *result, unsigned maxlen)
948 {
949         int r;
950         int id;
951         char dummy;
952         unsigned long long start, end, len, step;
953         unsigned divisor;
954         const char *program_id, *aux_data;
955         unsigned stat_flags = 0;
956
957         unsigned n_histogram_entries = 0;
958         unsigned long long *histogram_boundaries = NULL;
959
960         struct dm_arg_set as, as_backup;
961         const char *a;
962         unsigned feature_args;
963
964         /*
965          * Input format:
966          *   <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
967          */
968
969         if (argc < 3)
970                 goto ret_einval;
971
972         as.argc = argc;
973         as.argv = argv;
974         dm_consume_args(&as, 1);
975
976         a = dm_shift_arg(&as);
977         if (!strcmp(a, "-")) {
978                 start = 0;
979                 len = dm_get_size(md);
980                 if (!len)
981                         len = 1;
982         } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
983                    start != (sector_t)start || len != (sector_t)len)
984                 goto ret_einval;
985
986         end = start + len;
987         if (start >= end)
988                 goto ret_einval;
989
990         a = dm_shift_arg(&as);
991         if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
992                 if (!divisor)
993                         return -EINVAL;
994                 step = end - start;
995                 if (do_div(step, divisor))
996                         step++;
997                 if (!step)
998                         step = 1;
999         } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
1000                    step != (sector_t)step || !step)
1001                 goto ret_einval;
1002
1003         as_backup = as;
1004         a = dm_shift_arg(&as);
1005         if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
1006                 while (feature_args--) {
1007                         a = dm_shift_arg(&as);
1008                         if (!a)
1009                                 goto ret_einval;
1010                         if (!strcasecmp(a, "precise_timestamps"))
1011                                 stat_flags |= STAT_PRECISE_TIMESTAMPS;
1012                         else if (!strncasecmp(a, "histogram:", 10)) {
1013                                 if (n_histogram_entries)
1014                                         goto ret_einval;
1015                                 if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
1016                                         goto ret;
1017                         } else
1018                                 goto ret_einval;
1019                 }
1020         } else {
1021                 as = as_backup;
1022         }
1023
1024         program_id = "-";
1025         aux_data = "-";
1026
1027         a = dm_shift_arg(&as);
1028         if (a)
1029                 program_id = a;
1030
1031         a = dm_shift_arg(&as);
1032         if (a)
1033                 aux_data = a;
1034
1035         if (as.argc)
1036                 goto ret_einval;
1037
1038         /*
1039          * If a buffer overflow happens after we created the region,
1040          * it's too late (userspace would retry with a larger buffer, but
1041          * the region id that caused the overflow would already have been
1042          * leaked).  So we must detect buffer overflow in advance.
1043          */
1044         snprintf(result, maxlen, "%d", INT_MAX);
1045         if (dm_message_test_buffer_overflow(result, maxlen)) {
1046                 r = 1;
1047                 goto ret;
1048         }
1049
1050         id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
1051                              n_histogram_entries, histogram_boundaries, program_id, aux_data,
1052                              dm_internal_suspend_fast, dm_internal_resume_fast, md);
1053         if (id < 0) {
1054                 r = id;
1055                 goto ret;
1056         }
1057
1058         snprintf(result, maxlen, "%d", id);
1059
1060         r = 1;
1061         goto ret;
1062
1063 ret_einval:
1064         r = -EINVAL;
1065 ret:
1066         kfree(histogram_boundaries);
1067         return r;
1068 }
1069
1070 static int message_stats_delete(struct mapped_device *md,
1071                                 unsigned argc, char **argv)
1072 {
1073         int id;
1074         char dummy;
1075
1076         if (argc != 2)
1077                 return -EINVAL;
1078
1079         if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1080                 return -EINVAL;
1081
1082         return dm_stats_delete(dm_get_stats(md), id);
1083 }
1084
1085 static int message_stats_clear(struct mapped_device *md,
1086                                unsigned argc, char **argv)
1087 {
1088         int id;
1089         char dummy;
1090
1091         if (argc != 2)
1092                 return -EINVAL;
1093
1094         if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1095                 return -EINVAL;
1096
1097         return dm_stats_clear(dm_get_stats(md), id);
1098 }
1099
1100 static int message_stats_list(struct mapped_device *md,
1101                               unsigned argc, char **argv,
1102                               char *result, unsigned maxlen)
1103 {
1104         int r;
1105         const char *program = NULL;
1106
1107         if (argc < 1 || argc > 2)
1108                 return -EINVAL;
1109
1110         if (argc > 1) {
1111                 program = kstrdup(argv[1], GFP_KERNEL);
1112                 if (!program)
1113                         return -ENOMEM;
1114         }
1115
1116         r = dm_stats_list(dm_get_stats(md), program, result, maxlen);
1117
1118         kfree(program);
1119
1120         return r;
1121 }
1122
1123 static int message_stats_print(struct mapped_device *md,
1124                                unsigned argc, char **argv, bool clear,
1125                                char *result, unsigned maxlen)
1126 {
1127         int id;
1128         char dummy;
1129         unsigned long idx_start = 0, idx_len = ULONG_MAX;
1130
1131         if (argc != 2 && argc != 4)
1132                 return -EINVAL;
1133
1134         if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1135                 return -EINVAL;
1136
1137         if (argc > 3) {
1138                 if (strcmp(argv[2], "-") &&
1139                     sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
1140                         return -EINVAL;
1141                 if (strcmp(argv[3], "-") &&
1142                     sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
1143                         return -EINVAL;
1144         }
1145
1146         return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
1147                               result, maxlen);
1148 }
1149
1150 static int message_stats_set_aux(struct mapped_device *md,
1151                                  unsigned argc, char **argv)
1152 {
1153         int id;
1154         char dummy;
1155
1156         if (argc != 3)
1157                 return -EINVAL;
1158
1159         if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1160                 return -EINVAL;
1161
1162         return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
1163 }
1164
1165 int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
1166                      char *result, unsigned maxlen)
1167 {
1168         int r;
1169
1170         /* All messages here must start with '@' */
1171         if (!strcasecmp(argv[0], "@stats_create"))
1172                 r = message_stats_create(md, argc, argv, result, maxlen);
1173         else if (!strcasecmp(argv[0], "@stats_delete"))
1174                 r = message_stats_delete(md, argc, argv);
1175         else if (!strcasecmp(argv[0], "@stats_clear"))
1176                 r = message_stats_clear(md, argc, argv);
1177         else if (!strcasecmp(argv[0], "@stats_list"))
1178                 r = message_stats_list(md, argc, argv, result, maxlen);
1179         else if (!strcasecmp(argv[0], "@stats_print"))
1180                 r = message_stats_print(md, argc, argv, false, result, maxlen);
1181         else if (!strcasecmp(argv[0], "@stats_print_clear"))
1182                 r = message_stats_print(md, argc, argv, true, result, maxlen);
1183         else if (!strcasecmp(argv[0], "@stats_set_aux"))
1184                 r = message_stats_set_aux(md, argc, argv);
1185         else
1186                 return 2; /* this wasn't a stats message */
1187
1188         if (r == -EINVAL)
1189                 DMWARN("Invalid parameters for message %s", argv[0]);
1190
1191         return r;
1192 }
1193
1194 int __init dm_statistics_init(void)
1195 {
1196         shared_memory_amount = 0;
1197         dm_stat_need_rcu_barrier = 0;
1198         return 0;
1199 }
1200
1201 void dm_statistics_exit(void)
1202 {
1203         if (dm_stat_need_rcu_barrier)
1204                 rcu_barrier();
1205         if (WARN_ON(shared_memory_amount))
1206                 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
1207 }
1208
1209 module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
1210 MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");