]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - fs/orangefs/file.c
orangefs: make precopy_buffers() take iov_iter
[karo-tx-linux.git] / fs / orangefs / file.c
1 /*
2  * (C) 2001 Clemson University and The University of Chicago
3  *
4  * See COPYING in top-level directory.
5  */
6
7 /*
8  *  Linux VFS file operations.
9  */
10
11 #include "protocol.h"
12 #include "pvfs2-kernel.h"
13 #include "pvfs2-bufmap.h"
14 #include <linux/fs.h>
15 #include <linux/pagemap.h>
16
/*
 * Flag an op's I/O as completed (the flag is written while holding
 * op->lock) and then wake sleepers on op->io_completion_waitq.
 *
 * The argument is parenthesized at every use so that any pointer
 * expression can be passed.  It is still evaluated more than once, so
 * it must not have side effects.
 */
#define wake_up_daemon_for_return(op)                           \
do {                                                            \
        spin_lock(&(op)->lock);                                 \
        (op)->io_completed = 1;                                 \
        spin_unlock(&(op)->lock);                               \
        wake_up_interruptible(&(op)->io_completion_waitq);      \
} while (0)
24
25 /*
26  * Copy to client-core's address space from the buffers specified
27  * by the iovec upto total_size bytes.
28  * NOTE: the iovector can either contain addresses which
29  *       can futher be kernel-space or user-space addresses.
30  *       or it can pointers to struct page's
31  */
32 static int precopy_buffers(struct pvfs2_bufmap *bufmap,
33                            int buffer_index,
34                            struct iov_iter *iter,
35                            size_t total_size)
36 {
37         int ret = 0;
38         /*
39          * copy data from application/kernel by pulling it out
40          * of the iovec.
41          */
42
43
44         if (total_size) {
45                 ret = pvfs_bufmap_copy_from_iovec(bufmap,
46                                                 iter,
47                                                 buffer_index,
48                                                 total_size);
49                 if (ret < 0)
50                 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
51                            __func__,
52                            (long)ret);
53         }
54
55         if (ret < 0)
56                 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
57                         __func__,
58                         (long)ret);
59         return ret;
60 }
61
62 /*
63  * Copy from client-core's address space to the buffers specified
64  * by the iovec upto total_size bytes.
65  * NOTE: the iovector can either contain addresses which
66  *       can futher be kernel-space or user-space addresses.
67  *       or it can pointers to struct page's
68  */
69 static int postcopy_buffers(struct pvfs2_bufmap *bufmap,
70                             int buffer_index,
71                             struct iov_iter *iter,
72                             size_t total_size)
73 {
74         int ret = 0;
75         /*
76          * copy data to application/kernel by pushing it out to
77          * the iovec. NOTE; target buffers can be addresses or
78          * struct page pointers.
79          */
80         if (total_size) {
81                 ret = pvfs_bufmap_copy_to_iovec(bufmap,
82                                                 iter,
83                                                 buffer_index,
84                                                 total_size);
85                 if (ret < 0)
86                         gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
87                                 __func__,
88                                 (long)ret);
89         }
90         return ret;
91 }
92
93 /*
94  * Post and wait for the I/O upcall to finish
95  */
96 static ssize_t wait_for_direct_io(enum PVFS_io_type type, struct inode *inode,
97                 loff_t *offset, struct iovec *vec, unsigned long nr_segs,
98                 size_t total_size, loff_t readahead_size)
99 {
100         struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
101         struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
102         struct pvfs2_bufmap *bufmap = NULL;
103         struct pvfs2_kernel_op_s *new_op = NULL;
104         int buffer_index = -1;
105         ssize_t ret;
106
107         new_op = op_alloc(PVFS2_VFS_OP_FILE_IO);
108         if (!new_op) {
109                 ret = -ENOMEM;
110                 goto out;
111         }
112         /* synchronous I/O */
113         new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO;
114         new_op->upcall.req.io.readahead_size = readahead_size;
115         new_op->upcall.req.io.io_type = type;
116         new_op->upcall.req.io.refn = pvfs2_inode->refn;
117
118 populate_shared_memory:
119         /* get a shared buffer index */
120         ret = pvfs_bufmap_get(&bufmap, &buffer_index);
121         if (ret < 0) {
122                 gossip_debug(GOSSIP_FILE_DEBUG,
123                              "%s: pvfs_bufmap_get failure (%ld)\n",
124                              __func__, (long)ret);
125                 goto out;
126         }
127         gossip_debug(GOSSIP_FILE_DEBUG,
128                      "%s(%pU): GET op %p -> buffer_index %d\n",
129                      __func__,
130                      handle,
131                      new_op,
132                      buffer_index);
133
134         new_op->uses_shared_memory = 1;
135         new_op->upcall.req.io.buf_index = buffer_index;
136         new_op->upcall.req.io.count = total_size;
137         new_op->upcall.req.io.offset = *offset;
138
139         gossip_debug(GOSSIP_FILE_DEBUG,
140                      "%s(%pU): nr_segs %lu, offset: %llu total_size: %zd\n",
141                      __func__,
142                      handle,
143                      nr_segs,
144                      llu(*offset),
145                      total_size);
146         /*
147          * Stage 1: copy the buffers into client-core's address space
148          * precopy_buffers only pertains to writes.
149          */
150         if (type == PVFS_IO_WRITE) {
151                 struct iov_iter iter;
152                 iov_iter_init(&iter, WRITE, vec, nr_segs, total_size);
153                 ret = precopy_buffers(bufmap,
154                                       buffer_index,
155                                       &iter,
156                                       total_size);
157                 if (ret < 0)
158                         goto out;
159         }
160
161         gossip_debug(GOSSIP_FILE_DEBUG,
162                      "%s(%pU): Calling post_io_request with tag (%llu)\n",
163                      __func__,
164                      handle,
165                      llu(new_op->tag));
166
167         /* Stage 2: Service the I/O operation */
168         ret = service_operation(new_op,
169                                 type == PVFS_IO_WRITE ?
170                                         "file_write" :
171                                         "file_read",
172                                 get_interruptible_flag(inode));
173
174         /*
175          * If service_operation() returns -EAGAIN #and# the operation was
176          * purged from pvfs2_request_list or htable_ops_in_progress, then
177          * we know that the client was restarted, causing the shared memory
178          * area to be wiped clean.  To restart a  write operation in this
179          * case, we must re-copy the data from the user's iovec to a NEW
180          * shared memory location. To restart a read operation, we must get
181          * a new shared memory location.
182          */
183         if (ret == -EAGAIN && op_state_purged(new_op)) {
184                 pvfs_bufmap_put(bufmap, buffer_index);
185                 gossip_debug(GOSSIP_FILE_DEBUG,
186                              "%s:going to repopulate_shared_memory.\n",
187                              __func__);
188                 goto populate_shared_memory;
189         }
190
191         if (ret < 0) {
192                 handle_io_error(); /* defined in pvfs2-kernel.h */
193                 /*
194                  * don't write an error to syslog on signaled operation
195                  * termination unless we've got debugging turned on, as
196                  * this can happen regularly (i.e. ctrl-c)
197                  */
198                 if (ret == -EINTR)
199                         gossip_debug(GOSSIP_FILE_DEBUG,
200                                      "%s: returning error %ld\n", __func__,
201                                      (long)ret);
202                 else
203                         gossip_err("%s: error in %s handle %pU, returning %zd\n",
204                                 __func__,
205                                 type == PVFS_IO_READ ?
206                                         "read from" : "write to",
207                                 handle, ret);
208                 goto out;
209         }
210
211         /*
212          * Stage 3: Post copy buffers from client-core's address space
213          * postcopy_buffers only pertains to reads.
214          */
215         if (type == PVFS_IO_READ) {
216                 struct iov_iter iter;
217                 iov_iter_init(&iter, READ, vec, nr_segs, new_op->downcall.resp.io.amt_complete);
218                 ret = postcopy_buffers(bufmap,
219                                        buffer_index,
220                                        &iter,
221                                        new_op->downcall.resp.io.amt_complete);
222                 if (ret < 0) {
223                         /*
224                          * put error codes in downcall so that handle_io_error()
225                          * preserves it properly
226                          */
227                         new_op->downcall.status = ret;
228                         handle_io_error();
229                         goto out;
230                 }
231         }
232         gossip_debug(GOSSIP_FILE_DEBUG,
233             "%s(%pU): Amount written as returned by the sys-io call:%d\n",
234             __func__,
235             handle,
236             (int)new_op->downcall.resp.io.amt_complete);
237
238         ret = new_op->downcall.resp.io.amt_complete;
239
240         /*
241          * tell the device file owner waiting on I/O that this read has
242          * completed and it can return now.  in this exact case, on
243          * wakeup the daemon will free the op, so we *cannot* touch it
244          * after this.
245          */
246         wake_up_daemon_for_return(new_op);
247         new_op = NULL;
248
249 out:
250         if (buffer_index >= 0) {
251                 pvfs_bufmap_put(bufmap, buffer_index);
252                 gossip_debug(GOSSIP_FILE_DEBUG,
253                              "%s(%pU): PUT buffer_index %d\n",
254                              __func__, handle, buffer_index);
255                 buffer_index = -1;
256         }
257         if (new_op) {
258                 op_release(new_op);
259                 new_op = NULL;
260         }
261         return ret;
262 }
263
264 /*
265  * The reason we need to do this is to be able to support readv and writev
266  * that are larger than (pvfs_bufmap_size_query()) Default is
267  * PVFS2_BUFMAP_DEFAULT_DESC_SIZE MB. What that means is that we will
268  * create a new io vec descriptor for those memory addresses that
269  * go beyond the limit. Return value for this routine is negative in case
270  * of errors and 0 in case of success.
271  *
272  * Further, the new_nr_segs pointer is updated to hold the new value
273  * of number of iovecs, the new_vec pointer is updated to hold the pointer
274  * to the new split iovec, and the size array is an array of integers holding
275  * the number of iovecs that straddle pvfs_bufmap_size_query().
276  * The max_new_nr_segs value is computed by the caller and returned.
277  * (It will be (count of all iov_len/ block_size) + 1).
278  */
279 static int split_iovecs(unsigned long max_new_nr_segs,          /* IN */
280                         unsigned long nr_segs,                  /* IN */
281                         const struct iovec *original_iovec,     /* IN */
282                         unsigned long *new_nr_segs,             /* OUT */
283                         struct iovec **new_vec,                 /* OUT */
284                         unsigned long *seg_count,               /* OUT */
285                         unsigned long **seg_array)              /* OUT */
286 {
287         unsigned long seg;
288         unsigned long count = 0;
289         unsigned long begin_seg;
290         unsigned long tmpnew_nr_segs = 0;
291         struct iovec *new_iovec = NULL;
292         struct iovec *orig_iovec;
293         unsigned long *sizes = NULL;
294         unsigned long sizes_count = 0;
295
296         if (nr_segs <= 0 ||
297             original_iovec == NULL ||
298             new_nr_segs == NULL ||
299             new_vec == NULL ||
300             seg_count == NULL ||
301             seg_array == NULL ||
302             max_new_nr_segs <= 0) {
303                 gossip_err("Invalid parameters to split_iovecs\n");
304                 return -EINVAL;
305         }
306         *new_nr_segs = 0;
307         *new_vec = NULL;
308         *seg_count = 0;
309         *seg_array = NULL;
310         /* copy the passed in iovec descriptor to a temp structure */
311         orig_iovec = kmalloc_array(nr_segs,
312                                    sizeof(*orig_iovec),
313                                    PVFS2_BUFMAP_GFP_FLAGS);
314         if (orig_iovec == NULL) {
315                 gossip_err(
316                     "split_iovecs: Could not allocate memory for %lu bytes!\n",
317                     (unsigned long)(nr_segs * sizeof(*orig_iovec)));
318                 return -ENOMEM;
319         }
320         new_iovec = kcalloc(max_new_nr_segs,
321                             sizeof(*new_iovec),
322                             PVFS2_BUFMAP_GFP_FLAGS);
323         if (new_iovec == NULL) {
324                 kfree(orig_iovec);
325                 gossip_err(
326                     "split_iovecs: Could not allocate memory for %lu bytes!\n",
327                     (unsigned long)(max_new_nr_segs * sizeof(*new_iovec)));
328                 return -ENOMEM;
329         }
330         sizes = kcalloc(max_new_nr_segs,
331                         sizeof(*sizes),
332                         PVFS2_BUFMAP_GFP_FLAGS);
333         if (sizes == NULL) {
334                 kfree(new_iovec);
335                 kfree(orig_iovec);
336                 gossip_err(
337                     "split_iovecs: Could not allocate memory for %lu bytes!\n",
338                     (unsigned long)(max_new_nr_segs * sizeof(*sizes)));
339                 return -ENOMEM;
340         }
341         /* copy the passed in iovec to a temp structure */
342         memcpy(orig_iovec, original_iovec, nr_segs * sizeof(*orig_iovec));
343         begin_seg = 0;
344 repeat:
345         for (seg = begin_seg; seg < nr_segs; seg++) {
346                 if (tmpnew_nr_segs >= max_new_nr_segs ||
347                     sizes_count >= max_new_nr_segs) {
348                         kfree(sizes);
349                         kfree(orig_iovec);
350                         kfree(new_iovec);
351                         gossip_err
352                             ("split_iovecs: exceeded the index limit (%lu)\n",
353                             tmpnew_nr_segs);
354                         return -EINVAL;
355                 }
356                 if (count + orig_iovec[seg].iov_len <
357                     pvfs_bufmap_size_query()) {
358                         count += orig_iovec[seg].iov_len;
359                         memcpy(&new_iovec[tmpnew_nr_segs],
360                                &orig_iovec[seg],
361                                sizeof(*new_iovec));
362                         tmpnew_nr_segs++;
363                         sizes[sizes_count]++;
364                 } else {
365                         new_iovec[tmpnew_nr_segs].iov_base =
366                             orig_iovec[seg].iov_base;
367                         new_iovec[tmpnew_nr_segs].iov_len =
368                             (pvfs_bufmap_size_query() - count);
369                         tmpnew_nr_segs++;
370                         sizes[sizes_count]++;
371                         sizes_count++;
372                         begin_seg = seg;
373                         orig_iovec[seg].iov_base +=
374                             (pvfs_bufmap_size_query() - count);
375                         orig_iovec[seg].iov_len -=
376                             (pvfs_bufmap_size_query() - count);
377                         count = 0;
378                         break;
379                 }
380         }
381         if (seg != nr_segs)
382                 goto repeat;
383         else
384                 sizes_count++;
385
386         *new_nr_segs = tmpnew_nr_segs;
387         /* new_iovec is freed by the caller */
388         *new_vec = new_iovec;
389         *seg_count = sizes_count;
390         /* seg_array is also freed by the caller */
391         *seg_array = sizes;
392         kfree(orig_iovec);
393         return 0;
394 }
395
396 static long bound_max_iovecs(const struct iovec *curr, unsigned long nr_segs,
397                              ssize_t *total_count)
398 {
399         unsigned long i;
400         long max_nr_iovecs;
401         ssize_t total;
402         ssize_t count;
403
404         total = 0;
405         count = 0;
406         max_nr_iovecs = 0;
407         for (i = 0; i < nr_segs; i++) {
408                 const struct iovec *iv = &curr[i];
409
410                 count += iv->iov_len;
411                 if (unlikely((ssize_t) (count | iv->iov_len) < 0))
412                         return -EINVAL;
413                 if (total + iv->iov_len < pvfs_bufmap_size_query()) {
414                         total += iv->iov_len;
415                         max_nr_iovecs++;
416                 } else {
417                         total =
418                             (total + iv->iov_len - pvfs_bufmap_size_query());
419                         max_nr_iovecs += (total / pvfs_bufmap_size_query() + 2);
420                 }
421         }
422         *total_count = count;
423         return max_nr_iovecs;
424 }
425
426 /*
427  * Common entry point for read/write/readv/writev
428  * This function will dispatch it to either the direct I/O
429  * or buffered I/O path depending on the mount options and/or
430  * augmented/extended metadata attached to the file.
431  * Note: File extended attributes override any mount options.
432  */
433 static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
434                 loff_t *offset, const struct iovec *iov, unsigned long nr_segs)
435 {
436         struct inode *inode = file->f_mapping->host;
437         struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
438         struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
439         ssize_t ret;
440         ssize_t total_count;
441         unsigned int to_free;
442         size_t count;
443         unsigned long seg;
444         unsigned long new_nr_segs;
445         unsigned long max_new_nr_segs;
446         unsigned long seg_count;
447         unsigned long *seg_array;
448         struct iovec *iovecptr;
449         struct iovec *ptr;
450
451         total_count = 0;
452         ret = -EINVAL;
453         count = 0;
454         to_free = 0;
455
456         /* Compute total and max number of segments after split */
457         max_new_nr_segs = bound_max_iovecs(iov, nr_segs, &count);
458
459         gossip_debug(GOSSIP_FILE_DEBUG,
460                 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
461                 __func__,
462                 handle,
463                 (int)count);
464
465         if (type == PVFS_IO_WRITE) {
466                 gossip_debug(GOSSIP_FILE_DEBUG,
467                              "%s(%pU): proceeding with offset : %llu, "
468                              "size %d\n",
469                              __func__,
470                              handle,
471                              llu(*offset),
472                              (int)count);
473         }
474
475         if (count == 0) {
476                 ret = 0;
477                 goto out;
478         }
479
480         /*
481          * if the total size of data transfer requested is greater than
482          * the kernel-set blocksize of PVFS2, then we split the iovecs
483          * such that no iovec description straddles a block size limit
484          */
485
486         gossip_debug(GOSSIP_FILE_DEBUG,
487                      "%s: pvfs_bufmap_size:%d\n",
488                      __func__,
489                      pvfs_bufmap_size_query());
490
491         if (count > pvfs_bufmap_size_query()) {
492                 /*
493                  * Split up the given iovec description such that
494                  * no iovec descriptor straddles over the block-size limitation.
495                  * This makes us our job easier to stage the I/O.
496                  * In addition, this function will also compute an array
497                  * with seg_count entries that will store the number of
498                  * segments that straddle the block-size boundaries.
499                  */
500                 ret = split_iovecs(max_new_nr_segs,     /* IN */
501                                    nr_segs,             /* IN */
502                                    iov,                 /* IN */
503                                    &new_nr_segs,        /* OUT */
504                                    &iovecptr,           /* OUT */
505                                    &seg_count,          /* OUT */
506                                    &seg_array);         /* OUT */
507                 if (ret < 0) {
508                         gossip_err("%s: Failed to split iovecs to satisfy larger than blocksize readv/writev request %zd\n",
509                                 __func__,
510                                 ret);
511                         goto out;
512                 }
513                 gossip_debug(GOSSIP_FILE_DEBUG,
514                              "%s: Splitting iovecs from %lu to %lu"
515                              " [max_new %lu]\n",
516                              __func__,
517                              nr_segs,
518                              new_nr_segs,
519                              max_new_nr_segs);
520                 /* We must free seg_array and iovecptr */
521                 to_free = 1;
522         } else {
523                 new_nr_segs = nr_segs;
524                 /* use the given iovec description */
525                 iovecptr = (struct iovec *)iov;
526                 /* There is only 1 element in the seg_array */
527                 seg_count = 1;
528                 /* and its value is the number of segments passed in */
529                 seg_array = &nr_segs;
530                 /* We dont have to free up anything */
531                 to_free = 0;
532         }
533         ptr = iovecptr;
534
535         gossip_debug(GOSSIP_FILE_DEBUG,
536                      "%s(%pU) %zd@%llu\n",
537                      __func__,
538                      handle,
539                      count,
540                      llu(*offset));
541         gossip_debug(GOSSIP_FILE_DEBUG,
542                      "%s(%pU): new_nr_segs: %lu, seg_count: %lu\n",
543                      __func__,
544                      handle,
545                      new_nr_segs, seg_count);
546
547 /* PVFS2_KERNEL_DEBUG is a CFLAGS define. */
548 #ifdef PVFS2_KERNEL_DEBUG
549         for (seg = 0; seg < new_nr_segs; seg++)
550                 gossip_debug(GOSSIP_FILE_DEBUG,
551                              "%s: %d) %p to %p [%d bytes]\n",
552                              __func__,
553                              (int)seg + 1,
554                              iovecptr[seg].iov_base,
555                              iovecptr[seg].iov_base + iovecptr[seg].iov_len,
556                              (int)iovecptr[seg].iov_len);
557         for (seg = 0; seg < seg_count; seg++)
558                 gossip_debug(GOSSIP_FILE_DEBUG,
559                              "%s: %zd) %lu\n",
560                              __func__,
561                              seg + 1,
562                              seg_array[seg]);
563 #endif
564         seg = 0;
565         while (total_count < count) {
566                 size_t each_count;
567                 size_t amt_complete;
568
569                 /* how much to transfer in this loop iteration */
570                 each_count =
571                    (((count - total_count) > pvfs_bufmap_size_query()) ?
572                         pvfs_bufmap_size_query() :
573                         (count - total_count));
574
575                 gossip_debug(GOSSIP_FILE_DEBUG,
576                              "%s(%pU): size of each_count(%d)\n",
577                              __func__,
578                              handle,
579                              (int)each_count);
580                 gossip_debug(GOSSIP_FILE_DEBUG,
581                              "%s(%pU): BEFORE wait_for_io: offset is %d\n",
582                              __func__,
583                              handle,
584                              (int)*offset);
585
586                 ret = wait_for_direct_io(type, inode, offset, ptr,
587                                 seg_array[seg], each_count, 0);
588                 gossip_debug(GOSSIP_FILE_DEBUG,
589                              "%s(%pU): return from wait_for_io:%d\n",
590                              __func__,
591                              handle,
592                              (int)ret);
593
594                 if (ret < 0)
595                         goto out;
596
597                 /* advance the iovec pointer */
598                 ptr += seg_array[seg];
599                 seg++;
600                 *offset += ret;
601                 total_count += ret;
602                 amt_complete = ret;
603
604                 gossip_debug(GOSSIP_FILE_DEBUG,
605                              "%s(%pU): AFTER wait_for_io: offset is %d\n",
606                              __func__,
607                              handle,
608                              (int)*offset);
609
610                 /*
611                  * if we got a short I/O operations,
612                  * fall out and return what we got so far
613                  */
614                 if (amt_complete < each_count)
615                         break;
616         } /*end while */
617
618         if (total_count > 0)
619                 ret = total_count;
620 out:
621         if (to_free) {
622                 kfree(iovecptr);
623                 kfree(seg_array);
624         }
625         if (ret > 0) {
626                 if (type == PVFS_IO_READ) {
627                         file_accessed(file);
628                 } else {
629                         SetMtimeFlag(pvfs2_inode);
630                         inode->i_mtime = CURRENT_TIME;
631                         mark_inode_dirty_sync(inode);
632                 }
633         }
634
635         gossip_debug(GOSSIP_FILE_DEBUG,
636                      "%s(%pU): Value(%d) returned.\n",
637                      __func__,
638                      handle,
639                      (int)ret);
640
641         return ret;
642 }
643
644 /*
645  * Read data from a specified offset in a file (referenced by inode).
646  * Data may be placed either in a user or kernel buffer.
647  */
648 ssize_t pvfs2_inode_read(struct inode *inode,
649                          char __user *buf,
650                          size_t count,
651                          loff_t *offset,
652                          loff_t readahead_size)
653 {
654         struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
655         size_t bufmap_size;
656         struct iovec vec;
657         ssize_t ret = -EINVAL;
658
659         g_pvfs2_stats.reads++;
660
661         vec.iov_base = buf;
662         vec.iov_len = count;
663
664         bufmap_size = pvfs_bufmap_size_query();
665         if (count > bufmap_size) {
666                 gossip_debug(GOSSIP_FILE_DEBUG,
667                              "%s: count is too large (%zd/%zd)!\n",
668                              __func__, count, bufmap_size);
669                 return -EINVAL;
670         }
671
672         gossip_debug(GOSSIP_FILE_DEBUG,
673                      "%s(%pU) %zd@%llu\n",
674                      __func__,
675                      &pvfs2_inode->refn.khandle,
676                      count,
677                      llu(*offset));
678
679         ret = wait_for_direct_io(PVFS_IO_READ, inode, offset, &vec, 1,
680                         count, readahead_size);
681         if (ret > 0)
682                 *offset += ret;
683
684         gossip_debug(GOSSIP_FILE_DEBUG,
685                      "%s(%pU): Value(%zd) returned.\n",
686                      __func__,
687                      &pvfs2_inode->refn.khandle,
688                      ret);
689
690         return ret;
691 }
692
693 static ssize_t pvfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
694 {
695         struct file *file = iocb->ki_filp;
696         loff_t pos = *(&iocb->ki_pos);
697         ssize_t rc = 0;
698         unsigned long nr_segs = iter->nr_segs;
699
700         BUG_ON(iocb->private);
701
702         gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_read_iter\n");
703
704         g_pvfs2_stats.reads++;
705
706         rc = do_readv_writev(PVFS_IO_READ,
707                              file,
708                              &pos,
709                              iter->iov,
710                              nr_segs);
711         iocb->ki_pos = pos;
712
713         return rc;
714 }
715
716 static ssize_t pvfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
717 {
718         struct file *file = iocb->ki_filp;
719         loff_t pos = *(&iocb->ki_pos);
720         unsigned long nr_segs = iter->nr_segs;
721         ssize_t rc;
722
723         BUG_ON(iocb->private);
724
725         gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_write_iter\n");
726
727         mutex_lock(&file->f_mapping->host->i_mutex);
728
729         /* Make sure generic_write_checks sees an up to date inode size. */
730         if (file->f_flags & O_APPEND) {
731                 rc = pvfs2_inode_getattr(file->f_mapping->host,
732                                          PVFS_ATTR_SYS_SIZE);
733                 if (rc) {
734                         gossip_err("%s: pvfs2_inode_getattr failed, rc:%zd:.\n",
735                                    __func__, rc);
736                         goto out;
737                 }
738         }
739
740         if (file->f_pos > i_size_read(file->f_mapping->host))
741                 pvfs2_i_size_write(file->f_mapping->host, file->f_pos);
742
743         rc = generic_write_checks(iocb, iter);
744
745         if (rc <= 0) {
746                 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
747                            __func__, rc);
748                 goto out;
749         }
750
751         rc = do_readv_writev(PVFS_IO_WRITE,
752                              file,
753                              &pos,
754                              iter->iov,
755                              nr_segs);
756         if (rc < 0) {
757                 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
758                            __func__, rc);
759                 goto out;
760         }
761
762         iocb->ki_pos = pos;
763         g_pvfs2_stats.writes++;
764
765 out:
766
767         mutex_unlock(&file->f_mapping->host->i_mutex);
768         return rc;
769 }
770
771 /*
772  * Perform a miscellaneous operation on a file.
773  */
774 static long pvfs2_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
775 {
776         int ret = -ENOTTY;
777         __u64 val = 0;
778         unsigned long uval;
779
780         gossip_debug(GOSSIP_FILE_DEBUG,
781                      "pvfs2_ioctl: called with cmd %d\n",
782                      cmd);
783
784         /*
785          * we understand some general ioctls on files, such as the immutable
786          * and append flags
787          */
788         if (cmd == FS_IOC_GETFLAGS) {
789                 val = 0;
790                 ret = pvfs2_xattr_get_default(file->f_path.dentry,
791                                               "user.pvfs2.meta_hint",
792                                               &val,
793                                               sizeof(val),
794                                               0);
795                 if (ret < 0 && ret != -ENODATA)
796                         return ret;
797                 else if (ret == -ENODATA)
798                         val = 0;
799                 uval = val;
800                 gossip_debug(GOSSIP_FILE_DEBUG,
801                              "pvfs2_ioctl: FS_IOC_GETFLAGS: %llu\n",
802                              (unsigned long long)uval);
803                 return put_user(uval, (int __user *)arg);
804         } else if (cmd == FS_IOC_SETFLAGS) {
805                 ret = 0;
806                 if (get_user(uval, (int __user *)arg))
807                         return -EFAULT;
808                 /*
809                  * PVFS_MIRROR_FL is set internally when the mirroring mode
810                  * is turned on for a file. The user is not allowed to turn
811                  * on this bit, but the bit is present if the user first gets
812                  * the flags and then updates the flags with some new
813                  * settings. So, we ignore it in the following edit. bligon.
814                  */
815                 if ((uval & ~PVFS_MIRROR_FL) &
816                     (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
817                         gossip_err("pvfs2_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
818                         return -EINVAL;
819                 }
820                 val = uval;
821                 gossip_debug(GOSSIP_FILE_DEBUG,
822                              "pvfs2_ioctl: FS_IOC_SETFLAGS: %llu\n",
823                              (unsigned long long)val);
824                 ret = pvfs2_xattr_set_default(file->f_path.dentry,
825                                               "user.pvfs2.meta_hint",
826                                               &val,
827                                               sizeof(val),
828                                               0,
829                                               0);
830         }
831
832         return ret;
833 }
834
835 /*
836  * Memory map a region of a file.
837  */
838 static int pvfs2_file_mmap(struct file *file, struct vm_area_struct *vma)
839 {
840         gossip_debug(GOSSIP_FILE_DEBUG,
841                      "pvfs2_file_mmap: called on %s\n",
842                      (file ?
843                         (char *)file->f_path.dentry->d_name.name :
844                         (char *)"Unknown"));
845
846         /* set the sequential readahead hint */
847         vma->vm_flags |= VM_SEQ_READ;
848         vma->vm_flags &= ~VM_RAND_READ;
849
850         /* Use readonly mmap since we cannot support writable maps. */
851         return generic_file_readonly_mmap(file, vma);
852 }
853
854 #define mapping_nrpages(idata) ((idata)->nrpages)
855
856 /*
857  * Called to notify the module that there are no more references to
858  * this file (i.e. no processes have it open).
859  *
860  * \note Not called when each file is closed.
861  */
862 static int pvfs2_file_release(struct inode *inode, struct file *file)
863 {
864         gossip_debug(GOSSIP_FILE_DEBUG,
865                      "pvfs2_file_release: called on %s\n",
866                      file->f_path.dentry->d_name.name);
867
868         pvfs2_flush_inode(inode);
869
870         /*
871          * remove all associated inode pages from the page cache and mmap
872          * readahead cache (if any); this forces an expensive refresh of
873          * data for the next caller of mmap (or 'get_block' accesses)
874          */
875         if (file->f_path.dentry->d_inode &&
876             file->f_path.dentry->d_inode->i_mapping &&
877             mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
878                 truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
879                                      0);
880         return 0;
881 }
882
883 /*
884  * Push all data for a specific file onto permanent storage.
885  */
886 static int pvfs2_fsync(struct file *file,
887                        loff_t start,
888                        loff_t end,
889                        int datasync)
890 {
891         int ret = -EINVAL;
892         struct pvfs2_inode_s *pvfs2_inode =
893                 PVFS2_I(file->f_path.dentry->d_inode);
894         struct pvfs2_kernel_op_s *new_op = NULL;
895
896         /* required call */
897         filemap_write_and_wait_range(file->f_mapping, start, end);
898
899         new_op = op_alloc(PVFS2_VFS_OP_FSYNC);
900         if (!new_op)
901                 return -ENOMEM;
902         new_op->upcall.req.fsync.refn = pvfs2_inode->refn;
903
904         ret = service_operation(new_op,
905                         "pvfs2_fsync",
906                         get_interruptible_flag(file->f_path.dentry->d_inode));
907
908         gossip_debug(GOSSIP_FILE_DEBUG,
909                      "pvfs2_fsync got return value of %d\n",
910                      ret);
911
912         op_release(new_op);
913
914         pvfs2_flush_inode(file->f_path.dentry->d_inode);
915         return ret;
916 }
917
918 /*
919  * Change the file pointer position for an instance of an open file.
920  *
921  * \note If .llseek is overriden, we must acquire lock as described in
922  *       Documentation/filesystems/Locking.
923  *
924  * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
925  * require much changes to the FS
926  */
927 static loff_t pvfs2_file_llseek(struct file *file, loff_t offset, int origin)
928 {
929         int ret = -EINVAL;
930         struct inode *inode = file->f_path.dentry->d_inode;
931
932         if (!inode) {
933                 gossip_err("pvfs2_file_llseek: invalid inode (NULL)\n");
934                 return ret;
935         }
936
937         if (origin == PVFS2_SEEK_END) {
938                 /*
939                  * revalidate the inode's file size.
940                  * NOTE: We are only interested in file size here,
941                  * so we set mask accordingly.
942                  */
943                 ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_SIZE);
944                 if (ret) {
945                         gossip_debug(GOSSIP_FILE_DEBUG,
946                                      "%s:%s:%d calling make bad inode\n",
947                                      __FILE__,
948                                      __func__,
949                                      __LINE__);
950                         pvfs2_make_bad_inode(inode);
951                         return ret;
952                 }
953         }
954
955         gossip_debug(GOSSIP_FILE_DEBUG,
956                      "pvfs2_file_llseek: offset is %ld | origin is %d"
957                      " | inode size is %lu\n",
958                      (long)offset,
959                      origin,
960                      (unsigned long)file->f_path.dentry->d_inode->i_size);
961
962         return generic_file_llseek(file, offset, origin);
963 }
964
965 /*
966  * Support local locks (locks that only this kernel knows about)
967  * if Orangefs was mounted -o local_lock.
968  */
969 static int pvfs2_lock(struct file *filp, int cmd, struct file_lock *fl)
970 {
971         int rc = -EINVAL;
972
973         if (PVFS2_SB(filp->f_inode->i_sb)->flags & PVFS2_OPT_LOCAL_LOCK) {
974                 if (cmd == F_GETLK) {
975                         rc = 0;
976                         posix_test_lock(filp, fl);
977                 } else {
978                         rc = posix_lock_file(filp, fl, NULL);
979                 }
980         }
981
982         return rc;
983 }
984
985 /** PVFS2 implementation of VFS file operations */
986 const struct file_operations pvfs2_file_operations = {
987         .llseek         = pvfs2_file_llseek,
988         .read_iter      = pvfs2_file_read_iter,
989         .write_iter     = pvfs2_file_write_iter,
990         .lock           = pvfs2_lock,
991         .unlocked_ioctl = pvfs2_ioctl,
992         .mmap           = pvfs2_file_mmap,
993         .open           = generic_file_open,
994         .release        = pvfs2_file_release,
995         .fsync          = pvfs2_fsync,
996 };